mealie/mealie/services/parser_services/crfpp/pre_processor.py
Ben Boeckel 2e6b877ba9
docs: fix typos (#1665)
* docs: fix typos

* typos: fix typos found by `codespell` across the codebase

* docs: fix `macOS` spelling

* docs: fix `authentification` terminology

"Authentification" is not a thing.

* docs: fix `localhost` typo in example link

* typos: fix in-code typos

These are potentially higher risk, but no other mentions of these typos
show up in the codebase.
2022-09-25 15:17:27 -08:00

85 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import unicodedata
replace_abbreviations = {
"cup": " cup ",
"g": " gram ",
"kg": " kilogram ",
"lb": " pound ",
"ml": " milliliter ",
"oz": " ounce ",
"pint": " pint ",
"qt": " quart ",
"tbsp": " tablespoon ",
"tbs": " tablespoon ", # Order Matters!, 'tsb' must come after 'tbsp' in case of duplicate matches
"tsp": " teaspoon ",
}
def replace_common_abbreviations(string: str) -> str:
for k, v in replace_abbreviations.items():
regex = rf"(?<=\d)\s?({k}\bs?)"
string = re.sub(regex, v, string)
return string
def remove_periods(string: str) -> str:
"""Removes periods not sournded by digets"""
return re.sub(r"(?<!\d)\.(?!\d)", "", string)
def replace_fraction_unicode(string: str):
# TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
# TODO: Breaks on multiple unicode fractions
for c in string:
try:
name = unicodedata.name(c)
except ValueError:
continue
if name.startswith("VULGAR FRACTION"):
normalized = unicodedata.normalize("NFKC", c)
numerator, _, denominator = normalized.partition("") # _ = slash
text = f" {numerator}/{denominator}"
return string.replace(c, text).replace(" ", " ")
return string
def wrap_or_clause(string: str):
"""
Attempts to wrap or clauses in ()
Examples:
'1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more' -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
"""
# TODO: Needs more adequite testing to be sure this doesn't have side effects.
split_by_or = string.split(" or ")
split_by_comma = split_by_or[1].split(",")
if len(split_by_comma) > 0:
return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")
return string
def pre_process_string(string: str) -> str:
"""
Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...
{qty} {unit} {food}, {additional}
1 tbs. wine, expensive or other white wine, plus more
"""
string = string.lower()
string = replace_fraction_unicode(string)
string = remove_periods(string)
string = replace_common_abbreviations(string)
if " or " in string:
string = wrap_or_clause(string)
return string