mirror of
https://github.com/hay-kot/mealie.git
synced 2025-07-16 10:03:54 -07:00
* docs: fix typos * typos: fix typos found by `codespell` across the codebase * docs: fix `macOS` spelling * docs: fix `authentification` terminology "Authentification" is not a thing. * docs: fix `localhost` typo in example link * typos: fix in-code typos These are potentially higher risk, but no other mentions of these typos show up in the codebase.
85 lines
2.4 KiB
Python
85 lines
2.4 KiB
Python
import re
|
||
import unicodedata
|
||
|
||
replace_abbreviations = {
|
||
"cup": " cup ",
|
||
"g": " gram ",
|
||
"kg": " kilogram ",
|
||
"lb": " pound ",
|
||
"ml": " milliliter ",
|
||
"oz": " ounce ",
|
||
"pint": " pint ",
|
||
"qt": " quart ",
|
||
"tbsp": " tablespoon ",
|
||
"tbs": " tablespoon ", # Order Matters!, 'tsb' must come after 'tbsp' in case of duplicate matches
|
||
"tsp": " teaspoon ",
|
||
}
|
||
|
||
|
||
def replace_common_abbreviations(string: str) -> str:
|
||
|
||
for k, v in replace_abbreviations.items():
|
||
regex = rf"(?<=\d)\s?({k}\bs?)"
|
||
string = re.sub(regex, v, string)
|
||
|
||
return string
|
||
|
||
|
||
def remove_periods(string: str) -> str:
|
||
"""Removes periods not sournded by digets"""
|
||
return re.sub(r"(?<!\d)\.(?!\d)", "", string)
|
||
|
||
|
||
def replace_fraction_unicode(string: str):
|
||
# TODO: I'm not confident this works well enough for production needs some testing and/or refacorting
|
||
# TODO: Breaks on multiple unicode fractions
|
||
for c in string:
|
||
try:
|
||
name = unicodedata.name(c)
|
||
except ValueError:
|
||
continue
|
||
if name.startswith("VULGAR FRACTION"):
|
||
normalized = unicodedata.normalize("NFKC", c)
|
||
numerator, _, denominator = normalized.partition("⁄") # _ = slash
|
||
text = f" {numerator}/{denominator}"
|
||
return string.replace(c, text).replace(" ", " ")
|
||
|
||
return string
|
||
|
||
|
||
def wrap_or_clause(string: str):
|
||
"""
|
||
Attempts to wrap or clauses in ()
|
||
|
||
Examples:
|
||
'1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more' -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'
|
||
|
||
"""
|
||
# TODO: Needs more adequite testing to be sure this doesn't have side effects.
|
||
split_by_or = string.split(" or ")
|
||
|
||
split_by_comma = split_by_or[1].split(",")
|
||
|
||
if len(split_by_comma) > 0:
|
||
return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")
|
||
|
||
return string
|
||
|
||
|
||
def pre_process_string(string: str) -> str:
|
||
"""
|
||
Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...
|
||
|
||
{qty} {unit} {food}, {additional}
|
||
1 tbs. wine, expensive or other white wine, plus more
|
||
|
||
"""
|
||
string = string.lower()
|
||
string = replace_fraction_unicode(string)
|
||
string = remove_periods(string)
|
||
string = replace_common_abbreviations(string)
|
||
|
||
if " or " in string:
|
||
string = wrap_or_clause(string)
|
||
|
||
return string
|