feat: Migrate from CRF++ to Ingredient Parser (a Python package) (#5061)
This commit is contained in:
parent
ec1a9d78ac
commit
b12aea8272
19 changed files with 367 additions and 592 deletions
@@ -31,6 +31,4 @@ venv
*/mealie/.temp
/mealie/frontend/

model.crfmodel

crowdin.yml
2 .gitignore vendored
@@ -157,10 +157,8 @@ dev/data/backups/dev_sample_data*.zip
dev/data/recipes/*
dev/scripts/output/app_routes.py
dev/scripts/output/javascriptAPI/*
mealie/services/scraper/ingredient_nlp/model.crfmodel
dev/code-generation/generated/openapi.json
dev/code-generation/generated/test_routes.py
mealie/services/parser_services/crfpp/model.crfmodel
lcov.info
dev/code-generation/openapi.json
15 Taskfile.yml
@@ -61,26 +61,11 @@ tasks:
      - pyproject.toml
      - .pre-commit-config.yaml

  setup:model:
    desc: setup nlp model
    vars:
      MODEL_URL: https://github.com/mealie-recipes/nlp-model/releases/download/v1.0.0/model.crfmodel
      OUTPUT: ./mealie/services/parser_services/crfpp/model.crfmodel
    sources:
      # using pyproject.toml as the dependency since this should only ever need to run once
      # during setup. There is perhaps a better way to do this.
      - ./pyproject.toml
    generates:
      - ./mealie/services/parser_services/crfpp/model.crfmodel
    cmds:
      - curl -L0 {{ .MODEL_URL }} --output {{ .OUTPUT }}

  setup:
    desc: setup all dependencies
    deps:
      - setup:ui
      - setup:py
      - setup:model

  dev:generate:
    desc: run code generators
@@ -116,13 +116,6 @@ COPY --from=packages * /dist/
RUN . $VENV_PATH/bin/activate \
    && pip install --require-hashes -r /dist/requirements.txt --find-links /dist

###############################################
# CRFPP Image
###############################################
FROM hkotel/crfpp as crfpp

RUN echo "crfpp-container"

###############################################
# Production Image
###############################################
@@ -145,19 +138,9 @@ RUN apt-get update \
# create directory used for Docker Secrets
RUN mkdir -p /run/secrets

# copy CRF++ and add it to the library path
ENV LD_LIBRARY_PATH=/usr/local/lib
COPY --from=crfpp /usr/local/lib/ /usr/local/lib
COPY --from=crfpp /usr/local/bin/crf_learn /usr/local/bin/crf_learn
COPY --from=crfpp /usr/local/bin/crf_test /usr/local/bin/crf_test

# Copy venv into image. It contains a fully-installed mealie backend and frontend.
COPY --from=venv-builder $VENV_PATH $VENV_PATH

# Grab CRF++ Model Release
RUN python -m mealie.scripts.install_model

VOLUME [ "$MEALIE_HOME/data/" ]
ENV APP_PORT=9000
@@ -646,7 +646,6 @@
    "nextStep": "Next step",
    "recipe-actions": "Recipe Actions",
    "parser": {
      "experimental-alert-text": "Mealie uses natural language processing to parse and create units and food items for your recipe ingredients. This feature is experimental and may not always work as expected. If you prefer not to use the parsed results, you can select 'Cancel' and your changes will not be saved.",
      "ingredient-parser": "Ingredient Parser",
      "explanation": "To use the ingredient parser, click the 'Parse All' button to start the process. Once the processed ingredients are available, you can review the items and verify that they were parsed correctly. The model's confidence score is displayed on the right of the item title. This score is an average of all the individual scores and may not always be completely accurate.",
      "alerts-explainer": "Alerts will be displayed if a matching foods or unit is found but does not exists in the database.",
@@ -1,13 +1,6 @@
<template>
  <v-container v-if="recipe">
  <v-container>
    <v-alert dismissible border="left" colored-border type="warning" elevation="2" :icon="$globals.icons.alert">
      <b>{{ $tc("banner-experimental.title") }}</b>
      <div>
        {{ $tc("recipe.parser.experimental-alert-text") }}
      </div>
    </v-alert>

    <BaseCardSectionTitle :title="$tc('recipe.parser.ingredient-parser')">
      <div class="mt-4">{{ $tc("recipe.parser.explanation") }}</div>
@@ -250,7 +250,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
        match_key = match_key or self.primary_key

        result = self._query_one(value, match_key)
        results_as_model = self.schema.model_validate(result)
        result_as_model = self.schema.model_validate(result)

        try:
            self.session.delete(result)

@@ -259,10 +259,10 @@ class RepositoryGeneric(Generic[Schema, Model]):
            self.session.rollback()
            raise e

        return results_as_model
        return result_as_model

    def delete_many(self, values: Iterable) -> Schema:
        query = self._query().filter(self.model.id.in_(values))  # type: ignore
    def delete_many(self, values: Iterable) -> list[Schema]:
        query = self._query().filter(self.model.id.in_(values))
        results = self.session.execute(query).unique().scalars().all()
        results_as_model = [self.schema.model_validate(result) for result in results]

@@ -277,7 +277,7 @@ class RepositoryGeneric(Generic[Schema, Model]):
            self.session.rollback()
            raise e

        return results_as_model  # type: ignore
        return results_as_model

    def delete_all(self) -> None:
        delete(self.model)
@@ -1,5 +1,5 @@
import re as re
from collections.abc import Sequence
from collections.abc import Iterable, Sequence
from random import randint
from typing import Self, cast
from uuid import UUID
@@ -103,6 +103,51 @@ class RepositoryRecipes(HouseholdRepositoryGeneric[Recipe, RecipeModel]):
                if i >= max_retries:
                    raise

    def _delete_recipe(self, recipe: RecipeModel) -> Recipe:
        recipe_as_model = self.schema.model_validate(recipe)

        # first remove UserToRecipe entries so we don't run into stale data errors
        try:
            user_to_recipe_delete_query = sa.delete(UserToRecipe).where(UserToRecipe.recipe_id == recipe.id)
            self.session.execute(user_to_recipe_delete_query)
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise

        # remove the recipe
        try:
            self.session.delete(recipe)
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise

        return recipe_as_model

    def delete(self, value, match_key: str | None = None) -> Recipe:
        match_key = match_key or self.primary_key
        recipe_in_db = self._query_one(value, match_key)
        return self._delete_recipe(recipe_in_db)

    def delete_many(self, values: Iterable) -> list[Recipe]:
        query = self._query().filter(self.model.id.in_(values))
        recipes_in_db = self.session.execute(query).unique().scalars().all()
        results: list[Recipe] = []

        # we create a delete statement for each row
        # we don't delete the whole query in one statement because postgres doesn't cascade correctly
        for recipe_in_db in recipes_in_db:
            results.append(self._delete_recipe(recipe_in_db))

        try:
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            raise e

        return results

    def update_image(self, slug: str, _: str | None = None) -> int:
        entry: RecipeModel = self._query_one(match_value=slug)
        entry.image = randint(0, 255)
@@ -1,21 +0,0 @@
import requests

from mealie.services.parser_services import crfpp

MODEL_URL = "https://github.com/mealie-recipes/nlp-model/releases/download/v1.0.0/model.crfmodel"


def main():
    """
    Install the model into the crfpp directory
    """

    r = requests.get(MODEL_URL, stream=True, allow_redirects=True)
    with open(crfpp.MODEL_PATH, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


if __name__ == "__main__":
    main()
@@ -1 +0,0 @@
from .processor import *
@@ -1,69 +0,0 @@
import re

from mealie.services.parser_services.parser_utils import convert_vulgar_fractions_to_regular_fractions

replace_abbreviations = {
    "cup": " cup ",
    "g": " gram ",
    "kg": " kilogram ",
    "lb": " pound ",
    "ml": " milliliter ",
    "oz": " ounce ",
    "pint": " pint ",
    "qt": " quart ",
    "tbsp": " tablespoon ",
    "tbs": " tablespoon ",  # Order Matters!, 'tsb' must come after 'tbsp' in case of duplicate matches
    "tsp": " teaspoon ",
}


def replace_common_abbreviations(string: str) -> str:
    for k, v in replace_abbreviations.items():
        regex = rf"(?<=\d)\s?({k}\bs?)"
        string = re.sub(regex, v, string)

    return string


def remove_periods(string: str) -> str:
    """Removes periods not sournded by digets"""
    return re.sub(r"(?<!\d)\.(?!\d)", "", string)


def wrap_or_clause(string: str):
    """
    Attempts to wrap or clauses in ()

    Examples:
        '1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more'
        -> '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'

    """
    # TODO: Needs more adequite testing to be sure this doesn't have side effects.
    split_by_or = string.split(" or ")

    split_by_comma = split_by_or[1].split(",")

    if len(split_by_comma) > 0:
        return f"{split_by_or[0]} (or {split_by_comma[0]}),{''.join(split_by_comma[1:])}".strip().removesuffix(",")

    return string


def pre_process_string(string: str) -> str:
    """
    Series of preprocessing functions to make best use of the CRF++ model. The ideal string looks something like...

    {qty} {unit} {food}, {additional}
    1 tbs. wine, expensive or other white wine, plus more

    """
    string = string.lower()
    string = convert_vulgar_fractions_to_regular_fractions(string)
    string = remove_periods(string)
    string = replace_common_abbreviations(string)

    if " or " in string:
        string = wrap_or_clause(string)

    return string
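For orientation, a minimal sketch of how the removed pre-processor was exercised before this migration; the module path is inferred from the imports above, and the expected output is taken from the wrap_or_clause docstring, so treat the exact spacing as approximate:

    # assumes the pre-migration module layout (mealie.services.parser_services.crfpp)
    from mealie.services.parser_services.crfpp.pre_processor import pre_process_string

    # lower-cases, converts vulgar fractions, strips stray periods,
    # expands unit abbreviations, then wraps the "or" clause in parentheses
    text = "1 tsp. Diamond Crystal or ½ tsp. Morton kosher salt, plus more"
    print(pre_process_string(text))
    # roughly: '1 teaspoon diamond crystal (or 1/2 teaspoon morton kosher salt), plus more'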
@@ -1,63 +0,0 @@
import os
import subprocess
import tempfile
from fractions import Fraction
from pathlib import Path
from typing import Annotated

from pydantic import BaseModel, Field, field_validator
from pydantic_core.core_schema import ValidationInfo

from mealie.schema._mealie.types import NoneFloat

from . import utils
from .pre_processor import pre_process_string

CWD = Path(__file__).parent
MODEL_PATH = os.getenv("CRF_MODEL_PATH", default=CWD / "model.crfmodel")


class CRFConfidence(BaseModel):
    average: float = 0.0
    comment: NoneFloat = None
    name: NoneFloat = None
    unit: NoneFloat = None
    qty: Annotated[NoneFloat, Field(validate_default=True)] = None


class CRFIngredient(BaseModel):
    input: str = ""
    name: str = ""
    other: str = ""
    qty: Annotated[str, Field(validate_default=True)] = ""
    comment: str = ""
    unit: str = ""
    confidence: CRFConfidence

    @field_validator("qty", mode="before")
    def validate_qty(cls, qty, info: ValidationInfo):
        if qty is not None and qty != "":
            return qty

        # Check if other contains a fraction
        try:
            if info.data["other"] is not None and info.data["other"].find("/") != -1:
                return str(round(float(Fraction(info.data["other"])), 3))
            else:
                return "0"
        except Exception:
            return ""


def _exec_crf_test(input_text):
    with tempfile.NamedTemporaryFile(mode="w") as input_file:
        input_file.write(utils.export_data(input_text))
        input_file.flush()
        return subprocess.check_output(["crf_test", "--verbose=1", "--model", MODEL_PATH, input_file.name]).decode(
            "utf-8"
        )


def convert_list_to_crf_model(list_of_ingrdeint_text: list[str]):
    crf_output = _exec_crf_test([pre_process_string(x) for x in list_of_ingrdeint_text])
    return [CRFIngredient(**ingredient) for ingredient in utils.import_data(crf_output.split("\n"))]
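End to end, the removed pipeline pre-processed each sentence, shelled out to crf_test, and rebuilt CRFIngredient objects from the tagged output. A hypothetical invocation, which only works where the crf_test binary and model.crfmodel are installed (exactly the setup this PR removes):

    from fractions import Fraction

    from mealie.services.parser_services.crfpp.processor import convert_list_to_crf_model

    models = convert_list_to_crf_model(["1 1/2 cups chopped onion"])
    ing = models[0]
    # qty comes back as a string such as "1 1/2"; downstream code summed its fraction parts
    quantity = float(sum(Fraction(s) for s in ing.qty.split()))  # 1.5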
@@ -1,38 +0,0 @@
import re


def clumpFractions(s):
    """
    Replaces the whitespace between the integer and fractional part of a quantity
    with a dollar sign, so it's interpreted as a single token. The rest of the
    string is left alone.
    clumpFractions("aaa 1 2/3 bbb")
    # => "aaa 1$2/3 bbb"
    """

    return re.sub(r"(\d+)\s+(\d)/(\d)", r"\1$\2/\3", s)


def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 mililiters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour
    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # handle abbreviation like "100g" by treating it as "100 grams"
    s = re.sub(r"(\d+)g", r"\1 grams", s)
    s = re.sub(r"(\d+)oz", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml", r"\1 milliliters", s, flags=re.IGNORECASE)

    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]
    # The following removes slashes following American units and replaces it with a space.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    return [token.strip() for token in re.split(r"([,()\s]{1})", clumpFractions(s)) if token and token.strip()]
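Tracing the removed tokenizer by hand on the docstring's second example, under the assumption that this reading of the regexes is right (the "$" clump is only undone later by unclump in utils):

    from mealie.services.parser_services.crfpp.tokenizer import tokenize

    # "cups/" is rewritten to "cups ", then clumpFractions glues "2 1/2" into one token
    tokens = tokenize("2 1/2 cups/300 grams all-purpose flour")
    # expected: ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']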
@@ -1,266 +0,0 @@
import re
from statistics import mean

from . import tokenizer


def joinLine(columns):
    return "\t".join(columns)


def unclump(s):
    """
    Replacess $'s with spaces. The reverse of clumpFractions.
    """
    return re.sub(r"\$", " ", s)


def getFeatures(token, index, tokens):
    """
    Returns a list of features for a given token.
    """
    length = len(tokens)

    return [
        f"I{index}",
        f"L{lengthGroup(length)}",
        f"{'Yes' if isCapitalized(token) else 'No'}CAP",
        f"{'Yes' if insideParenthesis(token, tokens) else 'No'}PAREN",
    ]


def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    if word in units.keys():
        return units[word]
    else:
        return word


def isCapitalized(token):
    """
    Returns true if a given token starts with a capital letter.
    """
    return re.match(r"^[A-Z]", token) is not None


def lengthGroup(actualLength):
    """
    Buckets the length of the ingredient into 6 buckets.
    """
    for n in [4, 8, 12, 16, 20]:
        if actualLength < n:
            return str(n)

    return "X"


def insideParenthesis(token, tokens):
    """
    Returns true if the word is inside parenthesis in the phrase.
    """
    if token in ["(", ")"]:
        return True
    else:
        line = " ".join(tokens)
        return (
            re.match(r".*\(.*" + re.escape(token) + r".*\).*", line) is not None  # - invalid escape sequence
        )


def displayIngredient(ingredient):
    """
    Format a list of (tag, [tokens]) tuples as an HTML string for display.

    displayIngredient([("qty", ["1"]), ("name", ["cat", "pie"])])
    # => <span class='qty'>1</span> <span class='name'>cat pie</span>
    """

    return "".join(["<span class='{}'>{}</span>".format(tag, " ".join(tokens)) for tag, tokens in ingredient])


# HACK: fix this
def smartJoin(words):
    """
    Joins list of words with spaces, but is smart about not adding spaces
    before commas.
    """

    input = " ".join(words)

    # replace " , " with ", "
    input = input.replace(" , ", ", ")

    # replace " ( " with " ("
    input = input.replace("( ", "(")

    # replace " ) " with ") "
    input = input.replace(" )", ")")

    return input


def import_data(lines):
    """
    This thing takes the output of CRF++ and turns it into an actual
    data structure.
    """
    data = [{}]
    display = [[]]
    prevTag = None

    confidence_all = [{}]

    #
    # iterate lines in the data file, which looks like:
    #
    #   # 0.511035
    #   1/2       I1  L12  NoCAP  X  B-QTY/0.982850
    #   teaspoon  I2  L12  NoCAP  X  B-UNIT/0.982200
    #   fresh     I3  L12  NoCAP  X  B-COMMENT/0.716364
    #   thyme     I4  L12  NoCAP  X  B-NAME/0.816803
    #   leaves    I5  L12  NoCAP  X  I-NAME/0.960524
    #   ,         I6  L12  NoCAP  X  B-COMMENT/0.772231
    #   finely    I7  L12  NoCAP  X  I-COMMENT/0.825956
    #   chopped   I8  L12  NoCAP  X  I-COMMENT/0.893379
    #
    #   # 0.505999
    #   Black   I1  L8  YesCAP  X  B-NAME/0.765461
    #   pepper  I2  L8  NoCAP   X  I-NAME/0.756614
    #   ,       I3  L8  NoCAP   X  OTHER/0.798040
    #   to      I4  L8  NoCAP   X  B-COMMENT/0.683089
    #   taste   I5  L8  NoCAP   X  I-COMMENT/0.848617
    #
    # i.e. the output of crf_test -v 1
    #
    for line in lines:
        # blank line starts a new ingredient
        if line in ("", "\n"):
            data.append({})
            display.append([])
            prevTag = None

            confidence_all.append({})

        # ignore comments
        elif line[0] == "#":
            pass

        # otherwise it's a token
        # e.g.: potato \t I2 \t L5 \t NoCAP \t B-NAME/0.978253
        else:
            columns = re.split("\t", line.strip())
            token = columns[0].strip()

            # unclump fractions
            token = unclump(token)

            # turn B-NAME/123 back into "name"
            tag, confidence = re.split(r"/", columns[-1], maxsplit=1)
            tag = re.sub(r"^[BI]\-", "", tag).lower()  # - invalid escape sequence

            # ====================
            # Confidence Getter
            if prevTag != tag:
                if confidence_all[-1].get(tag):
                    confidence_all[-1][tag].append(confidence)
                else:
                    confidence_all[-1][tag] = [confidence]
            else:
                if confidence_all[-1].get(tag):
                    confidence_all[-1][tag].append(confidence)
                else:
                    confidence_all[-1][tag] = [confidence]
            # ---- DISPLAY ----
            # build a structure which groups each token by its tag, so we can
            # rebuild the original display name later.

            if prevTag != tag:
                display[-1].append((tag, [token]))
                prevTag = tag

            else:
                display[-1][-1][1].append(token)
                #                  ^- token
                #              ^---- tag
                #          ^-------- ingredient

            # ---- DATA ----
            # build a dict grouping tokens by their tag

            # initialize this attribute if this is the first token of its kind
            if tag not in data[-1]:
                data[-1][tag] = []

            # HACK: If this token is a unit, singularize it so Scoop accepts it.
            if tag == "unit":
                token = singularize(token)

            data[-1][tag].append(token)

    # reassemble the output into a list of dicts.
    output = [{k: smartJoin(tokens) for k, tokens in ingredient.items()} for ingredient in data if len(ingredient)]

    # Preclean Confidence
    for i, c in enumerate(confidence_all):
        avg_of_all = []
        for k, v in c.items():
            v = [float(x) for x in v]
            avg = round(mean(v), 2)
            avg_of_all.append(avg)
            confidence_all[i][k] = avg

        if avg_of_all:
            confidence_all[i]["average"] = round(mean(avg_of_all), 2)

    # Add the raw ingredient phrase
    for i, _ in enumerate(output):
        output[i]["input"] = smartJoin([" ".join(tokens) for _, tokens in display[i]])
        output[i]["confidence"] = confidence_all[i]

    return output


def export_data(lines):
    """Parse "raw" ingredient lines into CRF-ready output"""
    output = []
    for line in lines:
        line_clean = re.sub("<[^<]+?>", "", line)
        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token, *features]))
        output.append("")
    return "\n".join(output)
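A quick check of the feature encoding described above, assuming the removed module is still importable; the token index is 1-based and the length bucket is derived from the whole token list:

    from mealie.services.parser_services.crfpp.utils import getFeatures

    tokens = ["1/2", "teaspoon", "fresh", "thyme", "leaves", ",", "finely", "chopped"]
    print(getFeatures("thyme", 4, tokens))
    # expected: ['I4', 'L12', 'NoCAP', 'NoPAREN']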
@@ -1,12 +1,12 @@
from fractions import Fraction

from ingredient_parser import parse_ingredient
from ingredient_parser.dataclasses import CompositeIngredientAmount, IngredientAmount
from ingredient_parser.dataclasses import ParsedIngredient as IngredientParserParsedIngredient
from pydantic import UUID4
from sqlalchemy.orm import Session

from mealie.core.root_logger import get_logger
from mealie.schema.recipe import RecipeIngredient
from mealie.schema.recipe.recipe_ingredient import (
    MAX_INGREDIENT_DENOMINATOR,
    CreateIngredientFood,
    CreateIngredientUnit,
    IngredientConfidence,
@@ -14,8 +14,9 @@ from mealie.schema.recipe.recipe_ingredient import (
    RegisteredParser,
)

from . import brute, crfpp, openai
from . import brute, openai
from ._base import ABCIngredientParser
from .parser_utils import extract_quantity_from_string

logger = get_logger(__name__)
@@ -47,50 +48,110 @@ class BruteForceParser(ABCIngredientParser):

class NLPParser(ABCIngredientParser):
    """
    Class for CRFPP ingredient parsers.
    Class for Ingredient Parser library
    """

    def _crf_to_ingredient(self, crf_model: crfpp.CRFIngredient) -> ParsedIngredient:
        ingredient = None
    @staticmethod
    def _extract_amount(ingredient: IngredientParserParsedIngredient) -> IngredientAmount:
        if not (ingredient_amounts := ingredient.amount):
            return IngredientAmount(quantity=0, quantity_max=0, unit="", text="", confidence=0, starting_index=-1)

        try:
            ingredient = RecipeIngredient(
                title="",
                note=crf_model.comment,
                unit=CreateIngredientUnit(name=crf_model.unit),
                food=CreateIngredientFood(name=crf_model.name),
                disable_amount=False,
                quantity=float(
                    sum(Fraction(s).limit_denominator(MAX_INGREDIENT_DENOMINATOR) for s in crf_model.qty.split())
                ),
            )
        except Exception as e:
            logger.error(f"Failed to parse ingredient: {crf_model}: {e}")
            # TODO: Capture some sort of state for the user to see that an exception occurred
            ingredient = RecipeIngredient(
                title="",
                note=crf_model.input,
            )
        ingredient_amount = ingredient_amounts[0]
        if isinstance(ingredient_amount, CompositeIngredientAmount):
            ingredient_amount = ingredient_amount.amounts[0]

        return ingredient_amount

    @staticmethod
    def _extract_quantity(ingredient_amount: IngredientAmount) -> tuple[float, float]:
        confidence = ingredient_amount.confidence

        if isinstance(ingredient_amount.quantity, str):
            return extract_quantity_from_string(ingredient_amount.quantity)[0], confidence
        else:
            try:
                return float(ingredient_amount.quantity), confidence
            except ValueError:
                return 0, 0

    @staticmethod
    def _extract_unit(ingredient_amount: IngredientAmount) -> tuple[str, float]:
        confidence = ingredient_amount.confidence
        unit = str(ingredient_amount.unit) if ingredient_amount.unit else ""
        return unit, confidence

    @staticmethod
    def _extract_food(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
        confidence = ingredient.name.confidence if ingredient.name else 0
        food = str(ingredient.name.text) if ingredient.name else ""
        return food, confidence

    @staticmethod
    def _extract_note(ingredient: IngredientParserParsedIngredient) -> tuple[str, float]:
        confidences: list[float] = []
        note_parts: list[str] = []
        if ingredient.size:
            note_parts.append(ingredient.size.text)
            confidences.append(ingredient.size.confidence)
        if ingredient.preparation:
            note_parts.append(ingredient.preparation.text)
            confidences.append(ingredient.preparation.confidence)
        if ingredient.comment:
            note_parts.append(ingredient.comment.text)
            confidences.append(ingredient.comment.confidence)

        # average confidence among all note parts
        confidence = sum(confidences) / len(confidences) if confidences else 0
        note = ", ".join(note_parts)
        note = note.replace("(", "").replace(")", "")

        return note, confidence

    def _convert_ingredient(self, ingredient: IngredientParserParsedIngredient) -> ParsedIngredient:
        ingredient_amount = self._extract_amount(ingredient)
        qty, qty_conf = self._extract_quantity(ingredient_amount)
        unit, unit_conf = self._extract_unit(ingredient_amount)
        food, food_conf = self._extract_food(ingredient)
        note, note_conf = self._extract_note(ingredient)

        # average confidence for components which were parsed
        confidences: list[float] = []
        if qty:
            confidences.append(qty_conf)
        if unit:
            confidences.append(unit_conf)
        if food:
            confidences.append(food_conf)
        if note:
            confidences.append(note_conf)

        parsed_ingredient = ParsedIngredient(
            input=crf_model.input,
            ingredient=ingredient,
            input=ingredient.sentence,
            confidence=IngredientConfidence(
                quantity=crf_model.confidence.qty,
                food=crf_model.confidence.name,
                **crf_model.confidence.model_dump(),
                average=(sum(confidences) / len(confidences)) if confidences else 0,
                quantity=qty_conf,
                unit=unit_conf,
                food=food_conf,
                comment=note_conf,
            ),
            ingredient=RecipeIngredient(
                title="",
                quantity=qty,
                unit=CreateIngredientUnit(name=unit) if unit else None,
                food=CreateIngredientFood(name=food) if food else None,
                disable_amount=False,
                note=note,
            ),
        )

        return self.find_ingredient_match(parsed_ingredient)

    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        crf_models = crfpp.convert_list_to_crf_model(ingredients)
        return [self._crf_to_ingredient(crf_model) for crf_model in crf_models]

    async def parse_one(self, ingredient_string: str) -> ParsedIngredient:
        items = await self.parse([ingredient_string])
        return items[0]
        parsed_ingredient = parse_ingredient(ingredient_string)
        return self._convert_ingredient(parsed_ingredient)

    async def parse(self, ingredients: list[str]) -> list[ParsedIngredient]:
        return [await self.parse_one(ingredient) for ingredient in ingredients]


__registrar: dict[RegisteredParser, type[ABCIngredientParser]] = {
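Since the parser now delegates to the ingredient-parser-nlp package, its output can be inspected directly. A small sketch using only the attributes the new NLPParser reads above; the exact parsed values depend on the library's model, so treat them as illustrative:

    from ingredient_parser import parse_ingredient

    parsed = parse_ingredient("2 pounds russet potatoes, peeled, and cut into 3/4-inch cubes")
    print(parsed.sentence)            # the raw input string, used as ParsedIngredient.input
    print(parsed.name.text)           # e.g. "russet potatoes", with parsed.name.confidence
    print(parsed.amount[0].quantity)  # e.g. 2, with parsed.amount[0].unit and .confidence
    # size, preparation, and comment are merged into the Mealie ingredient note
    print(parsed.size, parsed.preparation, parsed.comment)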
199 poetry.lock generated
@@ -697,6 +697,42 @@ docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"]
typing = ["typing-extensions (>=4.12.2)"]

[[package]]
name = "flexcache"
version = "0.3"
description = "Saves and loads to the cache a transformed versions of a source object."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
    {file = "flexcache-0.3-py3-none-any.whl", hash = "sha256:d43c9fea82336af6e0115e308d9d33a185390b8346a017564611f1466dcd2e32"},
    {file = "flexcache-0.3.tar.gz", hash = "sha256:18743bd5a0621bfe2cf8d519e4c3bfdf57a269c15d1ced3fb4b64e0ff4600656"},
]

[package.dependencies]
typing-extensions = "*"

[package.extras]
test = ["pytest", "pytest-cov", "pytest-mpl", "pytest-subtests"]

[[package]]
name = "flexparser"
version = "0.4"
description = "Parsing made fun ... using typing."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
    {file = "flexparser-0.4-py3-none-any.whl", hash = "sha256:3738b456192dcb3e15620f324c447721023c0293f6af9955b481e91d00179846"},
    {file = "flexparser-0.4.tar.gz", hash = "sha256:266d98905595be2ccc5da964fe0a2c3526fbbffdc45b65b3146d75db992ef6b2"},
]

[package.dependencies]
typing-extensions = "*"

[package.extras]
test = ["pytest", "pytest-cov", "pytest-mpl", "pytest-subtests"]

[[package]]
name = "freezegun"
version = "1.5.1"
@@ -737,7 +773,7 @@ description = "Lightweight in-process concurrent programming"
optional = false
python-versions = ">=3.7"
groups = ["main"]
markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"
markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""
files = [
    {file = "greenlet-3.0.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9da2bd29ed9e4f15955dd1595ad7bc9320308a3b766ef7f837e23ad4b4aac31a"},
    {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d353cadd6083fdb056bb46ed07e4340b0869c305c8ca54ef9da3421acbdf6881"},
@@ -993,6 +1029,23 @@ files = [
    {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
]

[[package]]
name = "ingredient-parser-nlp"
version = "1.3.2"
description = "A Python package to parse structured information from recipe ingredient sentences"
optional = false
python-versions = "<3.14,>=3.10"
groups = ["main"]
files = [
    {file = "ingredient_parser_nlp-1.3.2-py3-none-any.whl", hash = "sha256:4e9b18a977e6b93985edd5a2668e5bb4f1dd3c570374316fb7f811a21ca55523"},
    {file = "ingredient_parser_nlp-1.3.2.tar.gz", hash = "sha256:12f4d34717364881b828b476bd5b5f8a72c96474883b8cbe94911a39fd71e719"},
]

[package.dependencies]
nltk = ">=3.9.1"
pint = ">=0.24.4"
python-crfsuite = "*"

[[package]]
name = "iniconfig"
version = "2.0.0"
@@ -1136,6 +1189,18 @@ files = [
    {file = "jiter-0.5.0.tar.gz", hash = "sha256:1d916ba875bcab5c5f7d927df998c4cb694d27dceddf3392e58beaf10563368a"},
]

[[package]]
name = "joblib"
version = "1.4.2"
description = "Lightweight pipelining with Python functions"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
    {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
    {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
]

[[package]]
name = "jstyleson"
version = "0.0.2"
@@ -1645,6 +1710,32 @@ files = [
    {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]

[[package]]
name = "nltk"
version = "3.9.1"
description = "Natural Language Toolkit"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
    {file = "nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1"},
    {file = "nltk-3.9.1.tar.gz", hash = "sha256:87d127bd3de4bd89a4f81265e5fa59cb1b199b27440175370f7417d2bc7ae868"},
]

[package.dependencies]
click = "*"
joblib = "*"
regex = ">=2021.8.3"
tqdm = "*"

[package.extras]
all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
corenlp = ["requests"]
machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
plot = ["matplotlib"]
tgrep = ["pyparsing"]
twitter = ["twython"]

[[package]]
name = "nodeenv"
version = "1.7.0"
@@ -1995,13 +2086,43 @@ docs = ["sphinx (>=4.4)", "sphinx-issues (>=3.0.1)", "sphinx-rtd-theme (>=1.0)"]
tests = ["defusedxml", "numpy", "packaging", "pympler", "pytest"]
tests-min = ["defusedxml", "packaging", "pytest"]

[[package]]
name = "pint"
version = "0.24.4"
description = "Physical quantities module"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
    {file = "Pint-0.24.4-py3-none-any.whl", hash = "sha256:aa54926c8772159fcf65f82cc0d34de6768c151b32ad1deb0331291c38fe7659"},
    {file = "pint-0.24.4.tar.gz", hash = "sha256:35275439b574837a6cd3020a5a4a73645eb125ce4152a73a2f126bf164b91b80"},
]

[package.dependencies]
flexcache = ">=0.3"
flexparser = ">=0.4"
platformdirs = ">=2.1.0"
typing-extensions = ">=4.0.0"

[package.extras]
babel = ["babel (<=2.8)"]
bench = ["pytest", "pytest-codspeed"]
dask = ["dask"]
mip = ["mip (>=1.13)"]
numpy = ["numpy (>=1.23)"]
pandas = ["pint-pandas (>=0.3)"]
test = ["pytest", "pytest-benchmark", "pytest-cov", "pytest-mpl", "pytest-subtests"]
testbase = ["pytest", "pytest-benchmark", "pytest-cov", "pytest-subtests"]
uncertainties = ["uncertainties (>=3.1.6)"]
xarray = ["xarray"]

[[package]]
name = "platformdirs"
version = "4.3.6"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.8"
groups = ["dev"]
groups = ["main", "dev"]
files = [
    {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
    {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
@@ -2499,6 +2620,74 @@ pytest = ">=8.2,<9"
docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1)"]
testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"]

[[package]]
name = "python-crfsuite"
version = "0.9.11"
description = "Python binding for CRFsuite"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
    {file = "python_crfsuite-0.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f5ed569517e7b1fa3d32cf5d5cbe2fb6c85486195bf5cad03d52072fef7aa8a"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aed10ee4334c99173940e88318d312a4f9e70ba653b8ac0e6f3ef816431af811"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd8cc52f853436bbed580ad6c17e37c3657466fdfa28ddc55efcbba28b92cdf"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:182fad0415697d5acbe18364333f8255016c8609d570cba78c20d8d71a392f90"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05cd988aaa7ac87a54d4bd1d756455f6e3b078f07b4fcbda3bccfd91a784dd20"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2dead957809b92b7f0fc4c03fc70af9cbcaf35518ff1fd3a3fe2862dd0bb52fa"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66f24e5281b8a10091c3a9eef5a85115aea9570bcb9e0c03c738b0eab7070cb5"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-win32.whl", hash = "sha256:b5a9492686e3dde5739ea19a3ec37397eb7cff787362e403a411acb6431aaf84"},
    {file = "python_crfsuite-0.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:d2c361819ba331c48038f1b231b8863b886205e9decae2fb89f69da44b28d00a"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4a2f2ff5b6b0b6cf72ee476436f3926ccd0045c97e7703478a025c9badd180c6"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83bc133fc2a411144778bb03d56a95f88a4da0386462fb99d32b45428959101f"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5e52bfe54c1cb94009f1edb9c1dec3fe6d31823c60fafee04d63354c342303"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a23a96dc9a25a0d143430236158ca0d836b94a26d5752ffdf7efe315c14045f5"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dd95a8ab9d92ac6756c17dde8150d7edcc696e49b4ca5f537e347143d19c94bc"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:800fd345f2eb822d574eeaa6099bb88a23942272f62ea3e182e8ec07f4cf5ca8"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c17dc2c5ac63d10993afbab0288bb1949e4ac856361c83e8041fff4493d7dab"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-win32.whl", hash = "sha256:9a00f1f32203d9cb66658df75ee62ce4809b24f26b982b7f482934a683abc96c"},
    {file = "python_crfsuite-0.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:d255f02c890628337c970d76cba787afb7991340b3a7b201d3a158add5f78989"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:346a37d1ffa9f161d56c523d2386eaa5026c663e70f65db4478adb292d7c047c"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec40a7924d2e79a06f8eb0cec613ade54d677b73c4041c6052cd890aca2db89"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5b3836e8ee8d684fb9d76d287035db51039b30cadac3332664655acf970831"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f498cb82686dc18f7cecaf0a7ebceb4590ee2137cfa8cfe1b75f53514d0e956"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:29cdf55c54c388c62148ba310bf8ad1b93b352d62dd84856d15c421dae2e902d"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7e6738ed044ba91d8284716f87525ca95bc857ece0b226910a80126a8ce6ad06"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a365a70e54dbd20a9251a3b6df91e1406cab1b1b5995a9d68e8c748fc9b3af7"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-win32.whl", hash = "sha256:4b230ab1b69c6025e4f64e72c445f7492cccf00d94fc2c0bf2f337fafc05d5d5"},
    {file = "python_crfsuite-0.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:c89d7ad4ca520a5f045c676865ec09a2accc25dc5dce387f2199e5b2c9d8f337"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:89b45426f28b39dfc4789d29bcd7398f177746e4ab27f6ae3c7b48a082ecb73b"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:788b6ca5fd43797f6822bb7aed8d5b0255d7d53be62746c77ca91dad5dfd2f2b"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:609ce1e2ea1ff36379e91a4af9f10bcaaca0b22d089ec7489181ae0d9d098419"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:893af206342196e37c84af73941d7c2498e3ab926a67f846f78de6f48a7cb067"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a387c4c4794ecccc712e01091b2887fc90b63dbc6612947232c2593116545e8a"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:00db049cc46f716cef6626fbcf5b8abc258f4740e39dcceccc706ba77200992b"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c0f95fd723e7a684188c541106f301a1d87104a07acd1e5687df849d2a86391a"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-win32.whl", hash = "sha256:5664cebdc82d20b374641f2d0e77a86e8b010fafaf8efeb8862c3fc567d41c08"},
    {file = "python_crfsuite-0.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:00123f42dca02897aaa1fc129ea99b815f800c2893ffb210d8b8f71235ffeef4"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bb02962c16e3c84bb056ed86f2227b3d0432995c047acb7eb15032c1b645044c"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5cc941f1e22cd52e1965cd353b67edfbae06dc5ceb6556bf3176d8523113f66"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8b3ceefc199b46e562a8bfaac9ef71f86108f0435e28f40007da48618f53837"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b0c244c0ac04f1213576d28743dae133ca3ff2ebba98b3c4abda3327f37ed23"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8919fec4638133b3e95afe1496b5b771bb8464741bd467534cc1414ae7f0efc6"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:993705405b979047a9c66141f4ef886635278f244b5371c25db94751f4b7d326"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:48fb8b11ae294a3f5986dc4ae9a20047d850e1dc20dae3725c3a9d0c70e14418"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-win32.whl", hash = "sha256:f8df18614e5c6c3c95d3e20a7968f75201693a0cc1284d893f7bbc04a392f8e3"},
    {file = "python_crfsuite-0.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:01a0078292fff9e171ab9f4cabc67cbd2c629647b8fc67187c1335520a7a45fa"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0e1fad868fe15cb5bca7c0015995bd962de2f0b100e3e5b7dd3c14273fdc806"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcb60d6ac04e6f7e64f02aceaea88b6ad4ffdc183c5301f7fd8b8a280c3efc8e"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e6e9a3439c503884d6bb4311f9e7bb34cd4c5e83da28f8c8abcfa34332b2f7"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3064a4902b18c8a0916e48db4f94bc323e9390b96ae41098674ceb36f107acee"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:cac7a8bb6f629dc42408f3df45a892010321ba539a30cecc54bdea8f05580003"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:796b6b84d4af5b848786f05c378a32f08ef6a5c67dd929f9845f0f7217177db8"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:92ebc0f4291b6beae87eb6b9999c3381db5299852f7bdd88cdfca62d759630db"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-win32.whl", hash = "sha256:d6b4705cd7657efa8fc7742b09783537595944d18c0708e362252c2a9cd2a58d"},
    {file = "python_crfsuite-0.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:c7aeec4be4056b0c6dd4a1357707c8d5b9c88b3f74e51d2f4d407692cad4877f"},
    {file = "python_crfsuite-0.9.11.tar.gz", hash = "sha256:6eff965ca70567396d822c9a35ea74b0f7edb27d9471524997bdabe7a6da5f5a"},
]

[package.extras]
dev = ["black", "flake8", "isort", "tox"]

[[package]]
name = "python-dateutil"
version = "2.9.0"
@@ -2806,7 +2995,7 @@ version = "2022.10.31"
description = "Alternative regular expression module, to replace re."
optional = false
python-versions = ">=3.6"
groups = ["dev"]
groups = ["main", "dev"]
files = [
    {file = "regex-2022.10.31-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a8ff454ef0bb061e37df03557afda9d785c905dab15584860f982e88be73015f"},
    {file = "regex-2022.10.31-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1eba476b1b242620c266edf6325b443a2e22b633217a9835a52d8da2b5c051f9"},
@@ -3579,5 +3768,5 @@ pgsql = ["psycopg2-binary"]

[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "3442bd32ecbf82e5d49975511c3b01b0baa877712d9c786b12cfb5dfdda0c08f"
python-versions = ">=3.12,<3.13"
content-hash = "9a19a1b0f75cce3df8c69bcb8b41da14f12fc127e0d26e1a8fb0de2776666448"
@@ -31,7 +31,7 @@ orjson = "^3.8.0"
psycopg2-binary = { version = "^2.9.1", optional = true }
pydantic = "^2.6.1"
pyhumps = "^3.5.3"
python = "^3.12"
python = ">=3.12,<3.13"
python-dateutil = "^2.8.2"
python-dotenv = "^1.0.0"
python-ldap = "^3.3.1"

@@ -54,6 +54,7 @@ pyjwt = "^2.8.0"
openai = "^1.63.0"
typing-extensions = "^4.12.2"
itsdangerous = "^2.2.0"
ingredient-parser-nlp = "^1.3.2"

[tool.poetry.group.postgres.dependencies]
psycopg2-binary = { version = "^2.9.1" }
@@ -2,10 +2,29 @@ import pytest
from fastapi.testclient import TestClient

from mealie.schema.recipe.recipe_ingredient import RegisteredParser
from tests.unit_tests.test_ingredient_parser import TestIngredient, crf_exists, test_ingredients
from tests.unit_tests.test_ingredient_parser import TestIngredient
from tests.utils import api_routes
from tests.utils.fixture_schemas import TestUser

nlp_test_ingredients = [
    TestIngredient("½ cup all-purpose flour", 0.5, "cup", "all-purpose flour", ""),
    TestIngredient("1 ½ teaspoons ground black pepper", 1.5, "teaspoon", "ground black pepper", ""),
    TestIngredient("⅔ cup unsweetened flaked coconut", 0.667, "cup", "unsweetened flaked coconut", ""),
    TestIngredient("⅓ cup panko bread crumbs", 0.333, "cup", "panko bread crumbs", ""),
    TestIngredient("1/8 cup all-purpose flour", 0.125, "cup", "all-purpose flour", ""),
    TestIngredient("1/32 cup all-purpose flour", 0.031, "cup", "all-purpose flour", ""),
    TestIngredient("1 1/2 cups chopped onion ", 1.5, "cup", "onion", "chopped"),
    TestIngredient(
        "2 pounds russet potatoes, peeled, and cut into 3/4-inch cubes ",
        2,
        "pound",
        "russet potatoes",
        "peeled, and cut into 3/4 inch cubes",
    ),
    TestIngredient("2 tablespoons (30ml) vegetable oil ", 2, "tablespoon", "vegetable oil", ""),
    TestIngredient("2 teaspoons salt (to taste) ", 2, "teaspoon", "salt", "to taste"),
]


def assert_ingredient(api_response: dict, test_ingredient: TestIngredient):
    assert api_response["ingredient"]["quantity"] == pytest.approx(test_ingredient.quantity)

@@ -14,8 +33,7 @@ def assert_ingredient(api_response: dict, test_ingredient: TestIngredient):
    assert api_response["ingredient"]["note"] == test_ingredient.comments


@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
@pytest.mark.parametrize("test_ingredient", test_ingredients)
@pytest.mark.parametrize("test_ingredient", nlp_test_ingredients)
def test_recipe_ingredient_parser_nlp(api_client: TestClient, test_ingredient: TestIngredient, unique_user: TestUser):
    payload = {"parser": RegisteredParser.nlp, "ingredient": test_ingredient.input}
    response = api_client.post(api_routes.parser_ingredient, json=payload, headers=unique_user.token)

@@ -23,13 +41,12 @@ def test_recipe_ingredient_parser_nlp(api_client: TestClient, test_ingredient: T
    assert_ingredient(response.json(), test_ingredient)


@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
def test_recipe_ingredients_parser_nlp(api_client: TestClient, unique_user: TestUser):
    payload = {"parser": RegisteredParser.nlp, "ingredients": [x.input for x in test_ingredients]}
    payload = {"parser": RegisteredParser.nlp, "ingredients": [x.input for x in nlp_test_ingredients]}
    response = api_client.post(api_routes.parser_ingredients, json=payload, headers=unique_user.token)
    assert response.status_code == 200

    for api_ingredient, test_ingredient in zip(response.json(), test_ingredients, strict=False):
    for api_ingredient, test_ingredient in zip(response.json(), nlp_test_ingredients, strict=False):
        assert_ingredient(api_ingredient, test_ingredient)
@@ -1,8 +1,6 @@
import asyncio
import json
import shutil
from dataclasses import dataclass
from fractions import Fraction

import pytest
from pydantic import UUID4

@@ -27,10 +25,6 @@ from mealie.schema.recipe.recipe_ingredient import (
from mealie.schema.user.user import GroupBase
from mealie.services.openai import OpenAIService
from mealie.services.parser_services import RegisteredParser, get_parser
from mealie.services.parser_services.crfpp.processor import (
    CRFIngredient,
    convert_list_to_crf_model,
)
from tests.utils.factories import random_int, random_string

@@ -43,10 +37,6 @@ class TestIngredient:
    comments: str


def crf_exists() -> bool:
    return shutil.which("crf_test") is not None


def build_parsed_ing(food: str | None, unit: str | None) -> ParsedIngredient:
    ing = RecipeIngredient(unit=None, food=None)
    if food:

@@ -134,32 +124,6 @@ def parsed_ingredient_data(
    return foods, units


# TODO - add more robust test cases
test_ingredients = [
    TestIngredient("½ cup all-purpose flour", 0.5, "cup", "all-purpose flour", ""),
    TestIngredient("1 ½ teaspoons ground black pepper", 1.5, "teaspoon", "black pepper", "ground"),
    TestIngredient("⅔ cup unsweetened flaked coconut", 0.667, "cup", "coconut", "unsweetened flaked"),
    TestIngredient("⅓ cup panko bread crumbs", 0.333, "cup", "panko bread crumbs", ""),
    # Small Fraction Tests - PR #1369
    # Reported error is was for 1/8 - new lowest expected threshold is 1/32
    TestIngredient("1/8 cup all-purpose flour", 0.125, "cup", "all-purpose flour", ""),
    TestIngredient("1/32 cup all-purpose flour", 0.031, "cup", "all-purpose flour", ""),
]


@pytest.mark.skipif(not crf_exists(), reason="CRF++ not installed")
def test_nlp_parser() -> None:
    models: list[CRFIngredient] = convert_list_to_crf_model([x.input for x in test_ingredients])

    # Iterate over models and test_ingredients to gather
    for model, test_ingredient in zip(models, test_ingredients, strict=False):
        assert round(float(sum(Fraction(s) for s in model.qty.split())), 3) == pytest.approx(test_ingredient.quantity)

        assert model.comment == test_ingredient.comments
        assert model.name == test_ingredient.food
        assert model.unit == test_ingredient.unit


@pytest.mark.parametrize(
    "input, quantity, unit, food, comment",
    [