Merge branch 'dev' of https://github.com/hay-kot/mealie into sqlite

2025-08-22 06:23:34 -07:00 · 2021-01-10 11:08:56 -09:00 · 2021-01-10 11:08:56 -09:00 · 760d30c8fc
commit 760d30c8fc
parent f523a43893 80468d0c47
7 changed files with 1998 additions and 134 deletions
--- a/.gitignore
+++ b/.gitignore
@ -150,3 +150,4 @@ ENV/
 # Node Modules
 node_modules/
 mealie/data/debug/last_recipe.json
--- a/dev/scripts/scrape_recipe.py
+++ b/dev/scripts/scrape_recipe.py
@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
-import sys, json
+import sys, json, pprint
 import requests
 import extruct
 from scrape_schema_recipe import scrape_url
 from w3lib.html import get_base_url
 for url in sys.argv[1:]:
    try:
@ -16,3 +19,9 @@ for url in sys.argv[1:]:
        print(f"Saved {filename}")
    except Exception as e:
        print(f"Error for {url}: {e}")
        print("Trying extruct instead")
        pp = pprint.PrettyPrinter(indent=2)
        r = requests.get(url)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)
        pp.pprint(data)
--- a/mealie/data/debug/last_recipe.json
+++ b/mealie/data/debug/last_recipe.json
@ -1,91 +1,18 @@
 {
-    "@context": "http://schema.org/",
+    "name": "Carottes Rapp\u00e9s with Rice and Sunflower Seeds \u2014 FEED THE SWIMMERS",
-    "@type": "Recipe",
+    "description": "&nbsp;Carottes R\u00e2p\u00e9es with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes R\u00e2p\u00e9es is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots ar",
-    "name": "Jalape\u00f1o Popper Dip",
+    "image": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.JPG?format=1500w",
-    "author": {
+    "recipeYield": "",
        "@type": "Person",
        "name": "Michelle"
    },
    "description": "Jalapeno Popper Dip is creamy, cheesy and has just the perfect amount of kick. Great appetizer for your next party or watching the big game!",
    "datePublished": "2016-02-22 00:01:37+00:00",
    "image": "jalapeno-popper-dip.jpg",
    "recipeYield": [
        "10",
        "10 to 12 servings"
    ],
    "prepTime": "0:15:00",
    "cookTime": "0:30:00",
    "totalTime": "0:45:00",
    "recipeIngredient": [
-        "16 ounces cream cheese (at room temperature)",
+        "Could not detect ingredients"
        "1 cup mayonnaise",
        "8 pieces of bacon (cooked and chopped)",
        "6  jalape\u00f1os (seeded and minced (if you can't get fresh, substitute a 4-ounce can diced jalape\u00f1o peppers, drained))",
        "2 cloves garlic (minced)",
        "\u00bd teaspoon cumin",
        "6 ounces cheddar cheese (shredded (about 1\u00bd cups))",
        "1 cup panko breadcrumbs",
        "1 cup  grated Parmesan cheese",
        "4 tablespoons  unsalted butter, melted"
    ],
    "recipeInstructions": [
        {
-            "@type": "HowToStep",
+            "text": "Could not detect instructions"
            "text": "Preheat oven to 375 degrees F.",
            "name": "Preheat oven to 375 degrees F.",
            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-0"
        },
        {
            "@type": "HowToStep",
            "text": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
            "name": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-1"
        },
        {
            "@type": "HowToStep",
            "text": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
            "name": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-2"
        },
        {
            "@type": "HowToStep",
            "text": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
            "name": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-3"
        }
    ],
-    "aggregateRating": {
+    "slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers",
-        "@type": "AggregateRating",
+    "orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds",
        "ratingValue": "4.34",
        "ratingCount": "15"
    },
    "recipeCategory": [
        "Appetizer"
    ],
    "recipeCuisine": [
        "American"
    ],
    "keywords": "cheese dip, game day food, party food",
    "nutrition": {
        "@type": "NutritionInformation",
        "calories": "560 kcal",
        "carbohydrateContent": "7 g",
        "proteinContent": "14 g",
        "fatContent": "52 g",
        "saturatedFatContent": "21 g",
        "cholesterolContent": "109 mg",
        "sodiumContent": "707 mg",
        "sugarContent": "2 g",
        "servingSize": "1 serving"
    },
    "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#recipe",
    "isPartOf": {
        "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#article"
    },
    "mainEntityOfPage": "https://www.browneyedbaker.com/jalapeno-popper-dip/#webpage",
    "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/",
    "slug": "jalapeno-popper-dip",
    "orgURL": "http://www.browneyedbaker.com/2011/08/03/jalapeno-popper-dip/",
    "categories": [],
    "tags": [],
    "dateAdded": null,
--- a/mealie/services/scrape_services.py
+++ b/mealie/services/scrape_services.py
@ -1,10 +1,13 @@
 import json
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
-from scrape_schema_recipe import scrape_url
+import extruct
 import requests
 import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
 from w3lib.html import get_base_url
 from services.image_services import scrape_image
 from services.recipe_services import Recipe
@ -59,21 +62,10 @@ def normalize_data(recipe_data: dict) -> dict:
    recipe_data["recipeInstructions"] = normalize_instructions(
        recipe_data["recipeInstructions"]
    )
    recipe_data["image"] = normalize_image_url(recipe_data["image"])
    return recipe_data
 def create_from_url(url: str) -> dict:
    """Scrape the recipe at ``url``, normalize it and save it to the database.

    The un-normalized scrape result is first written to ``TEMP_FILE`` as
    indented JSON. Returns whatever ``Recipe.save_to_db()`` returns.
    """
    scraped = process_recipe_url(url)

    # Dump the raw payload before normalizing it.
    with open(TEMP_FILE, "w") as debug_file:
        serialized = json.dumps(scraped, indent=4, default=str)
        debug_file.write(serialized)

    normalized = normalize_data(scraped)
    return Recipe(**normalized).save_to_db()
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
    slug = slugify(new_recipe["name"])
    mealie_tags = {
@ -91,21 +83,78 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
    return new_recipe
-def process_recipe_url(url: str) -> dict:
+def extract_recipe_from_html(html: str, url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
    if scraped_recipes:
        new_recipe: dict = scraped_recipes[0]
        logger.info(f"Recipe Scraped From Web: {new_recipe}")
        if not new_recipe:
            return "fail"  # TODO: Return Better Error Here
-    new_recipe = process_recipe_data(new_recipe, url)
+        new_recipe = process_recipe_data(new_recipe, url=url)
-
+        new_recipe = normalize_data(new_recipe)
-    try:
+    else:
-        img_path = scrape_image(
+        new_recipe = basic_recipe_from_opengraph(html, url)
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
        )
        new_recipe["image"] = img_path.name
    except:
        new_recipe["image"] = None
    return new_recipe
 def download_image_for_recipe(recipe: dict) -> dict:
    """Download the recipe's image and record the saved file name.

    Passes the recipe's ``image`` and ``slug`` values to ``scrape_image`` and
    replaces ``recipe["image"]`` with the ``name`` of the returned path. The
    download is best-effort: on any error ``recipe["image"]`` is set to
    ``None`` instead of raising.

    Args:
        recipe: Recipe data; modified in place.

    Returns:
        The same ``recipe`` dict that was passed in.
    """
    try:
        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
        recipe["image"] = img_path.name
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        recipe["image"] = None
    return recipe
 def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
    """Return the first value stored under ``field_name``.

    Args:
        properties: Iterable of ``(name, value)`` pairs.
        field_name: Property name to look up, e.g. ``"og:title"``.

    Returns:
        The value of the first pair whose name equals ``field_name``.
        NOTE: ``None`` is returned when no pair matches, despite the ``str``
        annotation, so callers must handle a missing field.
    """
    return next((val for name, val in properties if name == field_name), None)
 def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
    """Return every distinct value stored under ``field_name``.

    Args:
        properties: Iterable of ``(name, value)`` pairs.
        field_name: Property name to collect, e.g. ``"og:article:tag"``.

    Returns:
        The matching values with duplicates removed, in order of first
        appearance. An empty list is returned when nothing matches.
    """
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # give an arbitrary order that can differ between runs.
    return list(dict.fromkeys(val for name, val in properties if name == field_name))
 def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    """Build a minimal recipe dict from a page's OpenGraph metadata.

    Name, description, image, original URL and tags come from the first
    OpenGraph entry that ``extruct`` finds in ``html``. Ingredients and
    instructions are filled with placeholder text.

    NOTE(review): indexing ``["opengraph"][0]`` raises if the page has no
    OpenGraph entry, and the title is passed to ``slugify`` even when it is
    missing. Confirm callers only reach this with OpenGraph-tagged pages.
    """
    extracted = extruct.extract(html, base_url=get_base_url(html, url))
    og_props = extracted["opengraph"][0]["properties"]
    title = og_field(og_props, "og:title")

    recipe = {
        "name": title,
        "description": og_field(og_props, "og:description"),
        "image": og_field(og_props, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(title),
        "orgURL": og_field(og_props, "og:url"),
        "categories": [],
        "tags": og_fields(og_props, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
    return recipe
 def process_recipe_url(url: str, timeout: float = 30.0) -> dict:
    """Fetch ``url`` and turn the page into recipe data with a local image.

    Args:
        url: Address of the recipe page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block this call
            indefinitely.

    Returns:
        The recipe data from ``extract_recipe_from_html`` after
        ``download_image_for_recipe`` has processed its image.
    """
    response = requests.get(url, timeout=timeout)
    new_recipe = extract_recipe_from_html(response.text, url)
    return download_image_for_recipe(new_recipe)
 def create_from_url(url: str) -> dict:
    """Scrape the recipe at ``url`` and save it to the database.

    The scraped data is also written to ``TEMP_FILE`` as indented JSON.
    Returns whatever ``Recipe.save_to_db()`` returns.
    """
    scraped = process_recipe_url(url)

    # Keep a copy of the most recent scrape on disk.
    with open(TEMP_FILE, "w") as debug_file:
        serialized = json.dumps(scraped, indent=4, default=str)
        debug_file.write(serialized)

    return Recipe(**scraped).save_to_db()
--- a/mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html
+++ b/mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html
--- a/mealie/test/data/html-raw/healthy_pasta_bake_60759.html
+++ b/mealie/test/data/html-raw/healthy_pasta_bake_60759.html
--- a/mealie/test/test_scraper.py
+++ b/mealie/test/test_scraper.py
@ -1,16 +1,38 @@
 import json
 import re
 from pathlib import Path
 import pytest
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import (
    extract_recipe_from_html,
    normalize_data,
    normalize_instructions,
 )
 # Directory containing this test module; fixture paths are built from it.
 CWD = Path(__file__).parent
 # Fixture directory holding the raw recipe JSON files loaded by the tests.
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
 # Fixture directory holding the raw HTML pages read by the tests.
 RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
 # https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
 url_validation_regex = re.compile(
    r"^(?:http|ftp)s?://"  # http://, https://, ftp:// or ftps://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",  # optional trailing slash, or "/" or "?" followed by non-whitespace
    re.IGNORECASE,  # lets the A-Z character classes match lowercase as well
 )
-@pytest.mark.parametrize("json_file,num_steps", [
+@pytest.mark.parametrize(
    "json_file,num_steps",
    [
        ("best-homemade-salsa-recipe.json", 2),
-    ("blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", 3),
+        (
            "blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json",
            3,
        ),
        ("bon_appetit.json", 8),
        ("chunky-apple-cake.json", 4),
        ("dairy-free-impossible-pumpkin-pie.json", 7),
@ -21,19 +43,57 @@ RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
        ("microwave_sweet_potatoes_04783.json", 4),
        ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
        ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
-])
+    ],
 )
 def test_normalize_data(json_file, num_steps):
    """Normalizing a raw recipe fixture yields the expected number of steps."""
    # Use a context manager so the fixture file is closed after loading.
    with open(RAW_RECIPE_DIR.joinpath(json_file)) as raw_file:
        recipe_data = normalize_data(json.load(raw_file))
    assert len(recipe_data["recipeInstructions"]) == num_steps
-@pytest.mark.parametrize("instructions", [
+@pytest.mark.parametrize(
    "instructions",
    [
        "A\n\nB\n\nC\n\n",
        "A\nB\nC\n",
        "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
        "A\r\nB\r\nC\r\n",
-    ["A","B","C"],
+        ["A", "B", "C"],
-    [{"@type": "HowToStep", "text": x} for x in ["A","B","C"]]
+        [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]],
-])
+    ],
 )
 def test_normalize_instructions(instructions):
-    assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+    assert normalize_instructions(instructions) == [
        {"text": "A"},
        {"text": "B"},
        {"text": "C"},
    ]
 def test_html_no_recipe_data():
    """Extraction from the carottes fixture yields metadata plus placeholder
    ingredients and instructions."""
    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
    # read_text() opens and closes the fixture; open(path).read() left the
    # handle open.
    recipe_data = extract_recipe_from_html(path.read_text(), url)
    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
    assert recipe_data["recipeInstructions"] == [
        {"text": "Could not detect instructions"}
    ]
 def test_html_with_recipe_data():
    """Extraction from the pasta-bake fixture yields 13 ingredients and 4
    instruction steps."""
    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    # read_text() opens and closes the fixture; open(path).read() left the
    # handle open.
    recipe_data = extract_recipe_from_html(path.read_text(), url)
    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    assert len(recipe_data["recipeIngredient"]) == 13
    assert len(recipe_data["recipeInstructions"]) == 4