Merge pull request #65 from richardmitic/opengraph

Use opengraph metadata to make basic recipe cards
Hayden 2021-01-10 10:58:48 -09:00 committed by GitHub
commit 8d0604da3a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1944 additions and 29 deletions
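
In outline, this change turns recipe import into a two-step attempt: structured schema.org/Recipe data is tried first, and OpenGraph metadata becomes the fallback for pages without it. A minimal sketch of that decision, assuming only the libraries used in the diffs below (the sample HTML is made up):

    import scrape_schema_recipe

    def choose_parser(html: str) -> str:
        # scrape_schema_recipe.loads() returns a list of schema.org/Recipe
        # dicts; an empty list means no structured recipe data was found.
        if scrape_schema_recipe.loads(html, python_objects=True):
            return "schema.org"
        return "opengraph"

    # Hypothetical page with OpenGraph tags but no schema.org recipe:
    html = '<html><head><meta property="og:title" content="Carrot Salad"/></head></html>'
    print(choose_parser(html))  # -> opengraph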


@@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
-import sys, json
+import sys, json, pprint
+import requests
+import extruct
 from scrape_schema_recipe import scrape_url
+from w3lib.html import get_base_url
 
 for url in sys.argv[1:]:
     try:
@@ -16,3 +19,9 @@ for url in sys.argv[1:]:
         print(f"Saved {filename}")
     except Exception as e:
         print(f"Error for {url}: {e}")
+        print("Trying extruct instead")
+        pp = pprint.PrettyPrinter(indent=2)
+        r = requests.get(url)
+        base_url = get_base_url(r.text, r.url)
+        data = extruct.extract(r.text, base_url=base_url)
+        pp.pprint(data)
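
For orientation, extruct.extract() returns a dict keyed by syntax, and each OpenGraph entry lists its tags as (name, value) tuples under "properties" — the shape the parsing code in the next file indexes as data["opengraph"][0]["properties"]. An illustrative, hand-written example of what this script pretty-prints (values are made up; other syntax keys omitted):

    example_output = {
        "json-ld": [],
        "microdata": [],
        "opengraph": [
            {
                "namespace": {"og": "http://ogp.me/ns#"},
                "properties": [
                    ("og:title", "Carottes râpées with rice and sunflower seeds"),
                    ("og:description", "A bright, make-ahead grated-carrot salad."),
                    ("og:image", "https://example.com/images/carottes.jpg"),
                    ("og:url", "https://example.com/recipes/carottes-rapees"),
                ],
            }
        ],
    }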


@@ -1,8 +1,13 @@
+from typing import List, Tuple
 import json
 from pathlib import Path
-from typing import List
-from scrape_schema_recipe import scrape_url
+
+import extruct
+import requests
+from w3lib.html import get_base_url
+import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
@@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["recipeInstructions"] = normalize_instructions(
         recipe_data["recipeInstructions"]
     )
+    recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
 
-def create_from_url(url: str) -> dict:
-    recipe_data = process_recipe_url(url)
-
-    with open(TEMP_FILE, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    recipe_data = normalize_data(recipe_data)
-
-    recipe = Recipe(**recipe_data)
-    return recipe.save_to_db()
-
-
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
     slug = slugify(new_recipe["name"])
     mealie_tags = {
@@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
     return new_recipe
 
 
-def process_recipe_url(url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
-    logger.info(f"Recipe Scraped From Web: {new_recipe}")
+def extract_recipe_from_html(html: str, url: str) -> dict:
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+    if scraped_recipes:
+        new_recipe: dict = scraped_recipes[0]
+        logger.info(f"Recipe Scraped From Web: {new_recipe}")
 
-    if not new_recipe:
-        return "fail"  # TODO: Return Better Error Here
+        if not new_recipe:
+            return "fail"  # TODO: Return Better Error Here
 
-    new_recipe = process_recipe_data(new_recipe, url)
-    try:
-        img_path = scrape_image(
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
-        )
-        new_recipe["image"] = img_path.name
-    except:
-        new_recipe["image"] = None
+        new_recipe = process_recipe_data(new_recipe, url=url)
+        new_recipe = normalize_data(new_recipe)
+    else:
+        new_recipe = basic_recipe_from_opengraph(html, url)
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
 
     return new_recipe
 
 
+def download_image_for_recipe(recipe: dict) -> dict:
+    try:
+        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
+        recipe["image"] = img_path.name
+    except Exception:
+        recipe["image"] = None
+
+    return recipe
+
+
+def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
+    return next((val for name, val in properties if name == field_name), None)
+
+
+def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
+    return list({val for name, val in properties if name == field_name})
+
+
+def basic_recipe_from_opengraph(html: str, url: str) -> dict:
+    base_url = get_base_url(html, url)
+    data = extruct.extract(html, base_url=base_url)
+    properties = data["opengraph"][0]["properties"]
+    return {
+        "name": og_field(properties, "og:title"),
+        "description": og_field(properties, "og:description"),
+        "image": og_field(properties, "og:image"),
+        "recipeYield": "",
+        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
+        "recipeIngredient": ["Could not detect ingredients"],
+        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
+        "recipeInstructions": ["Could not detect instructions"],
+        "slug": slugify(og_field(properties, "og:title")),
+        "orgURL": og_field(properties, "og:url"),
+        "categories": [],
+        "tags": og_fields(properties, "og:article:tag"),
+        "dateAdded": None,
+        "notes": [],
+        "extras": [],
+    }
+
+
+def process_recipe_url(url: str) -> dict:
+    r = requests.get(url)
+    new_recipe = extract_recipe_from_html(r.text, url)
+    new_recipe = download_image_for_recipe(new_recipe)
+    return new_recipe
+
+
+def create_from_url(url: str) -> dict:
+    recipe_data = process_recipe_url(url)
+
+    with open(TEMP_FILE, "w") as f:
+        f.write(json.dumps(recipe_data, indent=4, default=str))
+
+    recipe = Recipe(**recipe_data)
+    return recipe.save_to_db()
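
A quick usage sketch for the two OpenGraph helpers above, with a made-up properties list. og_field returns the first matching value (or None, via next()'s default), while og_fields collects all matches through a set, so duplicates collapse and ordering is not preserved:

    properties = [
        ("og:title", "Healthy pasta bake"),
        ("og:article:tag", "pasta"),
        ("og:article:tag", "dinner"),
        ("og:article:tag", "pasta"),  # duplicate: collapses in og_fields
    ]

    og_field(properties, "og:title")                 # -> "Healthy pasta bake"
    og_field(properties, "og:video")                 # -> None
    sorted(og_fields(properties, "og:article:tag"))  # -> ["dinner", "pasta"]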

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,12 +1,22 @@
 import json
+import re
 from pathlib import Path
 import pytest
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
 
 CWD = Path(__file__).parent
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
+RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
+
+# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
+url_validation_regex = re.compile(
+    r'^(?:http|ftp)s?://'  # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+    r'localhost|'  # localhost...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+    r'(?::\d+)?'  # optional port
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
 
 @pytest.mark.parametrize("json_file,num_steps", [
     ("best-homemade-salsa-recipe.json", 2),
@@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps):
 ])
 def test_normalize_instructions(instructions):
     assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+
+
+def test_html_no_recipe_data():
+    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
+    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
+    assert recipe_data["recipeInstructions"] == ["Could not detect instructions"]
+
+
+def test_html_with_recipe_data():
+    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
+    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert len(recipe_data["recipeIngredient"]) == 13
+    assert len(recipe_data["recipeInstructions"]) == 4
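
The url_validation_regex assertions above only check that the scraped image field looks like an absolute URL. A standalone illustration, assuming the test module is importable as tests.test_scrape_services (the path is a guess; adjust to the repo layout):

    from tests.test_scrape_services import url_validation_regex

    assert url_validation_regex.match("https://example.com/images/pasta-bake.jpg")
    assert url_validation_regex.match("http://localhost:8080/img.png")  # localhost + port allowed
    assert url_validation_regex.match("/static/img.png") is None        # relative paths rejected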