Merge pull request #65 from richardmitic/opengraph

Use opengraph metadata to make basic recipe cards
commit 8d0604da3a
Hayden authored 2021-01-10 10:58:48 -09:00, committed by GitHub
No known key found for this signature in database. GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1944 additions and 29 deletions
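
In short: when a page carries no schema.org recipe markup, the change falls back to building a bare-bones recipe card from the page's OpenGraph tags. A minimal sketch of that idea, assuming a hypothetical page whose <head> carries the usual og: meta tags (the URL below is invented; extruct and w3lib are the same libraries the diffs import):

    import extruct
    import requests
    from w3lib.html import get_base_url

    # Hypothetical page; any URL whose <head> contains
    # <meta property="og:title" ...> tags behaves the same way.
    url = "https://example.com/some-recipe"
    r = requests.get(url)

    # extruct.extract() returns one list per supported syntax
    # ("json-ld", "microdata", "opengraph", ...); each OpenGraph entry
    # exposes its tags as (name, value) tuples under "properties".
    data = extruct.extract(r.text, base_url=get_base_url(r.text, r.url))
    properties = data["opengraph"][0]["properties"]
    print(next((val for name, val in properties if name == "og:title"), None))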


@@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
-import sys, json
+import sys, json, pprint
+import requests
+import extruct
 from scrape_schema_recipe import scrape_url
+from w3lib.html import get_base_url
 
 for url in sys.argv[1:]:
     try:
@@ -16,3 +19,9 @@ for url in sys.argv[1:]:
         print(f"Saved {filename}")
     except Exception as e:
         print(f"Error for {url}: {e}")
+        print("Trying extruct instead")
+        pp = pprint.PrettyPrinter(indent=2)
+        r = requests.get(url)
+        base_url = get_base_url(r.text, r.url)
+        data = extruct.extract(r.text, base_url=base_url)
+        pp.pprint(data)
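
For reference, the pretty-printed extruct output in that fallback branch looks roughly like this (a sketch with invented values; which top-level keys appear depends on the syntaxes actually embedded in the page):

    { 'json-ld': [],
      'microdata': [],
      'opengraph': [ { 'namespace': {'og': 'http://ogp.me/ns#'},
                       'properties': [ ('og:title', 'Healthy pasta bake'),
                                       ('og:image', 'https://example.com/bake.jpg')]}]}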


@@ -1,8 +1,13 @@
+from typing import List, Tuple
 import json
 from pathlib import Path
 from typing import List
-from scrape_schema_recipe import scrape_url
+import extruct
+import requests
+from w3lib.html import get_base_url
+import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
@@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["recipeInstructions"] = normalize_instructions(
         recipe_data["recipeInstructions"]
     )
+    recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
 
-def create_from_url(url: str) -> dict:
-    recipe_data = process_recipe_url(url)
-
-    with open(TEMP_FILE, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    recipe_data = normalize_data(recipe_data)
-
-    recipe = Recipe(**recipe_data)
-    return recipe.save_to_db()
-
-
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
     slug = slugify(new_recipe["name"])
     mealie_tags = {
@@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
     return new_recipe
 
 
-def process_recipe_url(url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
-    logger.info(f"Recipe Scraped From Web: {new_recipe}")
+def extract_recipe_from_html(html: str, url: str) -> dict:
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+    if scraped_recipes:
+        new_recipe: dict = scraped_recipes[0]
+        logger.info(f"Recipe Scraped From Web: {new_recipe}")
 
-    if not new_recipe:
-        return "fail"  # TODO: Return Better Error Here
+        if not new_recipe:
+            return "fail"  # TODO: Return Better Error Here
 
-    new_recipe = process_recipe_data(new_recipe, url)
-
-    try:
-        img_path = scrape_image(
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
-        )
-        new_recipe["image"] = img_path.name
-    except:
-        new_recipe["image"] = None
+        new_recipe = process_recipe_data(new_recipe, url=url)
+        new_recipe = normalize_data(new_recipe)
+    else:
+        new_recipe = basic_recipe_from_opengraph(html, url)
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
 
     return new_recipe
+
+
+def download_image_for_recipe(recipe: dict) -> dict:
+    try:
+        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
+        recipe["image"] = img_path.name
+    except:
+        recipe["image"] = None
+
+    return recipe
+
+
+def og_field(properties: dict, field_name: str) -> str:
+    return next((val for name, val in properties if name == field_name), None)
+
+
+def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
+    return list({val for name, val in properties if name == field_name})
+
+
+def basic_recipe_from_opengraph(html: str, url: str) -> dict:
+    base_url = get_base_url(html, url)
+    data = extruct.extract(html, base_url=base_url)
+    properties = data["opengraph"][0]["properties"]
+    return {
+        "name": og_field(properties, "og:title"),
+        "description": og_field(properties, "og:description"),
+        "image": og_field(properties, "og:image"),
+        "recipeYield": "",
+        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
+        "recipeIngredient": ["Could not detect ingredients"],
+        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
+        "recipeInstructions": ["Could not detect instructions"],
+        "slug": slugify(og_field(properties, "og:title")),
+        "orgURL": og_field(properties, "og:url"),
+        "categories": [],
+        "tags": og_fields(properties, "og:article:tag"),
+        "dateAdded": None,
+        "notes": [],
+        "extras": [],
+    }
+
+
+def process_recipe_url(url: str) -> dict:
+    r = requests.get(url)
+    new_recipe = extract_recipe_from_html(r.text, url)
+    new_recipe = download_image_for_recipe(new_recipe)
+    return new_recipe
+
+
+def create_from_url(url: str) -> dict:
+    recipe_data = process_recipe_url(url)
+
+    with open(TEMP_FILE, "w") as f:
+        f.write(json.dumps(recipe_data, indent=4, default=str))
+
+    recipe = Recipe(**recipe_data)
+    return recipe.save_to_db()
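
A quick illustration of the two OpenGraph helpers above, using an invented properties list: og_field returns the first value matching a name (or None), while og_fields collects every match, deduplicated through a set, so the original tag order is not preserved:

    properties = [
        ("og:title", "Carrot salad"),
        ("og:article:tag", "salad"),
        ("og:article:tag", "carrots"),
        ("og:article:tag", "salad"),  # repeated tag collapses in og_fields
    ]

    og_field(properties, "og:title")                 # -> "Carrot salad"
    og_field(properties, "og:video")                 # -> None
    sorted(og_fields(properties, "og:article:tag"))  # -> ["carrots", "salad"]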

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,12 +1,22 @@
 import json
+import re
 from pathlib import Path
 
 import pytest
 
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
 
 CWD = Path(__file__).parent
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
+RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
+
+# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
+url_validation_regex = re.compile(
+    r'^(?:http|ftp)s?://'  # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+    r'localhost|'  # localhost...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+    r'(?::\d+)?'  # optional port
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
 
 @pytest.mark.parametrize("json_file,num_steps", [
     ("best-homemade-salsa-recipe.json", 2),
@@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps):
 ])
 def test_normalize_instructions(instructions):
     assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+
+
+def test_html_no_recipe_data():
+    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
+    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert recipe_data["recipeIngredient"] == []
+    assert recipe_data["recipeInstructions"] == []
+
+
+def test_html_with_recipe_data():
+    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
+    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert len(recipe_data["recipeIngredient"]) == 13
+    assert len(recipe_data["recipeInstructions"]) == 4
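
As a sanity check on the borrowed Django regex (illustrative only, not part of the diff), it accepts absolute http(s) URLs, including localhost with a port, and rejects plain strings:

    assert url_validation_regex.match("https://example.com/images/bake.jpg")
    assert url_validation_regex.match("http://localhost:8080/img.png")
    assert not url_validation_regex.match("not-a-url")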