mirror of
https://github.com/hay-kot/mealie.git
synced 2025-08-22 14:33:33 -07:00
Merge pull request #65 from richardmitic/opengraph
Use opengraph metadata to make basic recipe cards
This commit is contained in:
commit
8d0604da3a
5 changed files with 1944 additions and 29 deletions
|
@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
|
|||
The resulting files can be used as test input data.
|
||||
"""
|
||||
|
||||
import sys, json
|
||||
import sys, json, pprint
|
||||
import requests
|
||||
import extruct
|
||||
from scrape_schema_recipe import scrape_url
|
||||
from w3lib.html import get_base_url
|
||||
|
||||
for url in sys.argv[1:]:
|
||||
try:
|
||||
|
@ -16,3 +19,9 @@ for url in sys.argv[1:]:
|
|||
print(f"Saved {filename}")
|
||||
except Exception as e:
|
||||
print(f"Error for {url}: {e}")
|
||||
print("Trying extruct instead")
|
||||
pp = pprint.PrettyPrinter(indent=2)
|
||||
r = requests.get(url)
|
||||
base_url = get_base_url(r.text, r.url)
|
||||
data = extruct.extract(r.text, base_url=base_url)
|
||||
pp.pprint(data)
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
from typing import List, Optional, Tuple
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from scrape_schema_recipe import scrape_url
|
||||
import extruct
|
||||
import requests
|
||||
from w3lib.html import get_base_url
|
||||
import scrape_schema_recipe
|
||||
from slugify import slugify
|
||||
from utils.logger import logger
|
||||
|
||||
|
@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict:
|
|||
recipe_data["recipeInstructions"] = normalize_instructions(
|
||||
recipe_data["recipeInstructions"]
|
||||
)
|
||||
recipe_data["image"] = normalize_image_url(recipe_data["image"])
|
||||
return recipe_data
|
||||
|
||||
|
||||
def create_from_url(url: str) -> dict:
|
||||
recipe_data = process_recipe_url(url)
|
||||
|
||||
with open(TEMP_FILE, "w") as f:
|
||||
f.write(json.dumps(recipe_data, indent=4, default=str))
|
||||
|
||||
recipe_data = normalize_data(recipe_data)
|
||||
recipe = Recipe(**recipe_data)
|
||||
|
||||
return recipe.save_to_db()
|
||||
|
||||
|
||||
def process_recipe_data(new_recipe: dict, url=None) -> dict:
|
||||
slug = slugify(new_recipe["name"])
|
||||
mealie_tags = {
|
||||
|
@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
|
|||
return new_recipe
|
||||
|
||||
|
||||
def process_recipe_url(url: str) -> dict:
|
||||
new_recipe: dict = scrape_url(url, python_objects=True)[0]
|
||||
logger.info(f"Recipe Scraped From Web: {new_recipe}")
|
||||
def extract_recipe_from_html(html:str, url: str) -> dict:
|
||||
scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
|
||||
if scraped_recipes:
|
||||
new_recipe: dict = scraped_recipes[0]
|
||||
logger.info(f"Recipe Scraped From Web: {new_recipe}")
|
||||
|
||||
if not new_recipe:
|
||||
return "fail" # TODO: Return Better Error Here
|
||||
if not new_recipe:
|
||||
return "fail" # TODO: Return Better Error Here
|
||||
|
||||
new_recipe = process_recipe_data(new_recipe, url)
|
||||
|
||||
try:
|
||||
img_path = scrape_image(
|
||||
normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
|
||||
)
|
||||
new_recipe["image"] = img_path.name
|
||||
except:
|
||||
new_recipe["image"] = None
|
||||
new_recipe = process_recipe_data(new_recipe, url=url)
|
||||
new_recipe = normalize_data(new_recipe)
|
||||
else:
|
||||
new_recipe = basic_recipe_from_opengraph(html, url)
|
||||
logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
|
||||
|
||||
return new_recipe
|
||||
|
||||
|
||||
def download_image_for_recipe(recipe: dict) -> dict:
    """Download the recipe's image and store the saved file's name on the recipe.

    The image location is read from ``recipe["image"]`` and the slug from
    ``recipe["slug"]``; both are passed to ``scrape_image``. On success
    ``recipe["image"]`` is replaced with the ``name`` of the returned path.

    This is best-effort: if anything goes wrong the failure is logged and
    ``recipe["image"]`` is set to ``None`` so the recipe can still be created.

    The dict is modified in place and also returned.
    """
    try:
        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
        recipe["image"] = img_path.name
    except Exception as e:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        logger.error(f"Could not download image for recipe {recipe.get('slug')}: {e}")
        recipe["image"] = None

    return recipe
|
||||
|
||||
|
||||
def og_field(properties: List[Tuple[str, str]], field_name: str) -> Optional[str]:
    """Return the first value stored under ``field_name``, or ``None``.

    ``properties`` is iterated as ``(name, value)`` pairs. When the same
    name occurs more than once, the value of the first occurrence wins.
    """
    return next((val for name, val in properties if name == field_name), None)
|
||||
|
||||
def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
    """Return every distinct value stored under ``field_name``.

    ``properties`` is iterated as ``(name, value)`` pairs. Duplicate values
    are dropped and the remaining values keep the order in which they first
    appear, so the result is deterministic from run to run.
    """
    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would return the values in arbitrary order.
    return list(dict.fromkeys(val for name, val in properties if name == field_name))
|
||||
|
||||
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    """Build a minimal recipe dict from a page's OpenGraph metadata.

    The page is parsed with ``extruct`` and the properties of the first
    OpenGraph entry are used to fill the name, description, image, original
    URL and tags. Ingredients and instructions cannot be derived from
    OpenGraph data, so placeholder entries are used for them.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from. Used to resolve the base URL and
            as the fallback for ``orgURL`` when the page has no ``og:url``.

    Returns:
        A dict with the keys ``name``, ``description``, ``image``,
        ``recipeYield``, ``recipeIngredient``, ``recipeInstructions``,
        ``slug``, ``orgURL``, ``categories``, ``tags``, ``dateAdded``,
        ``notes`` and ``extras``.

    NOTE(review): presumably this raises (IndexError on the ``[0]`` lookup,
    or an error from ``slugify``) when the page has no OpenGraph block or no
    ``og:title`` -- confirm against extruct's output for such pages.
    """
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    properties = data["opengraph"][0]['properties']

    # Look the title up once; it feeds both the name and the slug.
    title = og_field(properties, "og:title")

    return {
        "name": title,
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty, but this message
        # is added so the user can see that detection failed.
        "recipeInstructions": ["Could not detect instructions"],
        "slug": slugify(title),
        # Fall back to the URL we fetched when the page does not declare og:url.
        "orgURL": og_field(properties, "og:url") or url,
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
|
||||
|
||||
|
||||
def process_recipe_url(url: str, timeout: float = 30) -> dict:
    """Fetch ``url`` and turn the page into a recipe dict with its image resolved.

    The page body is passed to ``extract_recipe_from_html`` and the result
    is then passed through ``download_image_for_recipe``.

    Args:
        url: Address of the recipe page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The recipe dict returned by ``download_image_for_recipe``.
    """
    r = requests.get(url, timeout=timeout)
    new_recipe = extract_recipe_from_html(r.text, url)
    new_recipe = download_image_for_recipe(new_recipe)
    return new_recipe
|
||||
|
||||
|
||||
def create_from_url(url: str) -> dict:
    """Scrape the recipe at ``url``, dump the scraped data, and save it.

    The scraped data is serialized as JSON to ``TEMP_FILE`` (values that are
    not JSON-serializable are converted with ``str``) before a ``Recipe`` is
    built from it. Returns whatever ``Recipe.save_to_db()`` returns.
    """
    scraped = process_recipe_url(url)

    # Serialize first, then write the finished text to the dump file.
    serialized = json.dumps(scraped, indent=4, default=str)
    with open(TEMP_FILE, "w") as dump_file:
        dump_file.write(serialized)

    return Recipe(**scraped).save_to_db()
|
||||
|
|
File diff suppressed because one or more lines are too long
500
mealie/test/data/html-raw/healthy_pasta_bake_60759.html
Normal file
500
mealie/test/data/html-raw/healthy_pasta_bake_60759.html
Normal file
File diff suppressed because one or more lines are too long
|
@ -1,12 +1,22 @@
|
|||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from services.scrape_services import normalize_data, normalize_instructions
|
||||
from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
|
||||
|
||||
CWD = Path(__file__).parent
|
||||
RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
|
||||
RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
|
||||
|
||||
# Case-insensitive pattern used by the tests below to check that a value looks like an absolute URL.
# Taken from Django's URL validator:
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
|
||||
url_validation_regex = re.compile(
|
||||
r'^(?:http|ftp)s?://' # scheme: http://, https://, ftp:// or ftps://
|
||||
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
|
||||
r'localhost|' # ...or localhost...
|
||||
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
|
||||
r'(?::\d+)?' # optional port
|
||||
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
|
||||
|
||||
@pytest.mark.parametrize("json_file,num_steps", [
|
||||
("best-homemade-salsa-recipe.json", 2),
|
||||
|
@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps):
|
|||
])
|
||||
def test_normalize_instructions(instructions):
|
||||
assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
|
||||
|
||||
|
||||
def test_html_no_recipe_data():
    """A page without recipe markup falls back to its OpenGraph metadata."""
    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
    # read_text() closes the file, unlike a bare open(path).read().
    recipe_data = extract_recipe_from_html(path.read_text(), url)

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    # The OpenGraph fallback cannot detect these, so it fills in placeholder
    # entries instead of returning empty lists.
    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
    assert recipe_data["recipeInstructions"] == ["Could not detect instructions"]
|
||||
|
||||
|
||||
def test_html_with_recipe_data():
    """A page with recipe markup yields the full ingredient and step lists."""
    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    # read_text() closes the file, unlike a bare open(path).read().
    recipe_data = extract_recipe_from_html(path.read_text(), url)

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    assert len(recipe_data["recipeIngredient"]) == 13
    assert len(recipe_data["recipeInstructions"]) == 4
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue