Merge pull request #65 from richardmitic/opengraph

Use opengraph metadata to make basic recipe cards
Hayden 2021-01-10 10:58:48 -09:00 committed by GitHub
commit 8d0604da3a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1944 additions and 29 deletions
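
In outline, this change turns recipe import into a two-step attempt: structured schema.org/Recipe data is tried first, and OpenGraph metadata becomes the fallback for pages without it. A minimal sketch of that decision, assuming only the libraries used in the diffs below (the sample HTML is made up):

    import scrape_schema_recipe

    def choose_parser(html: str) -> str:
        # scrape_schema_recipe.loads() returns a list of schema.org/Recipe
        # dicts; an empty list means no structured recipe data was found.
        if scrape_schema_recipe.loads(html, python_objects=True):
            return "schema.org"
        return "opengraph"

    # Hypothetical page with OpenGraph tags but no schema.org recipe:
    html = '<html><head><meta property="og:title" content="Carrot Salad"/></head></html>'
    print(choose_parser(html))  # -> opengraph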


@@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
-import sys, json
+import sys, json, pprint
+import requests
+import extruct
 from scrape_schema_recipe import scrape_url
+from w3lib.html import get_base_url
 
 for url in sys.argv[1:]:
     try:
@@ -16,3 +19,9 @@ for url in sys.argv[1:]:
         print(f"Saved {filename}")
     except Exception as e:
         print(f"Error for {url}: {e}")
+        print("Trying extruct instead")
+        pp = pprint.PrettyPrinter(indent=2)
+        r = requests.get(url)
+        base_url = get_base_url(r.text, r.url)
+        data = extruct.extract(r.text, base_url=base_url)
+        pp.pprint(data)
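
For orientation, extruct.extract() returns a dict keyed by syntax, and each OpenGraph entry lists its tags as (name, value) tuples under "properties" — the shape the parsing code in the next file indexes as data["opengraph"][0]["properties"]. An illustrative, hand-written example of what this script pretty-prints (values are made up; other syntax keys omitted):

    example_output = {
        "json-ld": [],
        "microdata": [],
        "opengraph": [
            {
                "namespace": {"og": "http://ogp.me/ns#"},
                "properties": [
                    ("og:title", "Carottes râpées with rice and sunflower seeds"),
                    ("og:description", "A bright, make-ahead grated-carrot salad."),
                    ("og:image", "https://example.com/images/carottes.jpg"),
                    ("og:url", "https://example.com/recipes/carottes-rapees"),
                ],
            }
        ],
    }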


@@ -1,8 +1,13 @@
+from typing import List, Tuple
 import json
 from pathlib import Path
-from typing import List
-from scrape_schema_recipe import scrape_url
+
+import extruct
+import requests
+from w3lib.html import get_base_url
+import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
@@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["recipeInstructions"] = normalize_instructions(
         recipe_data["recipeInstructions"]
     )
+    recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
 
-def create_from_url(url: str) -> dict:
-    recipe_data = process_recipe_url(url)
-
-    with open(TEMP_FILE, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    recipe_data = normalize_data(recipe_data)
-
-    recipe = Recipe(**recipe_data)
-    return recipe.save_to_db()
-
-
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
     slug = slugify(new_recipe["name"])
     mealie_tags = {
@@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
     return new_recipe
 
 
-def process_recipe_url(url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
-    logger.info(f"Recipe Scraped From Web: {new_recipe}")
+def extract_recipe_from_html(html: str, url: str) -> dict:
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+    if scraped_recipes:
+        new_recipe: dict = scraped_recipes[0]
+        logger.info(f"Recipe Scraped From Web: {new_recipe}")
 
-    if not new_recipe:
-        return "fail"  # TODO: Return Better Error Here
+        if not new_recipe:
+            return "fail"  # TODO: Return Better Error Here
 
-    new_recipe = process_recipe_data(new_recipe, url)
-    try:
-        img_path = scrape_image(
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
-        )
-        new_recipe["image"] = img_path.name
-    except:
-        new_recipe["image"] = None
+        new_recipe = process_recipe_data(new_recipe, url=url)
+        new_recipe = normalize_data(new_recipe)
+    else:
+        new_recipe = basic_recipe_from_opengraph(html, url)
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
 
     return new_recipe
 
 
+def download_image_for_recipe(recipe: dict) -> dict:
+    try:
+        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
+        recipe["image"] = img_path.name
+    except Exception:
+        recipe["image"] = None
+
+    return recipe
+
+
+def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
+    return next((val for name, val in properties if name == field_name), None)
+
+
+def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
+    return list({val for name, val in properties if name == field_name})
+
+
+def basic_recipe_from_opengraph(html: str, url: str) -> dict:
+    base_url = get_base_url(html, url)
+    data = extruct.extract(html, base_url=base_url)
+    properties = data["opengraph"][0]["properties"]
+    return {
+        "name": og_field(properties, "og:title"),
+        "description": og_field(properties, "og:description"),
+        "image": og_field(properties, "og:image"),
+        "recipeYield": "",
+        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
+        "recipeIngredient": ["Could not detect ingredients"],
+        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
+        "recipeInstructions": ["Could not detect instructions"],
+        "slug": slugify(og_field(properties, "og:title")),
+        "orgURL": og_field(properties, "og:url"),
+        "categories": [],
+        "tags": og_fields(properties, "og:article:tag"),
+        "dateAdded": None,
+        "notes": [],
+        "extras": [],
+    }
+
+
+def process_recipe_url(url: str) -> dict:
+    r = requests.get(url)
+    new_recipe = extract_recipe_from_html(r.text, url)
+    new_recipe = download_image_for_recipe(new_recipe)
+    return new_recipe
+
+
+def create_from_url(url: str) -> dict:
+    recipe_data = process_recipe_url(url)
+
+    with open(TEMP_FILE, "w") as f:
+        f.write(json.dumps(recipe_data, indent=4, default=str))
+
+    recipe = Recipe(**recipe_data)
+    return recipe.save_to_db()
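
A quick usage sketch for the two OpenGraph helpers above, with a made-up properties list. og_field returns the first matching value (or None, via next()'s default), while og_fields collects all matches through a set, so duplicates collapse and ordering is not preserved:

    properties = [
        ("og:title", "Healthy pasta bake"),
        ("og:article:tag", "pasta"),
        ("og:article:tag", "dinner"),
        ("og:article:tag", "pasta"),  # duplicate: collapses in og_fields
    ]

    og_field(properties, "og:title")                 # -> "Healthy pasta bake"
    og_field(properties, "og:video")                 # -> None
    sorted(og_fields(properties, "og:article:tag"))  # -> ["dinner", "pasta"]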

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,12 +1,22 @@
 import json
+import re
 from pathlib import Path
 import pytest
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
 
 CWD = Path(__file__).parent
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
+RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
+
+# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
+url_validation_regex = re.compile(
+    r'^(?:http|ftp)s?://'  # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+    r'localhost|'  # localhost...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+    r'(?::\d+)?'  # optional port
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
 
 @pytest.mark.parametrize("json_file,num_steps", [
     ("best-homemade-salsa-recipe.json", 2),
@@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps):
 ])
 def test_normalize_instructions(instructions):
     assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+
+
+def test_html_no_recipe_data():
+    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
+    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
+    assert recipe_data["recipeInstructions"] == ["Could not detect instructions"]
+
+
+def test_html_with_recipe_data():
+    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
+    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert len(recipe_data["recipeIngredient"]) == 13
+    assert len(recipe_data["recipeInstructions"]) == 4
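
The url_validation_regex assertions above only check that the scraped image field looks like an absolute URL. A standalone illustration, assuming the test module is importable as tests.test_scrape_services (the path is a guess; adjust to the repo layout):

    from tests.test_scrape_services import url_validation_regex

    assert url_validation_regex.match("https://example.com/images/pasta-bake.jpg")
    assert url_validation_regex.match("http://localhost:8080/img.png")  # localhost + port allowed
    assert url_validation_regex.match("/static/img.png") is None        # relative paths rejected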