Merge pull request #65 from richardmitic/opengraph

Use opengraph metadata to make basic recipe cards
commit 8d0604da3a
Hayden authored 2021-01-10 10:58:48 -09:00, committed by GitHub
No known key found for this signature in database. GPG key ID: 4AEE18F83AFDEB23
5 changed files with 1944 additions and 29 deletions
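
In short: when a page carries no schema.org recipe markup, the change falls back to building a bare-bones recipe card from the page's OpenGraph tags. A minimal sketch of that idea, assuming a hypothetical page whose <head> carries the usual og: meta tags (the URL below is invented; extruct and w3lib are the same libraries the diffs import):

    import extruct
    import requests
    from w3lib.html import get_base_url

    # Hypothetical page; any URL whose <head> contains
    # <meta property="og:title" ...> tags behaves the same way.
    url = "https://example.com/some-recipe"
    r = requests.get(url)

    # extruct.extract() returns one list per supported syntax
    # ("json-ld", "microdata", "opengraph", ...); each OpenGraph entry
    # exposes its tags as (name, value) tuples under "properties".
    data = extruct.extract(r.text, base_url=get_base_url(r.text, r.url))
    properties = data["opengraph"][0]["properties"]
    print(next((val for name, val in properties if name == "og:title"), None))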


@@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
-import sys, json
+import sys, json, pprint
+import requests
+import extruct
 from scrape_schema_recipe import scrape_url
+from w3lib.html import get_base_url
 
 for url in sys.argv[1:]:
     try:
@@ -16,3 +19,9 @@ for url in sys.argv[1:]:
         print(f"Saved {filename}")
     except Exception as e:
         print(f"Error for {url}: {e}")
+        print("Trying extruct instead")
+        pp = pprint.PrettyPrinter(indent=2)
+        r = requests.get(url)
+        base_url = get_base_url(r.text, r.url)
+        data = extruct.extract(r.text, base_url=base_url)
+        pp.pprint(data)
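
For reference, the pretty-printed extruct output in that fallback branch looks roughly like this (a sketch with invented values; which top-level keys appear depends on the syntaxes actually embedded in the page):

    { 'json-ld': [],
      'microdata': [],
      'opengraph': [ { 'namespace': {'og': 'http://ogp.me/ns#'},
                       'properties': [ ('og:title', 'Healthy pasta bake'),
                                       ('og:image', 'https://example.com/bake.jpg')]}]}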


@@ -1,8 +1,13 @@
+from typing import List, Tuple
 import json
 from pathlib import Path
 from typing import List
-from scrape_schema_recipe import scrape_url
+import extruct
+import requests
+from w3lib.html import get_base_url
+import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
@@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["recipeInstructions"] = normalize_instructions(
         recipe_data["recipeInstructions"]
     )
+    recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
 
-def create_from_url(url: str) -> dict:
-    recipe_data = process_recipe_url(url)
-
-    with open(TEMP_FILE, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    recipe_data = normalize_data(recipe_data)
-
-    recipe = Recipe(**recipe_data)
-    return recipe.save_to_db()
-
-
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
     slug = slugify(new_recipe["name"])
     mealie_tags = {
@@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
     return new_recipe
 
 
-def process_recipe_url(url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
-    logger.info(f"Recipe Scraped From Web: {new_recipe}")
+def extract_recipe_from_html(html: str, url: str) -> dict:
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+    if scraped_recipes:
+        new_recipe: dict = scraped_recipes[0]
+        logger.info(f"Recipe Scraped From Web: {new_recipe}")
 
-    if not new_recipe:
-        return "fail"  # TODO: Return Better Error Here
+        if not new_recipe:
+            return "fail"  # TODO: Return Better Error Here
 
-    new_recipe = process_recipe_data(new_recipe, url)
-
-    try:
-        img_path = scrape_image(
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
-        )
-        new_recipe["image"] = img_path.name
-    except:
-        new_recipe["image"] = None
+        new_recipe = process_recipe_data(new_recipe, url=url)
+        new_recipe = normalize_data(new_recipe)
+    else:
+        new_recipe = basic_recipe_from_opengraph(html, url)
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
 
     return new_recipe
+
+
+def download_image_for_recipe(recipe: dict) -> dict:
+    try:
+        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
+        recipe["image"] = img_path.name
+    except:
+        recipe["image"] = None
+
+    return recipe
+
+
+def og_field(properties: dict, field_name: str) -> str:
+    return next((val for name, val in properties if name == field_name), None)
+
+
+def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
+    return list({val for name, val in properties if name == field_name})
+
+
+def basic_recipe_from_opengraph(html: str, url: str) -> dict:
+    base_url = get_base_url(html, url)
+    data = extruct.extract(html, base_url=base_url)
+    properties = data["opengraph"][0]["properties"]
+    return {
+        "name": og_field(properties, "og:title"),
+        "description": og_field(properties, "og:description"),
+        "image": og_field(properties, "og:image"),
+        "recipeYield": "",
+        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
+        "recipeIngredient": ["Could not detect ingredients"],
+        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
+        "recipeInstructions": ["Could not detect instructions"],
+        "slug": slugify(og_field(properties, "og:title")),
+        "orgURL": og_field(properties, "og:url"),
+        "categories": [],
+        "tags": og_fields(properties, "og:article:tag"),
+        "dateAdded": None,
+        "notes": [],
+        "extras": [],
+    }
+
+
+def process_recipe_url(url: str) -> dict:
+    r = requests.get(url)
+    new_recipe = extract_recipe_from_html(r.text, url)
+    new_recipe = download_image_for_recipe(new_recipe)
+    return new_recipe
+
+
+def create_from_url(url: str) -> dict:
+    recipe_data = process_recipe_url(url)
+
+    with open(TEMP_FILE, "w") as f:
+        f.write(json.dumps(recipe_data, indent=4, default=str))
+
+    recipe = Recipe(**recipe_data)
+    return recipe.save_to_db()
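
A quick illustration of the two OpenGraph helpers above, using an invented properties list: og_field returns the first value matching a name (or None), while og_fields collects every match, deduplicated through a set, so the original tag order is not preserved:

    properties = [
        ("og:title", "Carrot salad"),
        ("og:article:tag", "salad"),
        ("og:article:tag", "carrots"),
        ("og:article:tag", "salad"),  # repeated tag collapses in og_fields
    ]

    og_field(properties, "og:title")                 # -> "Carrot salad"
    og_field(properties, "og:video")                 # -> None
    sorted(og_fields(properties, "og:article:tag"))  # -> ["carrots", "salad"]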

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,12 +1,22 @@
 import json
+import re
 from pathlib import Path
 
 import pytest
 
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
 
 CWD = Path(__file__).parent
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
+RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
+
+# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
+url_validation_regex = re.compile(
+    r'^(?:http|ftp)s?://'  # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+    r'localhost|'  # localhost...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+    r'(?::\d+)?'  # optional port
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
 
 @pytest.mark.parametrize("json_file,num_steps", [
     ("best-homemade-salsa-recipe.json", 2),
@@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps):
 ])
 def test_normalize_instructions(instructions):
     assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+
+
+def test_html_no_recipe_data():
+    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
+    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert recipe_data["recipeIngredient"] == []
+    assert recipe_data["recipeInstructions"] == []
+
+
+def test_html_with_recipe_data():
+    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
+    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert len(recipe_data["recipeIngredient"]) == 13
+    assert len(recipe_data["recipeInstructions"]) == 4
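
As a sanity check on the borrowed Django regex (illustrative only, not part of the diff), it accepts absolute http(s) URLs, including localhost with a port, and rejects plain strings:

    assert url_validation_regex.match("https://example.com/images/bake.jpg")
    assert url_validation_regex.match("http://localhost:8080/img.png")
    assert not url_validation_regex.match("not-a-url")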