Merge branch 'dev' of https://github.com/hay-kot/mealie into sqlite

This commit is contained in:
Hayden 2021-01-10 11:08:56 -09:00
commit 760d30c8fc
7 changed files with 1998 additions and 134 deletions

1
.gitignore vendored
View file

@ -150,3 +150,4 @@ ENV/
# Node Modules # Node Modules
node_modules/ node_modules/
mealie/data/debug/last_recipe.json

View file

@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
The resulting files can be used as test input data. The resulting files can be used as test input data.
""" """
import sys, json import sys, json, pprint
import requests
import extruct
from scrape_schema_recipe import scrape_url from scrape_schema_recipe import scrape_url
from w3lib.html import get_base_url
for url in sys.argv[1:]: for url in sys.argv[1:]:
try: try:
@ -16,3 +19,9 @@ for url in sys.argv[1:]:
print(f"Saved {filename}") print(f"Saved {filename}")
except Exception as e: except Exception as e:
print(f"Error for {url}: {e}") print(f"Error for {url}: {e}")
print("Trying extruct instead")
pp = pprint.PrettyPrinter(indent=2)
r = requests.get(url)
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)
pp.pprint(data)

View file

@ -1,91 +1,18 @@
{ {
"@context": "http://schema.org/", "name": "Carottes Rapp\u00e9s with Rice and Sunflower Seeds \u2014 FEED THE SWIMMERS",
"@type": "Recipe", "description": " Carottes R\u00e2p\u00e9es with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes R\u00e2p\u00e9es is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots ar",
"name": "Jalape\u00f1o Popper Dip", "image": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.JPG?format=1500w",
"author": { "recipeYield": "",
"@type": "Person",
"name": "Michelle"
},
"description": "Jalapeno Popper Dip is creamy, cheesy and has just the perfect amount of kick. Great appetizer for your next party or watching the big game!",
"datePublished": "2016-02-22 00:01:37+00:00",
"image": "jalapeno-popper-dip.jpg",
"recipeYield": [
"10",
"10 to 12 servings"
],
"prepTime": "0:15:00",
"cookTime": "0:30:00",
"totalTime": "0:45:00",
"recipeIngredient": [ "recipeIngredient": [
"16 ounces cream cheese (at room temperature)", "Could not detect ingredients"
"1 cup mayonnaise",
"8 pieces of bacon (cooked and chopped)",
"6 jalape\u00f1os (seeded and minced (if you can't get fresh, substitute a 4-ounce can diced jalape\u00f1o peppers, drained))",
"2 cloves garlic (minced)",
"\u00bd teaspoon cumin",
"6 ounces cheddar cheese (shredded (about 1\u00bd cups))",
"1 cup panko breadcrumbs",
"1 cup grated Parmesan cheese",
"4 tablespoons unsalted butter, melted"
], ],
"recipeInstructions": [ "recipeInstructions": [
{ {
"@type": "HowToStep", "text": "Could not detect instructions"
"text": "Preheat oven to 375 degrees F.",
"name": "Preheat oven to 375 degrees F.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-0"
},
{
"@type": "HowToStep",
"text": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
"name": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-1"
},
{
"@type": "HowToStep",
"text": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
"name": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-2"
},
{
"@type": "HowToStep",
"text": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
"name": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-3"
} }
], ],
"aggregateRating": { "slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers",
"@type": "AggregateRating", "orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds",
"ratingValue": "4.34",
"ratingCount": "15"
},
"recipeCategory": [
"Appetizer"
],
"recipeCuisine": [
"American"
],
"keywords": "cheese dip, game day food, party food",
"nutrition": {
"@type": "NutritionInformation",
"calories": "560 kcal",
"carbohydrateContent": "7 g",
"proteinContent": "14 g",
"fatContent": "52 g",
"saturatedFatContent": "21 g",
"cholesterolContent": "109 mg",
"sodiumContent": "707 mg",
"sugarContent": "2 g",
"servingSize": "1 serving"
},
"@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#recipe",
"isPartOf": {
"@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#article"
},
"mainEntityOfPage": "https://www.browneyedbaker.com/jalapeno-popper-dip/#webpage",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/",
"slug": "jalapeno-popper-dip",
"orgURL": "http://www.browneyedbaker.com/2011/08/03/jalapeno-popper-dip/",
"categories": [], "categories": [],
"tags": [], "tags": [],
"dateAdded": null, "dateAdded": null,

View file

@ -1,10 +1,13 @@
import json import json
from pathlib import Path from pathlib import Path
from typing import List from typing import List, Tuple
from scrape_schema_recipe import scrape_url import extruct
import requests
import scrape_schema_recipe
from slugify import slugify from slugify import slugify
from utils.logger import logger from utils.logger import logger
from w3lib.html import get_base_url
from services.image_services import scrape_image from services.image_services import scrape_image
from services.recipe_services import Recipe from services.recipe_services import Recipe
@ -59,21 +62,10 @@ def normalize_data(recipe_data: dict) -> dict:
recipe_data["recipeInstructions"] = normalize_instructions( recipe_data["recipeInstructions"] = normalize_instructions(
recipe_data["recipeInstructions"] recipe_data["recipeInstructions"]
) )
recipe_data["image"] = normalize_image_url(recipe_data["image"])
return recipe_data return recipe_data
def create_from_url(url: str) -> dict:
recipe_data = process_recipe_url(url)
with open(TEMP_FILE, "w") as f:
f.write(json.dumps(recipe_data, indent=4, default=str))
recipe_data = normalize_data(recipe_data)
recipe = Recipe(**recipe_data)
return recipe.save_to_db()
def process_recipe_data(new_recipe: dict, url=None) -> dict: def process_recipe_data(new_recipe: dict, url=None) -> dict:
slug = slugify(new_recipe["name"]) slug = slugify(new_recipe["name"])
mealie_tags = { mealie_tags = {
@ -91,21 +83,78 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
return new_recipe return new_recipe
def process_recipe_url(url: str) -> dict: def extract_recipe_from_html(html: str, url: str) -> dict:
new_recipe: dict = scrape_url(url, python_objects=True)[0] scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
if scraped_recipes:
new_recipe: dict = scraped_recipes[0]
logger.info(f"Recipe Scraped From Web: {new_recipe}") logger.info(f"Recipe Scraped From Web: {new_recipe}")
if not new_recipe: if not new_recipe:
return "fail" # TODO: Return Better Error Here return "fail" # TODO: Return Better Error Here
new_recipe = process_recipe_data(new_recipe, url) new_recipe = process_recipe_data(new_recipe, url=url)
new_recipe = normalize_data(new_recipe)
try: else:
img_path = scrape_image( new_recipe = basic_recipe_from_opengraph(html, url)
normalize_image_url(new_recipe.get("image")), new_recipe.get("slug") logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
)
new_recipe["image"] = img_path.name
except:
new_recipe["image"] = None
return new_recipe return new_recipe
def download_image_for_recipe(recipe: dict) -> dict:
    """Download the recipe's image and store the local file name on the recipe.

    Args:
        recipe: Recipe data; its ``"image"`` and ``"slug"`` entries are passed
            to ``scrape_image``.

    Returns:
        The same ``recipe`` dict, mutated in place. ``recipe["image"]`` is set
        to the ``name`` of the path returned by ``scrape_image``, or to
        ``None`` if the download (or reading the result) fails.
    """
    try:
        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
        recipe["image"] = img_path.name
    except Exception:
        # Best effort: a missing image must not abort the import. Catch
        # Exception rather than using a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate, and log the failure instead of
        # hiding it.
        logger.exception(f"Could not download image for recipe: {recipe.get('slug')}")
        recipe["image"] = None

    return recipe
def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
    """Return the first value stored under ``field_name`` in ``properties``.

    Args:
        properties: ``(name, value)`` pairs, scanned in order. (This was
            previously annotated as ``dict``, but the pairs are unpacked
            directly, exactly as in ``og_fields``.)
        field_name: Property name to look up, e.g. ``"og:title"``.

    Returns:
        The value of the first pair whose name equals ``field_name``, or
        ``None`` when no pair matches.
    """
    # NOTE(review): the ``-> str`` annotation is kept for interface stability,
    # but callers must be prepared for ``None`` when the field is absent.
    for name, val in properties:
        if name == field_name:
            return val
    return None
def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
    """Return every distinct value stored under ``field_name``.

    Args:
        properties: ``(name, value)`` pairs, scanned in order.
        field_name: Property name to collect, e.g. ``"og:article:tag"``.

    Returns:
        The unique values of all pairs whose name equals ``field_name``, in
        the order they first appear. Empty list when nothing matches.
    """
    matches = (val for name, val in properties if name == field_name)
    # dict.fromkeys drops duplicates while keeping first-seen order. The
    # earlier set-based version returned the values in arbitrary order, which
    # could differ from run to run.
    return list(dict.fromkeys(matches))
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    """Build a minimal recipe dict from a page's OpenGraph metadata.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from. Used to resolve the document's
            base URL and as a fallback when ``og:title`` / ``og:url`` are
            missing.

    Returns:
        A recipe dict with name, description, image, slug, source URL and
        tags taken from the first OpenGraph block, plus placeholder
        ingredients/instructions and empty defaults for the other fields.
    """
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)

    # A page may carry no OpenGraph block at all; fall back to an empty
    # property list instead of raising IndexError on ``[0]``.
    opengraph = data.get("opengraph") or []
    properties = opengraph[0]["properties"] if opengraph else []

    # Look the title up once and never hand ``None`` to slugify: the URL is
    # the only other identifier available for the page.
    title = og_field(properties, "og:title") or url

    return {
        "name": title,
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(title),
        # Prefer the canonical URL the page advertises; otherwise keep the
        # URL we actually fetched.
        "orgURL": og_field(properties, "og:url") or url,
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
def process_recipe_url(url: str) -> dict:
    """Fetch ``url``, extract recipe data from the page and download its image.

    Args:
        url: Address of the recipe page.

    Returns:
        The recipe dict produced by ``extract_recipe_from_html`` after
        ``download_image_for_recipe`` has processed it.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not respond within the timeout.
    """
    # requests has no default timeout; without one an unresponsive host would
    # block this call indefinitely.
    r = requests.get(url, timeout=30)

    new_recipe = extract_recipe_from_html(r.text, url)
    new_recipe = download_image_for_recipe(new_recipe)
    return new_recipe
def create_from_url(url: str) -> dict:
    """Scrape the recipe at ``url``, dump it to ``TEMP_FILE`` and save it.

    Args:
        url: Address of the recipe page to import.

    Returns:
        Whatever ``Recipe.save_to_db()`` returns for the new recipe.
    """
    scraped = process_recipe_url(url)

    # Write the scraped payload to TEMP_FILE before it is turned into a
    # Recipe (presumably a debugging aid; confirm against TEMP_FILE's
    # definition). default=str stringifies values json cannot serialize.
    with open(TEMP_FILE, "w") as dump_file:
        dump_file.write(json.dumps(scraped, indent=4, default=str))

    return Recipe(**scraped).save_to_db()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1,16 +1,38 @@
import json import json
import re
from pathlib import Path from pathlib import Path
import pytest import pytest
from services.scrape_services import normalize_data, normalize_instructions from services.scrape_services import (
extract_recipe_from_html,
normalize_data,
normalize_instructions,
)
CWD = Path(__file__).parent CWD = Path(__file__).parent
RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw") RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
# Pattern used by the tests below to check that a scraped image value is an
# absolute URL. Based on Django's URL validator:
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
# Compiled with re.IGNORECASE, so the A-Z character classes also match lowercase.
url_validation_regex = re.compile(
r"^(?:http|ftp)s?://" # scheme: http://, https://, ftp:// or ftps://
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
r"localhost|" # localhost...
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$", # optional trailing slash, or a path/query without whitespace
re.IGNORECASE,
)
@pytest.mark.parametrize("json_file,num_steps", [ @pytest.mark.parametrize(
"json_file,num_steps",
[
("best-homemade-salsa-recipe.json", 2), ("best-homemade-salsa-recipe.json", 2),
("blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", 3), (
"blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json",
3,
),
("bon_appetit.json", 8), ("bon_appetit.json", 8),
("chunky-apple-cake.json", 4), ("chunky-apple-cake.json", 4),
("dairy-free-impossible-pumpkin-pie.json", 7), ("dairy-free-impossible-pumpkin-pie.json", 7),
@ -21,19 +43,57 @@ RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
("microwave_sweet_potatoes_04783.json", 4), ("microwave_sweet_potatoes_04783.json", 4),
("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4), ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3), ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
]) ],
)
def test_normalize_data(json_file, num_steps): def test_normalize_data(json_file, num_steps):
recipe_data = normalize_data(json.load(open(RAW_RECIPE_DIR.joinpath(json_file)))) recipe_data = normalize_data(json.load(open(RAW_RECIPE_DIR.joinpath(json_file))))
assert len(recipe_data["recipeInstructions"]) == num_steps assert len(recipe_data["recipeInstructions"]) == num_steps
@pytest.mark.parametrize("instructions", [ @pytest.mark.parametrize(
"instructions",
[
"A\n\nB\n\nC\n\n", "A\n\nB\n\nC\n\n",
"A\nB\nC\n", "A\nB\nC\n",
"A\r\n\r\nB\r\n\r\nC\r\n\r\n", "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
"A\r\nB\r\nC\r\n", "A\r\nB\r\nC\r\n",
["A","B","C"], ["A", "B", "C"],
[{"@type": "HowToStep", "text": x} for x in ["A","B","C"]] [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]],
]) ],
)
def test_normalize_instructions(instructions): def test_normalize_instructions(instructions):
assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}] assert normalize_instructions(instructions) == [
{"text": "A"},
{"text": "B"},
{"text": "C"},
]
def test_html_no_recipe_data():
    """A page without recipe markup yields a recipe built from OpenGraph data."""
    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
    # Read through a context manager so the fixture's file handle is closed
    # (a bare open(path).read() leaves that to the garbage collector).
    with open(path) as html_file:
        html = html_file.read()
    recipe_data = extract_recipe_from_html(html, url)

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    # With no structured recipe data, placeholder content is expected.
    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
    assert recipe_data["recipeInstructions"] == [
        {"text": "Could not detect instructions"}
    ]
def test_html_with_recipe_data():
    """A page with recipe markup yields the full ingredient and step lists."""
    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    # Read through a context manager so the fixture's file handle is closed
    # (a bare open(path).read() leaves that to the garbage collector).
    with open(path) as html_file:
        html = html_file.read()
    recipe_data = extract_recipe_from_html(html, url)

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
    assert len(recipe_data["recipeIngredient"]) == 13
    assert len(recipe_data["recipeInstructions"]) == 4