Mirror of https://github.com/hay-kot/mealie.git, synced 2025-08-22 06:23:34 -07:00

Commit 760d30c8fc: Merge branch 'dev' of https://github.com/hay-kot/mealie into sqlite

7 changed files with 1998 additions and 134 deletions

.gitignore (vendored), 1 change:

@@ -150,3 +150,4 @@ ENV/
 # Node Modules
 node_modules/
 
+mealie/data/debug/last_recipe.json

@@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk.
 The resulting files can be used as test input data.
 """
 
-import sys, json
+import sys, json, pprint
+import requests
+import extruct
 from scrape_schema_recipe import scrape_url
+from w3lib.html import get_base_url
 
 for url in sys.argv[1:]:
     try:
@@ -16,3 +19,9 @@ for url in sys.argv[1:]:
         print(f"Saved {filename}")
     except Exception as e:
         print(f"Error for {url}: {e}")
+        print("Trying extruct instead")
+        pp = pprint.PrettyPrinter(indent=2)
+        r = requests.get(url)
+        base_url = get_base_url(r.text, r.url)
+        data = extruct.extract(r.text, base_url=base_url)
+        pp.pprint(data)
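
The fallback added above leans on the shape of extruct's output: extruct.extract returns a dict keyed by metadata syntax ("json-ld", "microdata", "opengraph", "rdfa", ...), each holding a list of items found in the page. A minimal standalone sketch of that path, using a hypothetical URL:

    import pprint

    import extruct
    import requests
    from w3lib.html import get_base_url

    url = "https://example.com/some-recipe"  # hypothetical URL
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Each syntax maps to a list of items; opengraph items carry a
    # "properties" list of (name, value) pairs such as ("og:title", ...).
    pprint.PrettyPrinter(indent=2).pprint(data.get("opengraph", []))

Dumping the whole dict, as the script does, makes it easy to see which syntaxes (if any) a problem page actually exposes.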

@@ -1,91 +1,18 @@
 {
-    "@context": "http://schema.org/",
-    "@type": "Recipe",
-    "name": "Jalape\u00f1o Popper Dip",
-    "author": {
-        "@type": "Person",
-        "name": "Michelle"
-    },
-    "description": "Jalapeno Popper Dip is creamy, cheesy and has just the perfect amount of kick. Great appetizer for your next party or watching the big game!",
-    "datePublished": "2016-02-22 00:01:37+00:00",
-    "image": "jalapeno-popper-dip.jpg",
-    "recipeYield": [
-        "10",
-        "10 to 12 servings"
-    ],
-    "prepTime": "0:15:00",
-    "cookTime": "0:30:00",
-    "totalTime": "0:45:00",
+    "name": "Carottes Rapp\u00e9s with Rice and Sunflower Seeds \u2014 FEED THE SWIMMERS",
+    "description": " Carottes R\u00e2p\u00e9es with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes R\u00e2p\u00e9es is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots ar",
+    "image": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.JPG?format=1500w",
+    "recipeYield": "",
     "recipeIngredient": [
-        "16 ounces cream cheese (at room temperature)",
-        "1 cup mayonnaise",
-        "8 pieces of bacon (cooked and chopped)",
-        "6 jalape\u00f1os (seeded and minced (if you can't get fresh, substitute a 4-ounce can diced jalape\u00f1o peppers, drained))",
-        "2 cloves garlic (minced)",
-        "\u00bd teaspoon cumin",
-        "6 ounces cheddar cheese (shredded (about 1\u00bd cups))",
-        "1 cup panko breadcrumbs",
-        "1 cup grated Parmesan cheese",
-        "4 tablespoons unsalted butter, melted"
+        "Could not detect ingredients"
     ],
     "recipeInstructions": [
         {
-            "@type": "HowToStep",
-            "text": "Preheat oven to 375 degrees F.",
-            "name": "Preheat oven to 375 degrees F.",
-            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-0"
-        },
-        {
-            "@type": "HowToStep",
-            "text": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
-            "name": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
-            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-1"
-        },
-        {
-            "@type": "HowToStep",
-            "text": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
-            "name": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
-            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-2"
-        },
-        {
-            "@type": "HowToStep",
-            "text": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
-            "name": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
-            "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-3"
+            "text": "Could not detect instructions"
         }
     ],
-    "aggregateRating": {
-        "@type": "AggregateRating",
-        "ratingValue": "4.34",
-        "ratingCount": "15"
-    },
-    "recipeCategory": [
-        "Appetizer"
-    ],
-    "recipeCuisine": [
-        "American"
-    ],
-    "keywords": "cheese dip, game day food, party food",
-    "nutrition": {
-        "@type": "NutritionInformation",
-        "calories": "560 kcal",
-        "carbohydrateContent": "7 g",
-        "proteinContent": "14 g",
-        "fatContent": "52 g",
-        "saturatedFatContent": "21 g",
-        "cholesterolContent": "109 mg",
-        "sodiumContent": "707 mg",
-        "sugarContent": "2 g",
-        "servingSize": "1 serving"
-    },
-    "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#recipe",
-    "isPartOf": {
-        "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#article"
-    },
-    "mainEntityOfPage": "https://www.browneyedbaker.com/jalapeno-popper-dip/#webpage",
-    "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/",
-    "slug": "jalapeno-popper-dip",
-    "orgURL": "http://www.browneyedbaker.com/2011/08/03/jalapeno-popper-dip/",
+    "slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers",
+    "orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds",
     "categories": [],
     "tags": [],
     "dateAdded": null,
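
This fixture now captures what the opengraph fallback produces when a page exposes no schema.org recipe data: a real title, description, and image, but placeholder markers for ingredients and instructions. A quick sanity check of such a dump (a sketch; the fixture's path and file name are assumptions inferred from the recipe slug and the test-data layout):

    import json
    from pathlib import Path

    # Assumed location and name; adjust to the actual fixture path.
    fixture = Path("mealie/test/data/recipes-raw").joinpath(
        "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.json"
    )
    data = json.loads(fixture.read_text())

    # Fallback output is recognizable by its placeholder markers.
    assert data["recipeIngredient"] == ["Could not detect ingredients"]
    assert data["recipeInstructions"] == [{"text": "Could not detect instructions"}]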

@@ -1,10 +1,13 @@
 import json
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
 
-from scrape_schema_recipe import scrape_url
+import extruct
+import requests
+import scrape_schema_recipe
 from slugify import slugify
 from utils.logger import logger
+from w3lib.html import get_base_url
 
 from services.image_services import scrape_image
 from services.recipe_services import Recipe
@@ -59,21 +62,10 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["recipeInstructions"] = normalize_instructions(
         recipe_data["recipeInstructions"]
     )
+    recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
 
-def create_from_url(url: str) -> dict:
-    recipe_data = process_recipe_url(url)
-
-    with open(TEMP_FILE, "w") as f:
-        f.write(json.dumps(recipe_data, indent=4, default=str))
-
-    recipe_data = normalize_data(recipe_data)
-    recipe = Recipe(**recipe_data)
-
-    return recipe.save_to_db()
-
-
 def process_recipe_data(new_recipe: dict, url=None) -> dict:
     slug = slugify(new_recipe["name"])
     mealie_tags = {
@@ -91,21 +83,78 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
     return new_recipe
 
 
-def process_recipe_url(url: str) -> dict:
-    new_recipe: dict = scrape_url(url, python_objects=True)[0]
-    logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
-    if not new_recipe:
-        return "fail"  # TODO: Return Better Error Here
-
-    new_recipe = process_recipe_data(new_recipe, url)
-
-    try:
-        img_path = scrape_image(
-            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
-        )
-        new_recipe["image"] = img_path.name
-    except:
-        new_recipe["image"] = None
+def extract_recipe_from_html(html: str, url: str) -> dict:
+    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+    if scraped_recipes:
+        new_recipe: dict = scraped_recipes[0]
+        logger.info(f"Recipe Scraped From Web: {new_recipe}")
+
+        if not new_recipe:
+            return "fail"  # TODO: Return Better Error Here
+
+        new_recipe = process_recipe_data(new_recipe, url=url)
+        new_recipe = normalize_data(new_recipe)
+    else:
+        new_recipe = basic_recipe_from_opengraph(html, url)
+        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
 
     return new_recipe
+
+
+def download_image_for_recipe(recipe: dict) -> dict:
+    try:
+        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
+        recipe["image"] = img_path.name
+    except:
+        recipe["image"] = None
+
+    return recipe
+
+
+def og_field(properties: dict, field_name: str) -> str:
+    return next((val for name, val in properties if name == field_name), None)
+
+
+def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
+    return list({val for name, val in properties if name == field_name})
+
+
+def basic_recipe_from_opengraph(html: str, url: str) -> dict:
+    base_url = get_base_url(html, url)
+    data = extruct.extract(html, base_url=base_url)
+    properties = data["opengraph"][0]["properties"]
+    return {
+        "name": og_field(properties, "og:title"),
+        "description": og_field(properties, "og:description"),
+        "image": og_field(properties, "og:image"),
+        "recipeYield": "",
+        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
+        "recipeIngredient": ["Could not detect ingredients"],
+        # FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity.
+        "recipeInstructions": [{"text": "Could not detect instructions"}],
+        "slug": slugify(og_field(properties, "og:title")),
+        "orgURL": og_field(properties, "og:url"),
+        "categories": [],
+        "tags": og_fields(properties, "og:article:tag"),
+        "dateAdded": None,
+        "notes": [],
+        "extras": [],
+    }
+
+
+def process_recipe_url(url: str) -> dict:
+    r = requests.get(url)
+    new_recipe = extract_recipe_from_html(r.text, url)
+    new_recipe = download_image_for_recipe(new_recipe)
+    return new_recipe
+
+
+def create_from_url(url: str) -> dict:
+    recipe_data = process_recipe_url(url)
+
+    with open(TEMP_FILE, "w") as f:
+        f.write(json.dumps(recipe_data, indent=4, default=str))
+
+    recipe = Recipe(**recipe_data)
+
+    return recipe.save_to_db()
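
For clarity on the two small opengraph helpers above: extruct reports opengraph properties as a list of (name, value) tuples, og_field picks the first value for a given name (or None), and og_fields collects every value for a repeated name, deduplicated via a set. A self-contained sketch with made-up values:

    from typing import List, Tuple

    def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
        # First matching value, or None when the page lacks the field.
        return next((val for name, val in properties if name == field_name), None)

    def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
        # All matching values, deduplicated (set order is unspecified).
        return list({val for name, val in properties if name == field_name})

    # Made-up opengraph properties in the (name, value) shape extruct reports.
    properties = [
        ("og:title", "Healthy pasta bake"),
        ("og:article:tag", "pasta"),
        ("og:article:tag", "bake"),
    ]
    print(og_field(properties, "og:title"))         # Healthy pasta bake
    print(og_field(properties, "og:image"))         # None: field absent
    print(og_fields(properties, "og:article:tag"))  # ['pasta', 'bake'], order may vary

Note that the set comprehension means og_fields does not preserve tag order, which is fine for tags but would matter for ordered fields.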

mealie/test/data/html-raw/healthy_pasta_bake_60759.html (new file, 500 lines)
File diff suppressed because one or more lines are too long

@@ -1,16 +1,38 @@
 import json
+import re
 from pathlib import Path
 
 import pytest
-from services.scrape_services import normalize_data, normalize_instructions
+from services.scrape_services import (
+    extract_recipe_from_html,
+    normalize_data,
+    normalize_instructions,
+)
 
 CWD = Path(__file__).parent
 RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
+RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
+
+# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
+url_validation_regex = re.compile(
+    r"^(?:http|ftp)s?://"  # http:// or https://
+    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
+    r"localhost|"  # localhost...
+    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
+    r"(?::\d+)?"  # optional port
+    r"(?:/?|[/?]\S+)$",
+    re.IGNORECASE,
+)
 
 
-@pytest.mark.parametrize("json_file,num_steps", [
-    ("best-homemade-salsa-recipe.json", 2),
-    ("blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", 3),
-    ("bon_appetit.json", 8),
-    ("chunky-apple-cake.json", 4),
-    ("dairy-free-impossible-pumpkin-pie.json", 7),
+@pytest.mark.parametrize(
+    "json_file,num_steps",
+    [
+        ("best-homemade-salsa-recipe.json", 2),
+        (
+            "blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json",
+            3,
+        ),
+        ("bon_appetit.json", 8),
+        ("chunky-apple-cake.json", 4),
+        ("dairy-free-impossible-pumpkin-pie.json", 7),
@@ -21,19 +43,57 @@ RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
-    ("microwave_sweet_potatoes_04783.json", 4),
-    ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
-    ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
-])
+        ("microwave_sweet_potatoes_04783.json", 4),
+        ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
+        ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
+    ],
+)
 def test_normalize_data(json_file, num_steps):
     recipe_data = normalize_data(json.load(open(RAW_RECIPE_DIR.joinpath(json_file))))
     assert len(recipe_data["recipeInstructions"]) == num_steps
 
 
-@pytest.mark.parametrize("instructions", [
-    "A\n\nB\n\nC\n\n",
-    "A\nB\nC\n",
-    "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
-    "A\r\nB\r\nC\r\n",
-    ["A","B","C"],
-    [{"@type": "HowToStep", "text": x} for x in ["A","B","C"]]
-])
+@pytest.mark.parametrize(
+    "instructions",
+    [
+        "A\n\nB\n\nC\n\n",
+        "A\nB\nC\n",
+        "A\r\n\r\nB\r\n\r\nC\r\n\r\n",
+        "A\r\nB\r\nC\r\n",
+        ["A", "B", "C"],
+        [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]],
+    ],
+)
 def test_normalize_instructions(instructions):
-    assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}]
+    assert normalize_instructions(instructions) == [
+        {"text": "A"},
+        {"text": "B"},
+        {"text": "C"},
+    ]
+
+
+def test_html_no_recipe_data():
+    path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html")
+    url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
+    assert recipe_data["recipeInstructions"] == [
+        {"text": "Could not detect instructions"}
+    ]
+
+
+def test_html_with_recipe_data():
+    path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html")
+    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
+    recipe_data = extract_recipe_from_html(open(path).read(), url)
+
+    assert len(recipe_data["name"]) > 10
+    assert len(recipe_data["slug"]) > 10
+    assert recipe_data["orgURL"] == url
+    assert len(recipe_data["description"]) > 100
+    assert url_validation_regex.match(recipe_data["image"])
+    assert len(recipe_data["recipeIngredient"]) == 13
+    assert len(recipe_data["recipeInstructions"]) == 4
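
These tests run entirely offline: each reads a checked-in HTML fixture and feeds it to extract_recipe_from_html, so scraper regressions surface without network access. The same pattern works interactively (a sketch; it assumes you run from a directory where the services package is importable, and the fixture path matches the file listed above):

    from pathlib import Path

    from services.scrape_services import extract_recipe_from_html

    # Parse a saved page with no network I/O, exactly as the tests do.
    html = Path("mealie/test/data/html-raw/healthy_pasta_bake_60759.html").read_text()
    recipe = extract_recipe_from_html(
        html, "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
    )
    print(recipe["name"], len(recipe["recipeIngredient"]))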