From b65445d4b1eb2d277f3af95a70b20872729d401c Mon Sep 17 00:00:00 2001
From: hay-kot
Date: Wed, 9 Jun 2021 12:03:39 -0800
Subject: [PATCH] rewrite scraper to use new library
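
Replace the scrape_schema_recipe based extraction in scraper.py with the
recipe_scrapers library (scrape_me, with a wild-mode and open graph
fallback), convert the Cleaner class into module-level functions in
cleaner.py, and expose a /test-scrape-url debug endpoint plus a
ScraperDebugger page on the frontend. New tests in
tests/unit_tests/test_recipe_parser.py exercise the parser against live
recipe sites.

A minimal usage sketch of the new entry point (the URL is taken from the
test cases; printed fields are illustrative only):

    from mealie.services.scraper import scraper

    # scrape, clean, and download the image for a recipe page
    recipe = scraper.create_from_url("https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza")
    print(recipe.slug, len(recipe.recipe_ingredient), len(recipe.recipe_instructions))
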
---
frontend/src/api/recipe.js | 6 +
frontend/src/components/UI/TheRecipeFab.vue | 18 +-
frontend/src/pages/Recipe/ScraperDebugger.vue | 62 +++++
frontend/src/pages/Recipe/ViewRecipe.vue | 4 +-
frontend/src/routes/recipes.js | 2 +
mealie/routes/recipe/recipe_crud_routes.py | 6 +
mealie/services/image/image.py | 3 +-
mealie/services/image/minify.py | 2 +
mealie/services/migrations/_migration_base.py | 4 +-
mealie/services/scraper/cleaner.py | 253 +++++++++---------
mealie/services/scraper/scraper.py | 171 +++++++++---
tests/unit_tests/test_cleaner.py | 22 +-
tests/unit_tests/test_recipe_parser.py | 62 +++++
13 files changed, 421 insertions(+), 194 deletions(-)
create mode 100644 frontend/src/pages/Recipe/ScraperDebugger.vue
create mode 100644 tests/unit_tests/test_recipe_parser.py
diff --git a/frontend/src/api/recipe.js b/frontend/src/api/recipe.js
index e5afffd39..c05189d27 100644
--- a/frontend/src/api/recipe.js
+++ b/frontend/src/api/recipe.js
@@ -12,6 +12,7 @@ const recipeURLs = {
allRecipesByCategory: prefix + "category",
create: prefix + "create",
createByURL: prefix + "create-url",
+ testParseURL: prefix + "test-scrape-url",
recipe: slug => prefix + slug,
update: slug => prefix + slug,
delete: slug => prefix + slug,
@@ -183,4 +184,9 @@ export const recipeAPI = {
const response = await apiReq.delete(API_ROUTES.recipesSlugCommentsId(slug, id));
return response.data;
},
+
+ async testScrapeURL(url) {
+ const response = await apiReq.post(recipeURLs.testParseURL, { url: url });
+ return response.data;
+ },
};
diff --git a/frontend/src/components/UI/TheRecipeFab.vue b/frontend/src/components/UI/TheRecipeFab.vue
index 8f4e5f951..030aea6ad 100644
--- a/frontend/src/components/UI/TheRecipeFab.vue
+++ b/frontend/src/components/UI/TheRecipeFab.vue
@@ -54,13 +54,15 @@
-
-
-
- mdi-download {{ $t("about.download-recipe-json") }}
-
-
-
+
+ mdi-external-link
+ View Scraped Data
+
@@ -100,9 +102,7 @@
diff --git a/frontend/src/pages/Recipe/ViewRecipe.vue b/frontend/src/pages/Recipe/ViewRecipe.vue
index 938377bfa..b6138de51 100644
--- a/frontend/src/pages/Recipe/ViewRecipe.vue
+++ b/frontend/src/pages/Recipe/ViewRecipe.vue
@@ -63,7 +63,7 @@ import RecipeViewer from "@/components/Recipe/RecipeViewer";
import PrintView from "@/components/Recipe/PrintView";
import RecipeEditor from "@/components/Recipe/RecipeEditor";
import RecipeTimeCard from "@/components/Recipe/RecipeTimeCard.vue";
-import EditorButtonRow from "@/components/Recipe/EditorButtonRow";
+import EditorButtonRow from "@/components/Recipe/EditorButtonRow.vue";
import NoRecipe from "@/components/Fallbacks/NoRecipe";
import { user } from "@/mixins/user";
import { router } from "@/routes";
@@ -133,7 +133,7 @@ export default {
},
watch: {
- $route: function() {
+ $route: function () {
this.getRecipeDetails();
},
},
diff --git a/frontend/src/routes/recipes.js b/frontend/src/routes/recipes.js
index d21588772..25140f3c6 100644
--- a/frontend/src/routes/recipes.js
+++ b/frontend/src/routes/recipes.js
@@ -1,5 +1,6 @@
const ViewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ViewRecipe");
const NewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/NewRecipe");
+const ScraperDebugger = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ScraperDebugger");
const CustomPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CustomPage");
const AllRecipes = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/AllRecipes");
const CategoryTagPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CategoryTagPage");
@@ -9,6 +10,7 @@ import { api } from "@/api";
export const recipeRoutes = [
// Recipes
{ path: "/recipes/all", component: AllRecipes },
+ { path: "/recipes/debugger", component: ScraperDebugger },
{ path: "/user/:id/favorites", component: Favorites },
{ path: "/recipes/tag/:tag", component: CategoryTagPage },
{ path: "/recipes/tag", component: CategoryTagPage },
diff --git a/mealie/routes/recipe/recipe_crud_routes.py b/mealie/routes/recipe/recipe_crud_routes.py
index 60a49789b..d4fbdcf35 100644
--- a/mealie/routes/recipe/recipe_crud_routes.py
+++ b/mealie/routes/recipe/recipe_crud_routes.py
@@ -13,6 +13,7 @@ from mealie.services.events import create_recipe_event
from mealie.services.image.image import scrape_image, write_image
from mealie.services.recipe.media import check_assets, delete_assets
from mealie.services.scraper.scraper import create_from_url
+from scrape_schema_recipe import scrape_url
from slugify import slugify
from sqlalchemy.orm.session import Session
@@ -41,6 +42,11 @@ def create_from_json(
return recipe.slug
+@router.post("/test-scrape-url", dependencies=[Depends(get_current_user)])
+def test_parse_recipe_url(url: RecipeURLIn):
+ return scrape_url(url.url)
+
+
@router.post("/create-url", status_code=201, response_model=str)
def parse_recipe_url(
background_tasks: BackgroundTasks,
diff --git a/mealie/services/image/image.py b/mealie/services/image/image.py
index 9bcee6e7b..a8a3e45df 100644
--- a/mealie/services/image/image.py
+++ b/mealie/services/image/image.py
@@ -42,6 +42,7 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:
def scrape_image(image_url: str, slug: str) -> Path:
+ logger.info(f"Image URL: {image_url}")
if isinstance(image_url, str): # Handles String Types
image_url = image_url
@@ -64,7 +65,7 @@ def scrape_image(image_url: str, slug: str) -> Path:
if r.status_code == 200:
r.raw.decode_content = True
-
+ logger.info(f"File Name Suffix {filename.suffix}")
write_image(slug, r.raw, filename.suffix)
filename.unlink(missing_ok=True)
diff --git a/mealie/services/image/minify.py b/mealie/services/image/minify.py
index 6393cab69..ebc865da6 100644
--- a/mealie/services/image/minify.py
+++ b/mealie/services/image/minify.py
@@ -39,6 +39,8 @@ def minify_image(image_file: Path, force=False) -> ImageSizes:
min_dest = image_file.parent.joinpath("min-original.webp")
tiny_dest = image_file.parent.joinpath("tiny-original.webp")
+ cleanup_images = False
+
if min_dest.exists() and tiny_dest.exists() and org_dest.exists() and not force:
return
try:
diff --git a/mealie/services/migrations/_migration_base.py b/mealie/services/migrations/_migration_base.py
index 6eb35895f..9c1ab5bd9 100644
--- a/mealie/services/migrations/_migration_base.py
+++ b/mealie/services/migrations/_migration_base.py
@@ -9,7 +9,7 @@ from mealie.db.database import db
from mealie.schema.migration import MigrationImport
from mealie.schema.recipe import Recipe
from mealie.services.image import image
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
from mealie.utils.unzip import unpack_zip
from pydantic import BaseModel
@@ -144,7 +144,7 @@ class MigrationBase(BaseModel):
"""Calls the rewrite_alias function and the Cleaner.clean function on a
dictionary and returns the result unpacked into a Recipe object"""
recipe_dict = self.rewrite_alias(recipe_dict)
- recipe_dict = Cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
+ recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
return Recipe(**recipe_dict)
diff --git a/mealie/services/scraper/cleaner.py b/mealie/services/scraper/cleaner.py
index 4b8ca39c1..842113a18 100644
--- a/mealie/services/scraper/cleaner.py
+++ b/mealie/services/scraper/cleaner.py
@@ -1,4 +1,5 @@
import html
+import json
import re
from datetime import datetime, timedelta
from typing import List
@@ -6,157 +7,157 @@ from typing import List
from slugify import slugify
-class Cleaner:
- """A Namespace for utility function to clean recipe data extracted
- from a url and returns a dictionary that is ready for import into
- the database. Cleaner.clean is the main entrypoint
+def clean(recipe_data: dict, url=None) -> dict:
+ """Main entrypoint to clean a recipe extracted from the web
+    and format the data into an acceptable format for the database
+
+ Args:
+        recipe_data (dict): raw recipe dictionary
+
+ Returns:
+ dict: cleaned recipe dictionary
"""
+ recipe_data["description"] = clean_string(recipe_data.get("description", ""))
- @staticmethod
- def clean(recipe_data: dict, url=None) -> dict:
- """Main entrypoint to clean a recipe extracted from the web
- and format the data into an accectable format for the database
+ # Times
+ recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
+ recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
+ recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
+ recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))
- Args:
- recipe_data (dict): raw recipe dicitonary
+ recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
+ recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
+ recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
+ recipe_data["image"] = image(recipe_data.get("image"))
+ recipe_data["slug"] = slugify(recipe_data.get("name"))
+ recipe_data["orgURL"] = url
- Returns:
- dict: cleaned recipe dictionary
- """
- recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
+ return recipe_data
- # Times
- recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
- recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
- recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
- recipe_data["recipeCategory"] = Cleaner.category(recipe_data.get("recipeCategory", []))
- recipe_data["recipeYield"] = Cleaner.yield_amount(recipe_data.get("recipeYield"))
- recipe_data["recipeIngredient"] = Cleaner.ingredient(recipe_data.get("recipeIngredient"))
- recipe_data["recipeInstructions"] = Cleaner.instructions(recipe_data.get("recipeInstructions"))
- recipe_data["image"] = Cleaner.image(recipe_data.get("image"))
- recipe_data["slug"] = slugify(recipe_data.get("name"))
- recipe_data["orgURL"] = url
+def clean_string(text: str) -> str:
+ cleaned_text = html.unescape(text)
+ cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
+ cleaned_text = re.sub(" +", " ", cleaned_text)
+    cleaned_text = re.sub("</p>", "\n", cleaned_text)
+ cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
+ cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
+ return cleaned_text
- return recipe_data
- @staticmethod
- def category(category: str):
- if isinstance(category, str) and category != "":
- return [category]
- else:
- return []
+def category(category: str):
+ if isinstance(category, str) and category != "":
+ return [category]
+ else:
+ return []
- @staticmethod
- def html(raw_html):
- cleanr = re.compile("<.*?>")
- return re.sub(cleanr, "", raw_html)
- @staticmethod
- def image(image=None) -> str:
- if not image:
- return "no image"
- if isinstance(image, list):
- return image[0]
- elif isinstance(image, dict):
- return image["url"]
- elif isinstance(image, str):
- return image
- else:
- raise Exception(f"Unrecognised image URL format: {image}")
+def clean_html(raw_html):
+ cleanr = re.compile("<.*?>")
+ return re.sub(cleanr, "", raw_html)
- @staticmethod
- def instructions(instructions) -> List[dict]:
- if not instructions:
- return []
- if isinstance(instructions[0], list):
- instructions = instructions[0]
+def image(image=None) -> str:
+ if not image:
+ return "no image"
+ if isinstance(image, list):
+ return image[0]
+ elif isinstance(image, dict):
+ return image["url"]
+ elif isinstance(image, str):
+ return image
+ else:
+ raise Exception(f"Unrecognised image URL format: {image}")
- # One long string split by (possibly multiple) new lines
- if isinstance(instructions, str):
- return [{"text": Cleaner._instruction(line)} for line in instructions.splitlines() if line]
- # Plain strings in a list
- elif isinstance(instructions, list) and isinstance(instructions[0], str):
- return [{"text": Cleaner._instruction(step)} for step in instructions]
+def instructions(instructions) -> List[dict]:
+ try:
+ instructions = json.loads(instructions)
+ except Exception:
+ pass
- # Dictionaries (let's assume it's a HowToStep) in a list
- elif isinstance(instructions, list) and isinstance(instructions[0], dict):
- # Try List of Dictionary without "@type" or "type"
- if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
- return [{"text": Cleaner._instruction(step["text"])} for step in instructions]
+ if not instructions:
+ return []
+ if isinstance(instructions, list) and isinstance(instructions[0], list):
+ instructions = instructions[0]
+
+ # One long string split by (possibly multiple) new lines
+ if isinstance(instructions, str):
+ return [{"text": _instruction(line)} for line in instructions.splitlines() if line]
+
+ # Plain strings in a list
+ elif isinstance(instructions, list) and isinstance(instructions[0], str):
+ return [{"text": _instruction(step)} for step in instructions]
+
+ # Dictionaries (let's assume it's a HowToStep) in a list
+ elif isinstance(instructions, list) and isinstance(instructions[0], dict):
+ # Try List of Dictionary without "@type" or "type"
+ if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
+ return [{"text": _instruction(step["text"])} for step in instructions]
+
+ try:
+ # If HowToStep is under HowToSection
+ sectionSteps = []
+ for step in instructions:
+ if step["@type"] == "HowToSection":
+ [sectionSteps.append(item) for item in step["itemListElement"]]
+
+ if len(sectionSteps) > 0:
+ return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]
+
+ return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
+ except Exception as e:
+ print(e)
+ # Not "@type", try "type"
try:
- # If HowToStep is under HowToSection
- sectionSteps = []
- for step in instructions:
- if step["@type"] == "HowToSection":
- [sectionSteps.append(item) for item in step["itemListElement"]]
-
- if len(sectionSteps) > 0:
- return [
- {"text": Cleaner._instruction(step["text"])}
- for step in sectionSteps
- if step["@type"] == "HowToStep"
- ]
-
return [
- {"text": Cleaner._instruction(step["text"])}
+ {"text": _instruction(step["properties"]["text"])}
for step in instructions
- if step["@type"] == "HowToStep"
+ if step["type"].find("HowToStep") > -1
]
- except Exception as e:
- print(e)
- # Not "@type", try "type"
- try:
- return [
- {"text": Cleaner._instruction(step["properties"]["text"])}
- for step in instructions
- if step["type"].find("HowToStep") > -1
- ]
- except Exception:
- pass
+ except Exception:
+ pass
- else:
- raise Exception(f"Unrecognised instruction format: {instructions}")
+ else:
+ raise Exception(f"Unrecognised instruction format: {instructions}")
- @staticmethod
- def _instruction(line) -> str:
- clean_line = Cleaner.html(line.strip())
- # Some sites erroneously escape their strings on multiple levels
- while not clean_line == (clean_line := html.unescape(clean_line)):
- pass
- return clean_line
- @staticmethod
- def ingredient(ingredients: list) -> str:
- if ingredients:
- return [Cleaner.html(html.unescape(ing)) for ing in ingredients]
- else:
- return []
+def _instruction(line) -> str:
+ clean_line = clean_string(line.strip())
+ # Some sites erroneously escape their strings on multiple levels
+ while not clean_line == (clean_line := clean_string(clean_line)):
+ pass
+ return clean_line
- @staticmethod
- def yield_amount(yld) -> str:
- if isinstance(yld, list):
- return yld[-1]
- else:
- return yld
- @staticmethod
- def time(time_entry):
- if time_entry is None:
- return None
- elif isinstance(time_entry, timedelta):
- pretty_print_timedelta(time_entry)
- elif isinstance(time_entry, datetime):
- print(time_entry)
- elif isinstance(time_entry, str):
- if re.match("PT.*H.*M", time_entry):
- time_delta_object = parse_duration(time_entry)
- return pretty_print_timedelta(time_delta_object)
- else:
- return str(time_entry)
+def ingredient(ingredients: list) -> str:
+ if ingredients:
+ return [clean_string(ing) for ing in ingredients]
+ else:
+ return []
+
+
+def yield_amount(yld) -> str:
+ if isinstance(yld, list):
+ return yld[-1]
+ else:
+ return yld
+
+
+def clean_time(time_entry):
+ if time_entry is None:
+ return None
+ elif isinstance(time_entry, timedelta):
+ pretty_print_timedelta(time_entry)
+ elif isinstance(time_entry, datetime):
+ print(time_entry)
+ elif isinstance(time_entry, str):
+ if re.match("PT.*H.*M", time_entry):
+ time_delta_object = parse_duration(time_entry)
+ return pretty_print_timedelta(time_delta_object)
+ else:
+ return str(time_entry)
# ! TODO: Cleanup Code Below
diff --git a/mealie/services/scraper/scraper.py b/mealie/services/scraper/scraper.py
index 043a55ba0..a138d9931 100644
--- a/mealie/services/scraper/scraper.py
+++ b/mealie/services/scraper/scraper.py
@@ -1,17 +1,20 @@
import json
+from enum import Enum
+from typing import Any, Callable
import requests
-import scrape_schema_recipe
-from mealie.core import root_logger
+from fastapi import HTTPException, status
from mealie.core.config import app_dirs
-from mealie.schema.recipe import Recipe
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import Recipe, RecipeStep
from mealie.services.image.image import scrape_image
-from mealie.services.scraper import open_graph
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner, open_graph
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")
-logger = root_logger.get_logger()
+
+logger = get_logger()
def create_from_url(url: str) -> Recipe:
@@ -24,48 +27,130 @@ def create_from_url(url: str) -> Recipe:
Returns:
Recipe: Recipe Object
"""
- r = requests.get(url)
- new_recipe = extract_recipe_from_html(r.text, url)
- new_recipe = Cleaner.clean(new_recipe, url)
- new_recipe = download_image_for_recipe(new_recipe)
-
- return Recipe(**new_recipe)
-
-
-def extract_recipe_from_html(html: str, url: str) -> dict:
- scraped_recipes: list[dict]
-
- try:
- scraped_recipes = scrape_schema_recipe.scrape_url(url)
- except Exception as e:
- print(e)
- scraped_recipes = scrape_schema_recipe.loads(html, python_objects=True)
-
- dump_last_json(scraped_recipes)
-
- if scraped_recipes:
- new_recipe: dict = scraped_recipes[0]
- logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
- if not new_recipe:
- return "fail" # TODO: Return Better Error Here
-
- new_recipe = Cleaner.clean(new_recipe, url)
- else:
- new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
- logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
+ new_recipe = scrape_from_url(url)
+ logger.info(f"Image {new_recipe.image}")
+ new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
return new_recipe
-def download_image_for_recipe(recipe: dict) -> dict:
- try:
- img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
- recipe["image"] = img_path.name
- except Exception:
- recipe["image"] = "no image"
+class ParserErrors(str, Enum):
+ bad_recipe = "BAD_RECIPE_DATA"
+ no_recipe_data = "NO_RECIPE_DATA"
+ connection_error = "CONNECTION_ERROR"
- return recipe
+
+def extract_open_graph_values(url) -> Recipe:
+ r = requests.get(url)
+ recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
+
+ return Recipe(**recipe)
+
+
+def scrape_from_url(url: str) -> Recipe:
+    """Entry function for generating a recipe object from a url.
+    This will determine if a url can be parsed and raise an appropriate error keyword.
+    This keyword is used on the frontend to reference a localized string to present in the UI.
+
+ Args:
+ url (str): String Representing the URL
+
+ Raises:
+ HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
+
+ Returns:
+ Recipe: Recipe Model
+ """
+ try:
+ scraped_schema = scrape_me(url)
+ except (WebsiteNotImplementedError, AttributeError):
+ try:
+ scraped_schema = scrape_me(url, wild_mode=True)
+ except (NoSchemaFoundInWildMode, AttributeError):
+ recipe = extract_open_graph_values(url)
+ if recipe.name != "":
+ return recipe
+ raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.bad_recipe.value})
+
+ except ConnectionError:
+ raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.connection_error.value})
+
+ try:
+ instruct = scraped_schema.instructions()
+ except Exception:
+ instruct = []
+
+ try:
+ ing = scraped_schema.ingredients()
+ except Exception:
+ ing = []
+
+ if not instruct and not ing:
+ raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.no_recipe_data.value})
+ else:
+ return clean_scraper(scraped_schema, url)
+
+
+def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
+ def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
+ value = default
+ try:
+ value = func_call()
+ except Exception:
+ logger.error(f"Error parsing recipe func_call for '{get_attr}'")
+
+ if value == default:
+ try:
+ value = scraped_data.schema.data.get(get_attr)
+ except Exception:
+ logger.error(f"Error parsing recipe attribute '{get_attr}'")
+
+ if clean_func:
+ value = clean_func(value)
+
+ return value
+
+ def get_instructions() -> list[dict]:
+ instruction_as_text = try_get_default(
+ scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
+ )
+
+ logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+ instruction_as_text = cleaner.instructions(instruction_as_text)
+
+ logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+ try:
+ return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
+ except TypeError:
+ return []
+
+ return Recipe(
+ name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
+ slug="",
+ image=try_get_default(scraped_data.image, "image", None),
+ description=try_get_default(None, "description", "", cleaner.clean_string),
+ recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
+ recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
+ recipe_instructions=get_instructions(),
+ total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
+ prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
+ perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+ org_url=url,
+ )
+
+
+def download_image_for_recipe(slug, image_url) -> str:
+ img_name = None
+ try:
+ img_path = scrape_image(image_url, slug)
+ img_name = img_path.name
+ except Exception as e:
+ logger.error(f"Error Scraping Image: {e}")
+ img_name = None
+
+ return img_name or "no image"
def dump_last_json(recipe_data: dict):
diff --git a/tests/unit_tests/test_cleaner.py b/tests/unit_tests/test_cleaner.py
index ed1ea0d11..5a6e73f23 100644
--- a/tests/unit_tests/test_cleaner.py
+++ b/tests/unit_tests/test_cleaner.py
@@ -2,7 +2,7 @@ import json
import re
import pytest
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
from mealie.services.scraper.scraper import extract_recipe_from_html
from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES
@@ -39,23 +39,23 @@ url_validation_regex = re.compile(
],
)
def test_cleaner_clean(json_file, num_steps):
- recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+ recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
assert len(recipe_data["recipeInstructions"]) == num_steps
def test_clean_category():
- assert Cleaner.category("my-category") == ["my-category"]
+ assert cleaner.category("my-category") == ["my-category"]
-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"
def test_clean_image():
- assert Cleaner.image(None) == "no image"
- assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
- assert Cleaner.image({"url": "My URL!"}) == "My URL!"
- assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+ assert cleaner.image(None) == "no image"
+ assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+ assert cleaner.image({"url": "My URL!"}) == "My URL!"
+ assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
@pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
],
)
def test_cleaner_instructions(instructions):
- assert Cleaner.instructions(instructions) == [
+ assert cleaner.instructions(instructions) == [
{"text": "A"},
{"text": "B"},
{"text": "C"},
@@ -94,6 +94,6 @@ def test_html_with_recipe_data():
def test_time_cleaner():
my_time_delta = "PT2H30M"
- return_delta = Cleaner.time(my_time_delta)
+ return_delta = cleaner.clean_time(my_time_delta)
assert return_delta == "2 Hours 30 Minutes"
diff --git a/tests/unit_tests/test_recipe_parser.py b/tests/unit_tests/test_recipe_parser.py
new file mode 100644
index 000000000..878f4b25c
--- /dev/null
+++ b/tests/unit_tests/test_recipe_parser.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+import pytest
+from mealie.services.scraper import scraper
+
+
+@dataclass
+class RecipeSiteTestCase:
+ url: str
+ expected_slug: str
+ num_ingredients: int
+ num_steps: int
+
+
+test_cases = [
+ RecipeSiteTestCase(
+ url="https://www.seriouseats.com/taiwanese-three-cup-chicken-san-bei-gi-recipe",
+ expected_slug="taiwanese-three-cup-chicken-san-bei-ji-recipe",
+ num_ingredients=10,
+ num_steps=3,
+ ),
+ RecipeSiteTestCase(
+ url="https://www.rezeptwelt.de/backen-herzhaft-rezepte/schinken-kaese-waffeln-ohne-viel-schnickschnack/4j0bkiig-94d4d-106529-cfcd2-is97x2ml",
+ expected_slug="schinken-kase-waffeln-ohne-viel-schnickschnack",
+ num_ingredients=7,
+ num_steps=1, # Malformed JSON Data, can't parse steps just get one string
+ ),
+ RecipeSiteTestCase(
+ url="https://cookpad.com/us/recipes/5544853-sous-vide-smoked-beef-ribs",
+ expected_slug="sous-vide-smoked-beef-ribs",
+ num_ingredients=7,
+ num_steps=12,
+ ),
+ RecipeSiteTestCase(
+ url="https://www.greatbritishchefs.com/recipes/jam-roly-poly-recipe",
+ expected_slug="jam-roly-poly-with-custard",
+ num_ingredients=13,
+ num_steps=9,
+ ),
+ RecipeSiteTestCase(
+ url="https://recipes.anovaculinary.com/recipe/sous-vide-shrimp",
+ expected_slug="sous-vide-shrimp",
+ num_ingredients=5,
+ num_steps=0,
+ ),
+ RecipeSiteTestCase(
+ url="https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza",
+ expected_slug="detroit-style-pepperoni-pizza",
+ num_ingredients=8,
+ num_steps=5,
+ ),
+]
+
+
+@pytest.mark.parametrize("recipe_test_data", test_cases)
+def test_recipe_parser(recipe_test_data: RecipeSiteTestCase):
+ recipe = scraper.create_from_url(recipe_test_data.url)
+
+ assert recipe.slug == recipe_test_data.expected_slug
+ assert len(recipe.recipe_instructions) == recipe_test_data.num_steps
+ assert len(recipe.recipe_ingredient) == recipe_test_data.num_ingredients
+ assert recipe.org_url == recipe_test_data.url