From b65445d4b1eb2d277f3af95a70b20872729d401c Mon Sep 17 00:00:00 2001
From: hay-kot
Date: Wed, 9 Jun 2021 12:03:39 -0800
Subject: [PATCH] rewrite scraper to use new library

---
 frontend/src/api/recipe.js                    |   6 +
 frontend/src/components/UI/TheRecipeFab.vue   |  18 +-
 frontend/src/pages/Recipe/ScraperDebugger.vue |  62 +++++
 frontend/src/pages/Recipe/ViewRecipe.vue      |   4 +-
 frontend/src/routes/recipes.js                |   2 +
 mealie/routes/recipe/recipe_crud_routes.py    |   6 +
 mealie/services/image/image.py                |   3 +-
 mealie/services/image/minify.py               |   2 +
 mealie/services/migrations/_migration_base.py |   4 +-
 mealie/services/scraper/cleaner.py            | 253 +++++++++---------
 mealie/services/scraper/scraper.py            | 171 +++++++++---
 tests/unit_tests/test_cleaner.py              |  22 +-
 tests/unit_tests/test_recipe_parser.py        |  62 +++++
 13 files changed, 421 insertions(+), 194 deletions(-)
 create mode 100644 frontend/src/pages/Recipe/ScraperDebugger.vue
 create mode 100644 tests/unit_tests/test_recipe_parser.py

diff --git a/frontend/src/api/recipe.js b/frontend/src/api/recipe.js
index e5afffd39..c05189d27 100644
--- a/frontend/src/api/recipe.js
+++ b/frontend/src/api/recipe.js
@@ -12,6 +12,7 @@ const recipeURLs = {
   allRecipesByCategory: prefix + "category",
   create: prefix + "create",
   createByURL: prefix + "create-url",
+  testParseURL: prefix + "test-scrape-url",
   recipe: slug => prefix + slug,
   update: slug => prefix + slug,
   delete: slug => prefix + slug,
@@ -183,4 +184,9 @@ export const recipeAPI = {
     const response = await apiReq.delete(API_ROUTES.recipesSlugCommentsId(slug, id));
     return response.data;
   },
+
+  async testScrapeURL(url) {
+    const response = await apiReq.post(recipeURLs.testParseURL, { url: url });
+    return response.data;
+  },
 };
diff --git a/frontend/src/components/UI/TheRecipeFab.vue b/frontend/src/components/UI/TheRecipeFab.vue
index 8f4e5f951..030aea6ad 100644
--- a/frontend/src/components/UI/TheRecipeFab.vue
+++ b/frontend/src/components/UI/TheRecipeFab.vue
@@ -54,13 +54,15 @@
 [template hunk garbled in extraction — recoverable content: three template lines are replaced by a
 "View Scraped Data" link with an mdi-external-link icon, pointing at the new scraper debugger page]
@@ -100,9 +102,7 @@
 [template hunk garbled in extraction; body not recoverable]
 [the 62-line new-file diff for frontend/src/pages/Recipe/ScraperDebugger.vue was stripped in
 extraction and is not recoverable]
diff --git a/frontend/src/pages/Recipe/ViewRecipe.vue b/frontend/src/pages/Recipe/ViewRecipe.vue
index 938377bfa..b6138de51 100644
--- a/frontend/src/pages/Recipe/ViewRecipe.vue
+++ b/frontend/src/pages/Recipe/ViewRecipe.vue
@@ -63,7 +63,7 @@ import RecipeViewer from "@/components/Recipe/RecipeViewer";
 import PrintView from "@/components/Recipe/PrintView";
 import RecipeEditor from "@/components/Recipe/RecipeEditor";
 import RecipeTimeCard from "@/components/Recipe/RecipeTimeCard.vue";
-import EditorButtonRow from "@/components/Recipe/EditorButtonRow";
+import EditorButtonRow from "@/components/Recipe/EditorButtonRow.vue";
 import NoRecipe from "@/components/Fallbacks/NoRecipe";
 import { user } from "@/mixins/user";
 import { router } from "@/routes";
@@ -133,7 +133,7 @@ export default {
   },
 
   watch: {
-    $route: function() {
+    $route: function () {
       this.getRecipeDetails();
     },
   },
diff --git a/frontend/src/routes/recipes.js b/frontend/src/routes/recipes.js
index d21588772..25140f3c6 100644
--- a/frontend/src/routes/recipes.js
+++ b/frontend/src/routes/recipes.js
@@ -1,5 +1,6 @@
 const ViewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ViewRecipe");
 const NewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/NewRecipe");
+const ScraperDebugger = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ScraperDebugger");
 const CustomPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CustomPage");
 const AllRecipes = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/AllRecipes");
 const CategoryTagPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CategoryTagPage");
@@ -9,6 +10,7 @@ import { api } from "@/api";
 export const recipeRoutes = [
   // Recipes
   { path: "/recipes/all", component: AllRecipes },
+  { path: "/recipes/debugger", component: ScraperDebugger },
   { path: "/user/:id/favorites", component: Favorites },
   { path: "/recipes/tag/:tag", component: CategoryTagPage },
   { path: "/recipes/tag", component: CategoryTagPage },
diff --git a/mealie/routes/recipe/recipe_crud_routes.py b/mealie/routes/recipe/recipe_crud_routes.py
index 60a49789b..d4fbdcf35 100644
--- a/mealie/routes/recipe/recipe_crud_routes.py
+++ b/mealie/routes/recipe/recipe_crud_routes.py
@@ -13,6 +13,7 @@ from mealie.services.events import create_recipe_event
 from mealie.services.image.image import scrape_image, write_image
 from mealie.services.recipe.media import check_assets, delete_assets
 from mealie.services.scraper.scraper import create_from_url
+from scrape_schema_recipe import scrape_url
 from slugify import slugify
 from sqlalchemy.orm.session import Session
@@ -41,6 +42,11 @@ def create_from_json(
     return recipe.slug
 
 
+@router.post("/test-scrape-url", dependencies=[Depends(get_current_user)])
+def test_parse_recipe_url(url: RecipeURLIn):
+    return scrape_url(url.url)
+
+
 @router.post("/create-url", status_code=201, response_model=str)
 def parse_recipe_url(
     background_tasks: BackgroundTasks,
diff --git a/mealie/services/image/image.py b/mealie/services/image/image.py
index 9bcee6e7b..a8a3e45df 100644
--- a/mealie/services/image/image.py
+++ b/mealie/services/image/image.py
@@ -42,6 +42,7 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:
 
 
 def scrape_image(image_url: str, slug: str) -> Path:
+    logger.info(f"Image URL: {image_url}")
     if isinstance(image_url, str):  # Handles String Types
         image_url = image_url
@@ -64,7 +65,7 @@
     if r.status_code == 200:
         r.raw.decode_content = True
-
+        logger.info(f"File Name Suffix {filename.suffix}")
         write_image(slug, r.raw, filename.suffix)
 
         filename.unlink(missing_ok=True)
diff --git a/mealie/services/image/minify.py b/mealie/services/image/minify.py
index 6393cab69..ebc865da6 100644
--- a/mealie/services/image/minify.py
+++ b/mealie/services/image/minify.py
@@ -39,6 +39,8 @@ def minify_image(image_file: Path, force=False) -> ImageSizes:
     min_dest = image_file.parent.joinpath("min-original.webp")
     tiny_dest = image_file.parent.joinpath("tiny-original.webp")
 
+    cleanup_images = False
+
     if min_dest.exists() and tiny_dest.exists() and org_dest.exists() and not force:
         return
     try:
diff --git a/mealie/services/migrations/_migration_base.py b/mealie/services/migrations/_migration_base.py
index 6eb35895f..9c1ab5bd9 100644
--- a/mealie/services/migrations/_migration_base.py
+++ b/mealie/services/migrations/_migration_base.py
@@ -9,7 +9,7 @@ from mealie.db.database import db
 from mealie.schema.migration import MigrationImport
 from mealie.schema.recipe import Recipe
 from mealie.services.image import image
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.utils.unzip import unpack_zip
 from pydantic import BaseModel
@@ -144,7 +144,7 @@ class MigrationBase(BaseModel):
         """Calls the rewrite_alias function and the Cleaner.clean function on a
         dictionary and returns the result unpacked into a Recipe object"""
         recipe_dict = self.rewrite_alias(recipe_dict)
-        recipe_dict = Cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
+        recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
 
         return Recipe(**recipe_dict)
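
Reviewers who want to exercise the new debug endpoint without the frontend can post to it directly.
A minimal sketch — the base URL, port, and token are hypothetical (the route requires an
authenticated user via get_current_user, and the exact prefix depends on how the recipe router is
mounted):

    import requests

    # Hypothetical local instance and token -- adjust for your deployment.
    BASE_URL = "http://localhost:9000/api/recipes"
    TOKEN = "your-api-token"

    response = requests.post(
        f"{BASE_URL}/test-scrape-url",
        json={"url": "https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza"},
        headers={"Authorization": f"Bearer {TOKEN}"},
    )

    # scrape_url returns the raw schema.org recipe data found on the page, before
    # any cleaning -- the same payload the new ScraperDebugger page displays.
    print(response.json())
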
diff --git a/mealie/services/scraper/cleaner.py b/mealie/services/scraper/cleaner.py
index 4b8ca39c1..842113a18 100644
--- a/mealie/services/scraper/cleaner.py
+++ b/mealie/services/scraper/cleaner.py
@@ -1,4 +1,5 @@
 import html
+import json
 import re
 from datetime import datetime, timedelta
 from typing import List
@@ -6,157 +7,157 @@ from typing import List
 from slugify import slugify
 
 
-class Cleaner:
-    """A Namespace for utility function to clean recipe data extracted
-    from a url and returns a dictionary that is ready for import into
-    the database. Cleaner.clean is the main entrypoint
-    """
-
-    @staticmethod
-    def clean(recipe_data: dict, url=None) -> dict:
-        """Main entrypoint to clean a recipe extracted from the web
-        and format the data into an accectable format for the database
-
-        Args:
-            recipe_data (dict): raw recipe dicitonary
-
-        Returns:
-            dict: cleaned recipe dictionary
-        """
-        recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
-
-        # Times
-        recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
-        recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
-        recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
-        recipe_data["recipeCategory"] = Cleaner.category(recipe_data.get("recipeCategory", []))
-        recipe_data["recipeYield"] = Cleaner.yield_amount(recipe_data.get("recipeYield"))
-        recipe_data["recipeIngredient"] = Cleaner.ingredient(recipe_data.get("recipeIngredient"))
-        recipe_data["recipeInstructions"] = Cleaner.instructions(recipe_data.get("recipeInstructions"))
-        recipe_data["image"] = Cleaner.image(recipe_data.get("image"))
-        recipe_data["slug"] = slugify(recipe_data.get("name"))
-        recipe_data["orgURL"] = url
-
-        return recipe_data
+def clean(recipe_data: dict, url=None) -> dict:
+    """Main entrypoint to clean a recipe extracted from the web
+    and format the data into an acceptable format for the database
+
+    Args:
+        recipe_data (dict): raw recipe dictionary
+
+    Returns:
+        dict: cleaned recipe dictionary
+    """
+    recipe_data["description"] = clean_string(recipe_data.get("description", ""))
+
+    # Times
+    recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
+    recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
+    recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
+    recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))
+
+    recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
+    recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
+    recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
+    recipe_data["image"] = image(recipe_data.get("image"))
+    recipe_data["slug"] = slugify(recipe_data.get("name"))
+    recipe_data["orgURL"] = url
+
+    return recipe_data
+
+
+def clean_string(text: str) -> str:
+    cleaned_text = html.unescape(text)
+    cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
+    cleaned_text = re.sub(" +", " ", cleaned_text)
+    cleaned_text = re.sub("</p>", "\n", cleaned_text)
+    cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
+    cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
+    return cleaned_text
 
-    @staticmethod
-    def category(category: str):
-        if isinstance(category, str) and category != "":
-            return [category]
-        else:
-            return []
+def category(category: str):
+    if isinstance(category, str) and category != "":
+        return [category]
+    else:
+        return []
 
-    @staticmethod
-    def html(raw_html):
-        cleanr = re.compile("<.*?>")
-        return re.sub(cleanr, "", raw_html)
+def clean_html(raw_html):
+    cleanr = re.compile("<.*?>")
+    return re.sub(cleanr, "", raw_html)
 
-    @staticmethod
-    def image(image=None) -> str:
-        if not image:
-            return "no image"
-        if isinstance(image, list):
-            return image[0]
-        elif isinstance(image, dict):
-            return image["url"]
-        elif isinstance(image, str):
-            return image
-        else:
-            raise Exception(f"Unrecognised image URL format: {image}")
+def image(image=None) -> str:
+    if not image:
+        return "no image"
+    if isinstance(image, list):
+        return image[0]
+    elif isinstance(image, dict):
+        return image["url"]
+    elif isinstance(image, str):
+        return image
+    else:
+        raise Exception(f"Unrecognised image URL format: {image}")
 
-    @staticmethod
-    def instructions(instructions) -> List[dict]:
-        if not instructions:
-            return []
-
-        if isinstance(instructions[0], list):
-            instructions = instructions[0]
+def instructions(instructions) -> List[dict]:
+    try:
+        # Some sites serialize the instruction list as a JSON string
+        instructions = json.loads(instructions)
+    except Exception:
+        pass
+
+    if not instructions:
+        return []
+
+    if isinstance(instructions, list) and isinstance(instructions[0], list):
+        instructions = instructions[0]
 
-        # One long string split by (possibly multiple) new lines
-        if isinstance(instructions, str):
-            return [{"text": Cleaner._instruction(line)} for line in instructions.splitlines() if line]
+    # One long string split by (possibly multiple) new lines
+    if isinstance(instructions, str):
+        return [{"text": _instruction(line)} for line in instructions.splitlines() if line]
 
-        # Plain strings in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], str):
-            return [{"text": Cleaner._instruction(step)} for step in instructions]
+    # Plain strings in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], str):
+        return [{"text": _instruction(step)} for step in instructions]
 
-        # Dictionaries (let's assume it's a HowToStep) in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], dict):
-            # Try List of Dictionary without "@type" or "type"
-            if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
-                return [{"text": Cleaner._instruction(step["text"])} for step in instructions]
+    # Dictionaries (let's assume it's a HowToStep) in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], dict):
+        # Try List of Dictionary without "@type" or "type"
+        if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
+            return [{"text": _instruction(step["text"])} for step in instructions]
 
-            try:
-                # If HowToStep is under HowToSection
-                sectionSteps = []
-                for step in instructions:
-                    if step["@type"] == "HowToSection":
-                        [sectionSteps.append(item) for item in step["itemListElement"]]
-
-                if len(sectionSteps) > 0:
-                    return [
-                        {"text": Cleaner._instruction(step["text"])}
-                        for step in sectionSteps
-                        if step["@type"] == "HowToStep"
-                    ]
-
-                return [
-                    {"text": Cleaner._instruction(step["text"])}
-                    for step in instructions
-                    if step["@type"] == "HowToStep"
-                ]
-            except Exception as e:
-                print(e)
-                # Not "@type", try "type"
-                try:
-                    return [
-                        {"text": Cleaner._instruction(step["properties"]["text"])}
-                        for step in instructions
-                        if step["type"].find("HowToStep") > -1
-                    ]
-                except Exception:
-                    pass
-
-        else:
-            raise Exception(f"Unrecognised instruction format: {instructions}")
+        try:
+            # If HowToStep is under HowToSection
+            sectionSteps = []
+            for step in instructions:
+                if step["@type"] == "HowToSection":
+                    sectionSteps.extend(step["itemListElement"])
+
+            if len(sectionSteps) > 0:
+                return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]
+
+            return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
+        except Exception as e:
+            print(e)
+            # Not "@type", try "type"
+            try:
+                return [
+                    {"text": _instruction(step["properties"]["text"])}
+                    for step in instructions
+                    if step["type"].find("HowToStep") > -1
+                ]
+            except Exception:
+                pass
+
+    else:
+        raise Exception(f"Unrecognised instruction format: {instructions}")
 
-    @staticmethod
-    def _instruction(line) -> str:
-        clean_line = Cleaner.html(line.strip())
-        # Some sites erroneously escape their strings on multiple levels
-        while not clean_line == (clean_line := html.unescape(clean_line)):
-            pass
-        return clean_line
+def _instruction(line) -> str:
+    clean_line = clean_string(line.strip())
+    # Some sites erroneously escape their strings on multiple levels
+    while not clean_line == (clean_line := clean_string(clean_line)):
+        pass
+    return clean_line
 
-    @staticmethod
-    def ingredient(ingredients: list) -> str:
-        if ingredients:
-            return [Cleaner.html(html.unescape(ing)) for ing in ingredients]
-        else:
-            return []
+def ingredient(ingredients: list) -> list:
+    if ingredients:
+        return [clean_string(ing) for ing in ingredients]
+    else:
+        return []
 
-    @staticmethod
-    def yield_amount(yld) -> str:
-        if isinstance(yld, list):
-            return yld[-1]
-        else:
-            return yld
+def yield_amount(yld) -> str:
+    if isinstance(yld, list):
+        return yld[-1]
+    else:
+        return yld
 
-    @staticmethod
-    def time(time_entry):
-        if time_entry is None:
-            return None
-        elif isinstance(time_entry, timedelta):
-            pretty_print_timedelta(time_entry)
-        elif isinstance(time_entry, datetime):
-            print(time_entry)
-        elif isinstance(time_entry, str):
-            if re.match("PT.*H.*M", time_entry):
-                time_delta_object = parse_duration(time_entry)
-                return pretty_print_timedelta(time_delta_object)
-            else:
-                return str(time_entry)
+def clean_time(time_entry):
+    if time_entry is None:
+        return None
+    elif isinstance(time_entry, timedelta):
+        return pretty_print_timedelta(time_entry)
+    elif isinstance(time_entry, datetime):
+        print(time_entry)
+    elif isinstance(time_entry, str):
+        if re.match("PT.*H.*M", time_entry):
+            time_delta_object = parse_duration(time_entry)
+            return pretty_print_timedelta(time_delta_object)
+        else:
+            return str(time_entry)
 
 
 # ! TODO: Cleanup Code Below
diff --git a/mealie/services/scraper/scraper.py b/mealie/services/scraper/scraper.py
index 043a55ba0..a138d9931 100644
--- a/mealie/services/scraper/scraper.py
+++ b/mealie/services/scraper/scraper.py
@@ -1,17 +1,20 @@
 import json
+from enum import Enum
+from typing import Any, Callable
 
 import requests
-import scrape_schema_recipe
-from mealie.core import root_logger
+from fastapi import HTTPException, status
 from mealie.core.config import app_dirs
-from mealie.schema.recipe import Recipe
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import Recipe, RecipeStep
 from mealie.services.image.image import scrape_image
-from mealie.services.scraper import open_graph
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner, open_graph
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
 
 LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")
 
-logger = root_logger.get_logger()
+logger = get_logger()
 
 
 def create_from_url(url: str) -> Recipe:
@@ -24,48 +27,130 @@
     Returns:
         Recipe: Recipe Object
     """
-    r = requests.get(url)
-    new_recipe = extract_recipe_from_html(r.text, url)
-    new_recipe = Cleaner.clean(new_recipe, url)
-    new_recipe = download_image_for_recipe(new_recipe)
-
-    return Recipe(**new_recipe)
-
-
-def extract_recipe_from_html(html: str, url: str) -> dict:
-    scraped_recipes: list[dict]
-
-    try:
-        scraped_recipes = scrape_schema_recipe.scrape_url(url)
-    except Exception as e:
-        print(e)
-        scraped_recipes = scrape_schema_recipe.loads(html, python_objects=True)
-
-    dump_last_json(scraped_recipes)
-
-    if scraped_recipes:
-        new_recipe: dict = scraped_recipes[0]
-        logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
-        if not new_recipe:
-            return "fail"  # TODO: Return Better Error Here
-
-        new_recipe = Cleaner.clean(new_recipe, url)
-    else:
-        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
-        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
+    new_recipe = scrape_from_url(url)
+    logger.info(f"Image {new_recipe.image}")
+    new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
 
     return new_recipe
 
 
-def download_image_for_recipe(recipe: dict) -> dict:
-    try:
-        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
-        recipe["image"] = img_path.name
-    except Exception:
-        recipe["image"] = "no image"
-
-    return recipe
+class ParserErrors(str, Enum):
+    bad_recipe = "BAD_RECIPE_DATA"
+    no_recipe_data = "NO_RECIPE_DATA"
+    connection_error = "CONNECTION_ERROR"
+
+
+def extract_open_graph_values(url) -> Recipe:
+    r = requests.get(url)
+    recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
+
+    return Recipe(**recipe)
+
+
+def scrape_from_url(url: str) -> Recipe:
+    """Entry function to generate a recipe object from a url. This determines
+    whether the url can be parsed and, if not, raises an appropriate error keyword.
+    The keyword is used on the frontend to reference a localized string to present on the UI.
+
+    Args:
+        url (str): String Representing the URL
+
+    Raises:
+        HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
+
+    Returns:
+        Recipe: Recipe Model
+    """
+    try:
+        scraped_schema = scrape_me(url)
+    except (WebsiteNotImplementedError, AttributeError):
+        try:
+            scraped_schema = scrape_me(url, wild_mode=True)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            recipe = extract_open_graph_values(url)
+            if recipe.name != "":
+                return recipe
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.bad_recipe.value})
+
+    except ConnectionError:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.connection_error.value})
+
+    try:
+        instruct = scraped_schema.instructions()
+    except Exception:
+        instruct = []
+
+    try:
+        ing = scraped_schema.ingredients()
+    except Exception:
+        ing = []
+
+    if not instruct and not ing:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.no_recipe_data.value})
+    else:
+        return clean_scraper(scraped_schema, url)
+
+
+def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
+    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
+        value = default
+        try:
+            if func_call:
+                value = func_call()
+        except Exception:
+            logger.error(f"Error parsing recipe func_call for '{get_attr}'")
+
+        if value == default:
+            try:
+                value = scraped_data.schema.data.get(get_attr)
+            except Exception:
+                logger.error(f"Error parsing recipe attribute '{get_attr}'")
+
+        if clean_func:
+            value = clean_func(value)
+
+        return value
+
+    def get_instructions() -> list[RecipeStep]:
+        instruction_as_text = try_get_default(
+            scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
+        )
+
+        logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        instruction_as_text = cleaner.instructions(instruction_as_text)
+
+        logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        try:
+            return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
+        except TypeError:
+            return []
+
+    return Recipe(
+        name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
+        slug="",
+        image=try_get_default(scraped_data.image, "image", None),
+        description=try_get_default(None, "description", "", cleaner.clean_string),
+        recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
+        recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
+        recipe_instructions=get_instructions(),
+        total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
+        prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
+        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+        org_url=url,
+    )
+
+
+def download_image_for_recipe(slug, image_url) -> str:
+    img_name = None
+    try:
+        img_path = scrape_image(image_url, slug)
+        img_name = img_path.name
+    except Exception as e:
+        logger.error(f"Error Scraping Image: {e}")
+        img_name = None
+
+    return img_name or "no image"
 
 
 def dump_last_json(recipe_data: dict):
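
The try_get_default helper is the heart of the new defensive extraction: prefer the library's
accessor, fall back to the raw schema.org key, then normalize with a cleaner function. A standalone
sketch of the same pattern — schema_data and flaky_title are hypothetical stand-ins for
scraped_data.schema.data and a recipe_scrapers accessor:

    from typing import Any, Callable

    # Hypothetical stand-in for scraped_data.schema.data
    schema_data = {"name": "Detroit-Style Pepperoni Pizza", "recipeYield": "8 servings"}

    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
        value = default
        try:
            if func_call:
                value = func_call()          # first choice: the scraper's accessor
        except Exception:
            pass                             # the real code logs and falls through

        if value == default:
            value = schema_data.get(get_attr, default)  # fallback: raw schema.org key

        if clean_func:
            value = clean_func(value)        # finally, normalize the raw value
        return value

    def flaky_title():
        raise AttributeError("site markup changed")  # simulates a failing accessor

    print(try_get_default(flaky_title, "name", "No Name Found", str.strip))
    # -> "Detroit-Style Pepperoni Pizza", recovered from the raw schema data
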
diff --git a/tests/unit_tests/test_cleaner.py b/tests/unit_tests/test_cleaner.py
index ed1ea0d11..5a6e73f23 100644
--- a/tests/unit_tests/test_cleaner.py
+++ b/tests/unit_tests/test_cleaner.py
@@ -2,7 +2,7 @@ import json
 import re
 
 import pytest
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraper import extract_recipe_from_html
 from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES
@@ -39,23 +39,23 @@ url_validation_regex = re.compile(
     ],
 )
 def test_cleaner_clean(json_file, num_steps):
-    recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+    recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
     assert len(recipe_data["recipeInstructions"]) == num_steps
 
 
 def test_clean_category():
-    assert Cleaner.category("my-category") == ["my-category"]
+    assert cleaner.category("my-category") == ["my-category"]
 
 
-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"
 
 
 def test_clean_image():
-    assert Cleaner.image(None) == "no image"
-    assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
-    assert Cleaner.image({"url": "My URL!"}) == "My URL!"
-    assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+    assert cleaner.image(None) == "no image"
+    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+    assert cleaner.image({"url": "My URL!"}) == "My URL!"
+    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
 
 
 @pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
     ],
 )
 def test_cleaner_instructions(instructions):
-    assert Cleaner.instructions(instructions) == [
+    assert cleaner.instructions(instructions) == [
        {"text": "A"},
        {"text": "B"},
        {"text": "C"},
@@ -94,6 +94,6 @@ def test_html_with_recipe_data():
 def test_time_cleaner():
 
     my_time_delta = "PT2H30M"
-    return_delta = Cleaner.time(my_time_delta)
+    return_delta = cleaner.clean_time(my_time_delta)
 
     assert return_delta == "2 Hours 30 Minutes"
diff --git a/tests/unit_tests/test_recipe_parser.py b/tests/unit_tests/test_recipe_parser.py
new file mode 100644
index 000000000..878f4b25c
--- /dev/null
+++ b/tests/unit_tests/test_recipe_parser.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+import pytest
+from mealie.services.scraper import scraper
+
+
+@dataclass
+class RecipeSiteTestCase:
+    url: str
+    expected_slug: str
+    num_ingredients: int
+    num_steps: int
+
+
+test_cases = [
+    RecipeSiteTestCase(
+        url="https://www.seriouseats.com/taiwanese-three-cup-chicken-san-bei-gi-recipe",
+        expected_slug="taiwanese-three-cup-chicken-san-bei-ji-recipe",
+        num_ingredients=10,
+        num_steps=3,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.rezeptwelt.de/backen-herzhaft-rezepte/schinken-kaese-waffeln-ohne-viel-schnickschnack/4j0bkiig-94d4d-106529-cfcd2-is97x2ml",
+        expected_slug="schinken-kase-waffeln-ohne-viel-schnickschnack",
+        num_ingredients=7,
+        num_steps=1,  # Malformed JSON data; the steps can't be parsed, so we get one long string
+    ),
+    RecipeSiteTestCase(
+        url="https://cookpad.com/us/recipes/5544853-sous-vide-smoked-beef-ribs",
+        expected_slug="sous-vide-smoked-beef-ribs",
+        num_ingredients=7,
+        num_steps=12,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.greatbritishchefs.com/recipes/jam-roly-poly-recipe",
+        expected_slug="jam-roly-poly-with-custard",
+        num_ingredients=13,
+        num_steps=9,
+    ),
+    RecipeSiteTestCase(
+        url="https://recipes.anovaculinary.com/recipe/sous-vide-shrimp",
+        expected_slug="sous-vide-shrimp",
+        num_ingredients=5,
+        num_steps=0,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza",
+        expected_slug="detroit-style-pepperoni-pizza",
+        num_ingredients=8,
+        num_steps=5,
+    ),
+]
+
+
+@pytest.mark.parametrize("recipe_test_data", test_cases)
+def test_recipe_parser(recipe_test_data: RecipeSiteTestCase):
+    recipe = scraper.create_from_url(recipe_test_data.url)
+
+    assert recipe.slug == recipe_test_data.expected_slug
+    assert len(recipe.recipe_instructions) == recipe_test_data.num_steps
+    assert len(recipe.recipe_ingredient) == recipe_test_data.num_ingredients
+    assert recipe.org_url == recipe_test_data.url