rewrite scraper to use new library

hay-kot 2021-06-09 12:03:39 -08:00
commit b65445d4b1
13 changed files with 421 additions and 194 deletions

View file

@@ -12,6 +12,7 @@ const recipeURLs = {
   allRecipesByCategory: prefix + "category",
   create: prefix + "create",
   createByURL: prefix + "create-url",
+  testParseURL: prefix + "test-scrape-url",
   recipe: slug => prefix + slug,
   update: slug => prefix + slug,
   delete: slug => prefix + slug,
@@ -183,4 +184,9 @@ export const recipeAPI = {
     const response = await apiReq.delete(API_ROUTES.recipesSlugCommentsId(slug, id));
     return response.data;
   },
+
+  async testScrapeURL(url) {
+    const response = await apiReq.post(recipeURLs.testParseURL, { url: url });
+    return response.data;
+  },
 };

View file

@@ -54,13 +54,15 @@
         </a>
       </div>
       <div class="d-flex justify-end">
-        <TheDownloadBtn download-url="/api/debug/last-recipe-json">
-          <template v-slot:default="{ downloadFile }">
-            <v-btn class="ml-auto mt-4" outlined color="white" @click="downloadFile">
-              <v-icon left> mdi-download </v-icon> {{ $t("about.download-recipe-json") }}
-            </v-btn>
-          </template>
-        </TheDownloadBtn>
+        <v-btn
+          white
+          outlined
+          :to="{ path: '/recipes/debugger', query: { test_url: recipeURL } }"
+          @click="addRecipe = false"
+        >
+          <v-icon> mdi-external-link </v-icon>
+          View Scraped Data
+        </v-btn>
       </div>
     </v-alert>
   </v-expand-transition>
@@ -100,9 +102,7 @@
 <script>
 import { api } from "@/api";
-import TheDownloadBtn from "@/components/UI/Buttons/TheDownloadBtn.vue";

 export default {
-  components: { TheDownloadBtn },
   props: {
     absolute: {
       default: false,

View file

@@ -0,0 +1,62 @@
+<template>
+  <v-container>
+    <v-text-field v-model="testUrl" outlined single-line label="Recipe Url"> </v-text-field>
+    <div class="d-flex">
+      <v-btn class="mt-0 ml-auto" color="info" @click="getTestData">
+        <v-icon left> mdi-test-tube </v-icon>
+        Test Scrape
+      </v-btn>
+    </div>
+    <VJsoneditor class="mt-2" v-model="recipeJson" height="1500px" :options="jsonEditorOptions" />
+  </v-container>
+</template>
+
+<script>
+import VJsoneditor from "v-jsoneditor";
+import { api } from "@/api";
+
+export default {
+  components: {
+    VJsoneditor,
+  },
+  data() {
+    return {
+      jsonEditorOptions: {
+        mode: "code",
+        search: false,
+        mainMenuBar: false,
+      },
+      recipeJson: {},
+      defaultMessage: { details: "site failed to return valid schema" },
+    };
+  },
+  mounted() {
+    if (this.$route.query.test_url) {
+      this.getTestData();
+    }
+  },
+  computed: {
+    testUrl: {
+      set(test_url) {
+        this.$router.replace({ query: { ...this.$route.query, test_url } });
+      },
+      get() {
+        return this.$route.query.test_url || "";
+      },
+    },
+  },
+  methods: {
+    async getTestData() {
+      const response = await api.recipes.testScrapeURL(this.testUrl).catch(() => {
+        this.recipeJson = this.defaultMessage;
+      });
+      if (response.length < 1) {
+        this.recipeJson = this.defaultMessage;
+        return;
+      }
+      this.recipeJson = response;
+    },
+  },
+};
+</script>

View file

@@ -63,7 +63,7 @@ import RecipeViewer from "@/components/Recipe/RecipeViewer";
 import PrintView from "@/components/Recipe/PrintView";
 import RecipeEditor from "@/components/Recipe/RecipeEditor";
 import RecipeTimeCard from "@/components/Recipe/RecipeTimeCard.vue";
-import EditorButtonRow from "@/components/Recipe/EditorButtonRow";
+import EditorButtonRow from "@/components/Recipe/EditorButtonRow.vue";
 import NoRecipe from "@/components/Fallbacks/NoRecipe";
 import { user } from "@/mixins/user";
 import { router } from "@/routes";
@@ -133,7 +133,7 @@ export default {
   },
   watch: {
-    $route: function() {
+    $route: function () {
       this.getRecipeDetails();
     },
   },

View file

@@ -1,5 +1,6 @@
 const ViewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ViewRecipe");
 const NewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/NewRecipe");
+const ScraperDebugger = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ScraperDebugger");
 const CustomPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CustomPage");
 const AllRecipes = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/AllRecipes");
 const CategoryTagPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CategoryTagPage");
@@ -9,6 +10,7 @@ import { api } from "@/api";
 export const recipeRoutes = [
   // Recipes
   { path: "/recipes/all", component: AllRecipes },
+  { path: "/recipes/debugger", component: ScraperDebugger },
   { path: "/user/:id/favorites", component: Favorites },
   { path: "/recipes/tag/:tag", component: CategoryTagPage },
   { path: "/recipes/tag", component: CategoryTagPage },

View file

@@ -13,6 +13,7 @@ from mealie.services.events import create_recipe_event
 from mealie.services.image.image import scrape_image, write_image
 from mealie.services.recipe.media import check_assets, delete_assets
 from mealie.services.scraper.scraper import create_from_url
+from scrape_schema_recipe import scrape_url
 from slugify import slugify
 from sqlalchemy.orm.session import Session
@@ -41,6 +42,11 @@ def create_from_json(
     return recipe.slug


+@router.post("/test-scrape-url", dependencies=[Depends(get_current_user)])
+def test_parse_recipe_url(url: RecipeURLIn):
+    return scrape_url(url.url)
+
+
 @router.post("/create-url", status_code=201, response_model=str)
 def parse_recipe_url(
     background_tasks: BackgroundTasks,
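A minimal sketch (not part of the commit) of exercising this debug endpoint with Python's requests, assuming a Mealie instance at localhost:9000, that the router is mounted under /api/recipes, and a valid bearer token, since the route depends on get_current_user:

    import requests

    TOKEN = "..."  # a valid Mealie API token (placeholder)

    resp = requests.post(
        "http://localhost:9000/api/recipes/test-scrape-url",
        json={"url": "https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza"},
        headers={"Authorization": f"Bearer {TOKEN}"},
    )
    # Returns the raw schema.org data found by scrape_schema_recipe.scrape_url(),
    # which is what the new /recipes/debugger page renders in its JSON editor.
    print(resp.json())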

View file

@@ -42,6 +42,7 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:

 def scrape_image(image_url: str, slug: str) -> Path:
+    logger.info(f"Image URL: {image_url}")
     if isinstance(image_url, str):  # Handles String Types
         image_url = image_url
@@ -64,7 +65,7 @@ def scrape_image(image_url: str, slug: str) -> Path:
     if r.status_code == 200:
         r.raw.decode_content = True
+        logger.info(f"File Name Suffix {filename.suffix}")
         write_image(slug, r.raw, filename.suffix)
         filename.unlink(missing_ok=True)

View file

@@ -39,6 +39,8 @@ def minify_image(image_file: Path, force=False) -> ImageSizes:
     min_dest = image_file.parent.joinpath("min-original.webp")
     tiny_dest = image_file.parent.joinpath("tiny-original.webp")
+
+    cleanup_images = False
     if min_dest.exists() and tiny_dest.exists() and org_dest.exists() and not force:
         return
     try:

View file

@@ -9,7 +9,7 @@ from mealie.db.database import db
 from mealie.schema.migration import MigrationImport
 from mealie.schema.recipe import Recipe
 from mealie.services.image import image
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.utils.unzip import unpack_zip
 from pydantic import BaseModel
@@ -144,7 +144,7 @@ class MigrationBase(BaseModel):
         """Calls the rewrite_alias function and the Cleaner.clean function on a
         dictionary and returns the result unpacked into a Recipe object"""
         recipe_dict = self.rewrite_alias(recipe_dict)
-        recipe_dict = Cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
+        recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
         return Recipe(**recipe_dict)

View file

@@ -1,4 +1,5 @@
 import html
+import json
 import re
 from datetime import datetime, timedelta
 from typing import List
@@ -6,157 +7,157 @@ from typing import List
 from slugify import slugify


-class Cleaner:
-    """A Namespace for utility function to clean recipe data extracted
-    from a url and returns a dictionary that is ready for import into
-    the database. Cleaner.clean is the main entrypoint
+def clean(recipe_data: dict, url=None) -> dict:
+    """Main entrypoint to clean a recipe extracted from the web
+    and format the data into an accectable format for the database
+
+    Args:
+        recipe_data (dict): raw recipe dicitonary
+
+    Returns:
+        dict: cleaned recipe dictionary
     """
+    recipe_data["description"] = clean_string(recipe_data.get("description", ""))

-    @staticmethod
-    def clean(recipe_data: dict, url=None) -> dict:
-        """Main entrypoint to clean a recipe extracted from the web
-        and format the data into an accectable format for the database
-
-        Args:
-            recipe_data (dict): raw recipe dicitonary
-
-        Returns:
-            dict: cleaned recipe dictionary
-        """
-        recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
-
-        # Times
-        recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
-        recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
-        recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
-        recipe_data["recipeCategory"] = Cleaner.category(recipe_data.get("recipeCategory", []))
-
-        recipe_data["recipeYield"] = Cleaner.yield_amount(recipe_data.get("recipeYield"))
-        recipe_data["recipeIngredient"] = Cleaner.ingredient(recipe_data.get("recipeIngredient"))
-        recipe_data["recipeInstructions"] = Cleaner.instructions(recipe_data.get("recipeInstructions"))
-        recipe_data["image"] = Cleaner.image(recipe_data.get("image"))
-        recipe_data["slug"] = slugify(recipe_data.get("name"))
-        recipe_data["orgURL"] = url
-
-        return recipe_data
-
-    @staticmethod
-    def category(category: str):
-        if isinstance(category, str) and category != "":
-            return [category]
-        else:
-            return []
-
-    @staticmethod
-    def html(raw_html):
-        cleanr = re.compile("<.*?>")
-        return re.sub(cleanr, "", raw_html)
-
-    @staticmethod
-    def image(image=None) -> str:
-        if not image:
-            return "no image"
-        if isinstance(image, list):
-            return image[0]
-        elif isinstance(image, dict):
-            return image["url"]
-        elif isinstance(image, str):
-            return image
-        else:
-            raise Exception(f"Unrecognised image URL format: {image}")
-
-    @staticmethod
-    def instructions(instructions) -> List[dict]:
-        if not instructions:
-            return []
-
-        if isinstance(instructions[0], list):
-            instructions = instructions[0]
-
-        # One long string split by (possibly multiple) new lines
-        if isinstance(instructions, str):
-            return [{"text": Cleaner._instruction(line)} for line in instructions.splitlines() if line]
-
-        # Plain strings in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], str):
-            return [{"text": Cleaner._instruction(step)} for step in instructions]
-
-        # Dictionaries (let's assume it's a HowToStep) in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], dict):
-            # Try List of Dictionary without "@type" or "type"
-            if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
-                return [{"text": Cleaner._instruction(step["text"])} for step in instructions]
-
-            try:
-                # If HowToStep is under HowToSection
-                sectionSteps = []
-                for step in instructions:
-                    if step["@type"] == "HowToSection":
-                        [sectionSteps.append(item) for item in step["itemListElement"]]
-
-                if len(sectionSteps) > 0:
-                    return [
-                        {"text": Cleaner._instruction(step["text"])}
-                        for step in sectionSteps
-                        if step["@type"] == "HowToStep"
-                    ]
-
-                return [
-                    {"text": Cleaner._instruction(step["text"])}
-                    for step in instructions
-                    if step["@type"] == "HowToStep"
-                ]
-            except Exception as e:
-                print(e)
-                # Not "@type", try "type"
-                try:
-                    return [
-                        {"text": Cleaner._instruction(step["properties"]["text"])}
-                        for step in instructions
-                        if step["type"].find("HowToStep") > -1
-                    ]
-                except Exception:
-                    pass
-
-        else:
-            raise Exception(f"Unrecognised instruction format: {instructions}")
-
-    @staticmethod
-    def _instruction(line) -> str:
-        clean_line = Cleaner.html(line.strip())
-        # Some sites erroneously escape their strings on multiple levels
-        while not clean_line == (clean_line := html.unescape(clean_line)):
-            pass
-        return clean_line
-
-    @staticmethod
-    def ingredient(ingredients: list) -> str:
-        if ingredients:
-            return [Cleaner.html(html.unescape(ing)) for ing in ingredients]
-        else:
-            return []
-
-    @staticmethod
-    def yield_amount(yld) -> str:
-        if isinstance(yld, list):
-            return yld[-1]
-        else:
-            return yld
-
-    @staticmethod
-    def time(time_entry):
-        if time_entry is None:
-            return None
-        elif isinstance(time_entry, timedelta):
-            pretty_print_timedelta(time_entry)
-        elif isinstance(time_entry, datetime):
-            print(time_entry)
-        elif isinstance(time_entry, str):
-            if re.match("PT.*H.*M", time_entry):
-                time_delta_object = parse_duration(time_entry)
-                return pretty_print_timedelta(time_delta_object)
-        else:
-            return str(time_entry)
+    # Times
+    recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
+    recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
+    recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
+    recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))
+
+    recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
+    recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
+    recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
+    recipe_data["image"] = image(recipe_data.get("image"))
+    recipe_data["slug"] = slugify(recipe_data.get("name"))
+    recipe_data["orgURL"] = url
+
+    return recipe_data
+
+
+def clean_string(text: str) -> str:
+    cleaned_text = html.unescape(text)
+    cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
+    cleaned_text = re.sub(" +", " ", cleaned_text)
+    cleaned_text = re.sub("</p>", "\n", cleaned_text)
+    cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
+    cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
+    return cleaned_text
+
+
+def category(category: str):
+    if isinstance(category, str) and category != "":
+        return [category]
+    else:
+        return []
+
+
+def clean_html(raw_html):
+    cleanr = re.compile("<.*?>")
+    return re.sub(cleanr, "", raw_html)
+
+
+def image(image=None) -> str:
+    if not image:
+        return "no image"
+    if isinstance(image, list):
+        return image[0]
+    elif isinstance(image, dict):
+        return image["url"]
+    elif isinstance(image, str):
+        return image
+    else:
+        raise Exception(f"Unrecognised image URL format: {image}")
+
+
+def instructions(instructions) -> List[dict]:
+    try:
+        instructions = json.loads(instructions)
+    except Exception:
+        pass
+
+    if not instructions:
+        return []
+
+    if isinstance(instructions, list) and isinstance(instructions[0], list):
+        instructions = instructions[0]
+
+    # One long string split by (possibly multiple) new lines
+    if isinstance(instructions, str):
+        return [{"text": _instruction(line)} for line in instructions.splitlines() if line]
+
+    # Plain strings in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], str):
+        return [{"text": _instruction(step)} for step in instructions]
+
+    # Dictionaries (let's assume it's a HowToStep) in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], dict):
+        # Try List of Dictionary without "@type" or "type"
+        if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
+            return [{"text": _instruction(step["text"])} for step in instructions]
+
+        try:
+            # If HowToStep is under HowToSection
+            sectionSteps = []
+            for step in instructions:
+                if step["@type"] == "HowToSection":
+                    [sectionSteps.append(item) for item in step["itemListElement"]]
+
+            if len(sectionSteps) > 0:
+                return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]
+
+            return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
+        except Exception as e:
+            print(e)
+            # Not "@type", try "type"
+            try:
+                return [
+                    {"text": _instruction(step["properties"]["text"])}
+                    for step in instructions
+                    if step["type"].find("HowToStep") > -1
+                ]
+            except Exception:
+                pass
+
+    else:
+        raise Exception(f"Unrecognised instruction format: {instructions}")
+
+
+def _instruction(line) -> str:
+    clean_line = clean_string(line.strip())
+    # Some sites erroneously escape their strings on multiple levels
+    while not clean_line == (clean_line := clean_string(clean_line)):
+        pass
+    return clean_line
+
+
+def ingredient(ingredients: list) -> str:
+    if ingredients:
+        return [clean_string(ing) for ing in ingredients]
+    else:
+        return []
+
+
+def yield_amount(yld) -> str:
+    if isinstance(yld, list):
+        return yld[-1]
+    else:
+        return yld
+
+
+def clean_time(time_entry):
+    if time_entry is None:
+        return None
+    elif isinstance(time_entry, timedelta):
+        pretty_print_timedelta(time_entry)
+    elif isinstance(time_entry, datetime):
+        print(time_entry)
+    elif isinstance(time_entry, str):
+        if re.match("PT.*H.*M", time_entry):
+            time_delta_object = parse_duration(time_entry)
+            return pretty_print_timedelta(time_delta_object)
+    else:
+        return str(time_entry)


 # ! TODO: Cleanup Code Below
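A minimal sketch (not part of the commit) of the refactored module-level API, using a hypothetical schema.org-style dictionary; the former Cleaner.clean/Cleaner.html/Cleaner.time static methods are now the plain functions clean, clean_string/clean_html, and clean_time:

    from mealie.services.scraper import cleaner

    raw = {  # hypothetical scraped recipe data
        "name": "Pan Pizza",
        "description": "<p>Crispy&nbsp;edges</p>",
        "recipeYield": ["4", "4 servings"],
        "recipeIngredient": ["500 g flour", "325 g water"],
        "recipeInstructions": "Mix.\nProof.\nBake.",
        "totalTime": "PT2H30M",
    }

    cleaned = cleaner.clean(raw, url="https://example.com/pan-pizza")
    # cleaned["slug"] == "pan-pizza"
    # cleaned["recipeYield"] == "4 servings"
    # cleaned["recipeInstructions"] == [{"text": "Mix."}, {"text": "Proof."}, {"text": "Bake."}]
    # cleaned["totalTime"] == "2 Hours 30 Minutes"  (per the updated test_time_cleaner)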

View file

@@ -1,17 +1,20 @@
 import json
+from enum import Enum
+from typing import Any, Callable

 import requests
-import scrape_schema_recipe
-from mealie.core import root_logger
+from fastapi import HTTPException, status
 from mealie.core.config import app_dirs
-from mealie.schema.recipe import Recipe
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import Recipe, RecipeStep
 from mealie.services.image.image import scrape_image
-from mealie.services.scraper import open_graph
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner, open_graph
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me

 LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")

-logger = root_logger.get_logger()
+logger = get_logger()
@@ -24,48 +27,130 @@ def create_from_url(url: str) -> Recipe:
     Returns:
         Recipe: Recipe Object
     """
-    r = requests.get(url)
-    new_recipe = extract_recipe_from_html(r.text, url)
-    new_recipe = Cleaner.clean(new_recipe, url)
-    new_recipe = download_image_for_recipe(new_recipe)
+    new_recipe = scrape_from_url(url)
+    logger.info(f"Image {new_recipe.image}")
+    new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)

-    return Recipe(**new_recipe)
+    return new_recipe


-def extract_recipe_from_html(html: str, url: str) -> dict:
-    scraped_recipes: list[dict]
-    try:
-        scraped_recipes = scrape_schema_recipe.scrape_url(url)
-    except Exception as e:
-        print(e)
-        scraped_recipes = scrape_schema_recipe.loads(html, python_objects=True)
-
-    dump_last_json(scraped_recipes)
-
-    if scraped_recipes:
-        new_recipe: dict = scraped_recipes[0]
-        logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
-        if not new_recipe:
-            return "fail"  # TODO: Return Better Error Here
-
-        new_recipe = Cleaner.clean(new_recipe, url)
-    else:
-        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
-        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
-
-    return new_recipe
+class ParserErrors(str, Enum):
+    bad_recipe = "BAD_RECIPE_DATA"
+    no_recipe_data = "NO_RECIPE_DATA"
+    connection_error = "CONNECTION_ERROR"


-def download_image_for_recipe(recipe: dict) -> dict:
-    try:
-        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
-        recipe["image"] = img_path.name
-    except Exception:
-        recipe["image"] = "no image"
+def extract_open_graph_values(url) -> Recipe:
+    r = requests.get(url)
+    recipe = open_graph.basic_recipe_from_opengraph(r.text, url)

-    return recipe
+    return Recipe(**recipe)
+
+
+def scrape_from_url(url: str) -> Recipe:
+    """Entry function to generating are recipe obejct from a url
+    This will determine if a url can be parsed and raise an appropriate error keyword
+    This keyword is used on the frontend to reference a localized string to present on the UI.
+
+    Args:
+        url (str): String Representing the URL
+
+    Raises:
+        HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
+
+    Returns:
+        Recipe: Recipe Model
+    """
+    try:
+        scraped_schema = scrape_me(url)
+    except (WebsiteNotImplementedError, AttributeError):
+        try:
+            scraped_schema = scrape_me(url, wild_mode=True)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            recipe = extract_open_graph_values(url)
+            if recipe.name != "":
+                return recipe
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.bad_recipe.value})
+    except ConnectionError:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.connection_error.value})
+
+    try:
+        instruct = scraped_schema.instructions()
+    except Exception:
+        instruct = []
+
+    try:
+        ing = scraped_schema.ingredients()
+    except Exception:
+        ing = []
+
+    if not instruct and not ing:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.no_recipe_data.value})
+    else:
+        return clean_scraper(scraped_schema, url)
+
+
+def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
+    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
+        value = default
+        try:
+            value = func_call()
+        except Exception:
+            logger.error(f"Error parsing recipe func_call for '{get_attr}'")
+
+        if value == default:
+            try:
+                value = scraped_data.schema.data.get(get_attr)
+            except Exception:
+                logger.error(f"Error parsing recipe attribute '{get_attr}'")
+
+        if clean_func:
+            value = clean_func(value)
+
+        return value
+
+    def get_instructions() -> list[dict]:
+        instruction_as_text = try_get_default(
+            scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
+        )
+
+        logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        instruction_as_text = cleaner.instructions(instruction_as_text)
+
+        logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        try:
+            return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
+        except TypeError:
+            return []
+
+    return Recipe(
+        name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
+        slug="",
+        image=try_get_default(scraped_data.image, "image", None),
+        description=try_get_default(None, "description", "", cleaner.clean_string),
+        recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
+        recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
+        recipe_instructions=get_instructions(),
+        total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
+        prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
+        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+        org_url=url,
+    )
+
+
+def download_image_for_recipe(slug, image_url) -> dict:
+    img_name = None
+    try:
+        img_path = scrape_image(image_url, slug)
+        img_name = img_path.name
+    except Exception as e:
+        logger.error(f"Error Scraping Image: {e}")
+        img_name = None
+
+    return img_name or "no image"


 def dump_last_json(recipe_data: dict):
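A minimal sketch (not part of the commit) of how a caller can consume the new entrypoint and its error keywords; the URL is one of the cases from the new parser tests below:

    from fastapi import HTTPException
    from mealie.services.scraper import scraper

    try:
        recipe = scraper.create_from_url("https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza")
        print(recipe.slug, len(recipe.recipe_ingredient))
    except HTTPException as exc:
        # exc.detail carries a ParserErrors keyword, e.g. {"details": "NO_RECIPE_DATA"};
        # the frontend maps it to a localized error message.
        print(exc.status_code, exc.detail)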

View file

@@ -2,7 +2,7 @@ import json
 import re

 import pytest
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraper import extract_recipe_from_html
 from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES
@@ -39,23 +39,23 @@
     ],
 )
 def test_cleaner_clean(json_file, num_steps):
-    recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+    recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
     assert len(recipe_data["recipeInstructions"]) == num_steps


 def test_clean_category():
-    assert Cleaner.category("my-category") == ["my-category"]
+    assert cleaner.category("my-category") == ["my-category"]


-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"


 def test_clean_image():
-    assert Cleaner.image(None) == "no image"
-    assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
-    assert Cleaner.image({"url": "My URL!"}) == "My URL!"
-    assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+    assert cleaner.image(None) == "no image"
+    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+    assert cleaner.image({"url": "My URL!"}) == "My URL!"
+    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"


 @pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
     ],
 )
 def test_cleaner_instructions(instructions):
-    assert Cleaner.instructions(instructions) == [
+    assert cleaner.instructions(instructions) == [
         {"text": "A"},
         {"text": "B"},
         {"text": "C"},
@@ -94,6 +94,6 @@ def test_html_with_recipe_data():

 def test_time_cleaner():
     my_time_delta = "PT2H30M"
-    return_delta = Cleaner.time(my_time_delta)
+    return_delta = cleaner.clean_time(my_time_delta)

     assert return_delta == "2 Hours 30 Minutes"

View file

@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+import pytest
+from mealie.services.scraper import scraper
+
+
+@dataclass
+class RecipeSiteTestCase:
+    url: str
+    expected_slug: str
+    num_ingredients: int
+    num_steps: int
+
+
+test_cases = [
+    RecipeSiteTestCase(
+        url="https://www.seriouseats.com/taiwanese-three-cup-chicken-san-bei-gi-recipe",
+        expected_slug="taiwanese-three-cup-chicken-san-bei-ji-recipe",
+        num_ingredients=10,
+        num_steps=3,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.rezeptwelt.de/backen-herzhaft-rezepte/schinken-kaese-waffeln-ohne-viel-schnickschnack/4j0bkiig-94d4d-106529-cfcd2-is97x2ml",
+        expected_slug="schinken-kase-waffeln-ohne-viel-schnickschnack",
+        num_ingredients=7,
+        num_steps=1,  # Malformed JSON Data, can't parse steps just get one string
+    ),
+    RecipeSiteTestCase(
+        url="https://cookpad.com/us/recipes/5544853-sous-vide-smoked-beef-ribs",
+        expected_slug="sous-vide-smoked-beef-ribs",
+        num_ingredients=7,
+        num_steps=12,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.greatbritishchefs.com/recipes/jam-roly-poly-recipe",
+        expected_slug="jam-roly-poly-with-custard",
+        num_ingredients=13,
+        num_steps=9,
+    ),
+    RecipeSiteTestCase(
+        url="https://recipes.anovaculinary.com/recipe/sous-vide-shrimp",
+        expected_slug="sous-vide-shrimp",
+        num_ingredients=5,
+        num_steps=0,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza",
+        expected_slug="detroit-style-pepperoni-pizza",
+        num_ingredients=8,
+        num_steps=5,
+    ),
+]
+
+
+@pytest.mark.parametrize("recipe_test_data", test_cases)
+def test_recipe_parser(recipe_test_data: RecipeSiteTestCase):
+    recipe = scraper.create_from_url(recipe_test_data.url)
+
+    assert recipe.slug == recipe_test_data.expected_slug
+    assert len(recipe.recipe_instructions) == recipe_test_data.num_steps
+    assert len(recipe.recipe_ingredient) == recipe_test_data.num_ingredients
+    assert recipe.org_url == recipe_test_data.url