Mirror of https://github.com/hay-kot/mealie.git

rewrite scraper to use new library

commit b65445d4b1 (parent d674f1eae5)
13 changed files with 421 additions and 194 deletions
@@ -12,6 +12,7 @@ const recipeURLs = {
   allRecipesByCategory: prefix + "category",
   create: prefix + "create",
   createByURL: prefix + "create-url",
+  testParseURL: prefix + "test-scrape-url",
   recipe: slug => prefix + slug,
   update: slug => prefix + slug,
   delete: slug => prefix + slug,
@@ -183,4 +184,9 @@ export const recipeAPI = {
     const response = await apiReq.delete(API_ROUTES.recipesSlugCommentsId(slug, id));
     return response.data;
   },
+
+  async testScrapeURL(url) {
+    const response = await apiReq.post(recipeURLs.testParseURL, { url: url });
+    return response.data;
+  },
 };
@@ -54,13 +54,15 @@
       </a>
     </div>
     <div class="d-flex justify-end">
-      <TheDownloadBtn download-url="/api/debug/last-recipe-json">
-        <template v-slot:default="{ downloadFile }">
-          <v-btn class="ml-auto mt-4" outlined color="white" @click="downloadFile">
-            <v-icon left> mdi-download </v-icon> {{ $t("about.download-recipe-json") }}
-          </v-btn>
-        </template>
-      </TheDownloadBtn>
+      <v-btn
+        white
+        outlined
+        :to="{ path: '/recipes/debugger', query: { test_url: recipeURL } }"
+        @click="addRecipe = false"
+      >
+        <v-icon> mdi-external-link </v-icon>
+        View Scraped Data
+      </v-btn>
     </div>
   </v-alert>
 </v-expand-transition>
@@ -100,9 +102,7 @@
 
 <script>
 import { api } from "@/api";
-import TheDownloadBtn from "@/components/UI/Buttons/TheDownloadBtn.vue";
 export default {
-  components: { TheDownloadBtn },
   props: {
     absolute: {
       default: false,
frontend/src/pages/Recipe/ScraperDebugger.vue (new file, 62 lines)
@@ -0,0 +1,62 @@
+<template>
+  <v-container>
+    <v-text-field v-model="testUrl" outlined single-line label="Recipe Url"> </v-text-field>
+    <div class="d-flex">
+      <v-btn class="mt-0 ml-auto" color="info" @click="getTestData">
+        <v-icon left> mdi-test-tube </v-icon>
+        Test Scrape
+      </v-btn>
+    </div>
+    <VJsoneditor class="mt-2" v-model="recipeJson" height="1500px" :options="jsonEditorOptions" />
+  </v-container>
+</template>
+
+<script>
+import VJsoneditor from "v-jsoneditor";
+import { api } from "@/api";
+export default {
+  components: {
+    VJsoneditor,
+  },
+  data() {
+    return {
+      jsonEditorOptions: {
+        mode: "code",
+        search: false,
+        mainMenuBar: false,
+      },
+      recipeJson: {},
+      defaultMessage: { details: "site failed to return valid schema" },
+    };
+  },
+  mounted() {
+    if (this.$route.query.test_url) {
+      this.getTestData();
+    }
+  },
+  computed: {
+    testUrl: {
+      set(test_url) {
+        this.$router.replace({ query: { ...this.$route.query, test_url } });
+      },
+      get() {
+        return this.$route.query.test_url || "";
+      },
+    },
+  },
+  methods: {
+    async getTestData() {
+      const response = await api.recipes.testScrapeURL(this.testUrl).catch(() => {
+        this.recipeJson = this.defaultMessage;
+      });
+
+      if (response.length < 1) {
+        this.recipeJson = this.defaultMessage;
+        return;
+      }
+
+      this.recipeJson = response;
+    },
+  },
+};
+</script>
@@ -63,7 +63,7 @@ import RecipeViewer from "@/components/Recipe/RecipeViewer";
 import PrintView from "@/components/Recipe/PrintView";
 import RecipeEditor from "@/components/Recipe/RecipeEditor";
 import RecipeTimeCard from "@/components/Recipe/RecipeTimeCard.vue";
-import EditorButtonRow from "@/components/Recipe/EditorButtonRow";
+import EditorButtonRow from "@/components/Recipe/EditorButtonRow.vue";
 import NoRecipe from "@/components/Fallbacks/NoRecipe";
 import { user } from "@/mixins/user";
 import { router } from "@/routes";
@@ -133,7 +133,7 @@ export default {
   },
 
   watch: {
-    $route: function() {
+    $route: function () {
       this.getRecipeDetails();
     },
   },
@@ -1,5 +1,6 @@
 const ViewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ViewRecipe");
 const NewRecipe = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/NewRecipe");
+const ScraperDebugger = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipe/ScraperDebugger");
 const CustomPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CustomPage");
 const AllRecipes = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/AllRecipes");
 const CategoryTagPage = () => import(/* webpackChunkName: "recipes" */ "@/pages/Recipes/CategoryTagPage");
@@ -9,6 +10,7 @@ import { api } from "@/api";
 export const recipeRoutes = [
   // Recipes
   { path: "/recipes/all", component: AllRecipes },
+  { path: "/recipes/debugger", component: ScraperDebugger },
   { path: "/user/:id/favorites", component: Favorites },
   { path: "/recipes/tag/:tag", component: CategoryTagPage },
   { path: "/recipes/tag", component: CategoryTagPage },
@@ -13,6 +13,7 @@ from mealie.services.events import create_recipe_event
 from mealie.services.image.image import scrape_image, write_image
 from mealie.services.recipe.media import check_assets, delete_assets
 from mealie.services.scraper.scraper import create_from_url
+from scrape_schema_recipe import scrape_url
 from slugify import slugify
 from sqlalchemy.orm.session import Session
 
@@ -41,6 +42,11 @@ def create_from_json(
     return recipe.slug
 
 
+@router.post("/test-scrape-url", dependencies=[Depends(get_current_user)])
+def test_parse_recipe_url(url: RecipeURLIn):
+    return scrape_url(url.url)
+
+
 @router.post("/create-url", status_code=201, response_model=str)
 def parse_recipe_url(
     background_tasks: BackgroundTasks,
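
Aside: a minimal sketch of exercising the new debug endpoint from Python. The host, port, and token are hypothetical, and the path assumes the route is mounted under /api/recipes alongside the other recipe routes; the endpoint simply proxies scrape_schema_recipe.scrape_url and returns the raw schema.org data found on the page.

    import requests

    token = "YOUR_API_TOKEN"  # hypothetical token; the route requires an authenticated user
    response = requests.post(
        "http://localhost:9000/api/recipes/test-scrape-url",  # assumed mount point
        json={"url": "https://example.com/some-recipe"},
        headers={"Authorization": f"Bearer {token}"},
    )
    print(response.json())  # raw schema.org recipe data, before any cleaning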
@@ -42,6 +42,7 @@ def write_image(recipe_slug: str, file_data: bytes, extension: str) -> Path:
 
 
 def scrape_image(image_url: str, slug: str) -> Path:
+    logger.info(f"Image URL: {image_url}")
     if isinstance(image_url, str):  # Handles String Types
         image_url = image_url
 
@@ -64,7 +65,7 @@ def scrape_image(image_url: str, slug: str) -> Path:
 
     if r.status_code == 200:
         r.raw.decode_content = True
+        logger.info(f"File Name Suffix {filename.suffix}")
         write_image(slug, r.raw, filename.suffix)
 
     filename.unlink(missing_ok=True)
@@ -39,6 +39,8 @@ def minify_image(image_file: Path, force=False) -> ImageSizes:
     min_dest = image_file.parent.joinpath("min-original.webp")
     tiny_dest = image_file.parent.joinpath("tiny-original.webp")
 
+    cleanup_images = False
+
     if min_dest.exists() and tiny_dest.exists() and org_dest.exists() and not force:
         return
     try:
@@ -9,7 +9,7 @@ from mealie.db.database import db
 from mealie.schema.migration import MigrationImport
 from mealie.schema.recipe import Recipe
 from mealie.services.image import image
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.utils.unzip import unpack_zip
 from pydantic import BaseModel
 
@@ -144,7 +144,7 @@ class MigrationBase(BaseModel):
         """Calls the rewrite_alias function and the Cleaner.clean function on a
         dictionary and returns the result unpacked into a Recipe object"""
         recipe_dict = self.rewrite_alias(recipe_dict)
-        recipe_dict = Cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
+        recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
 
         return Recipe(**recipe_dict)
@@ -1,4 +1,5 @@
 import html
+import json
 import re
 from datetime import datetime, timedelta
 from typing import List
@@ -6,157 +7,157 @@ from typing import List
 from slugify import slugify
 
 
-class Cleaner:
-    """A Namespace for utility function to clean recipe data extracted
-    from a url and returns a dictionary that is ready for import into
-    the database. Cleaner.clean is the main entrypoint
-    """
-
-    @staticmethod
-    def clean(recipe_data: dict, url=None) -> dict:
-        """Main entrypoint to clean a recipe extracted from the web
-        and format the data into an accectable format for the database
-
-        Args:
-            recipe_data (dict): raw recipe dicitonary
-
-        Returns:
-            dict: cleaned recipe dictionary
-        """
-        recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
-
-        # Times
-        recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
-        recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
-        recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
-        recipe_data["recipeCategory"] = Cleaner.category(recipe_data.get("recipeCategory", []))
-
-        recipe_data["recipeYield"] = Cleaner.yield_amount(recipe_data.get("recipeYield"))
-        recipe_data["recipeIngredient"] = Cleaner.ingredient(recipe_data.get("recipeIngredient"))
-        recipe_data["recipeInstructions"] = Cleaner.instructions(recipe_data.get("recipeInstructions"))
-        recipe_data["image"] = Cleaner.image(recipe_data.get("image"))
-        recipe_data["slug"] = slugify(recipe_data.get("name"))
-        recipe_data["orgURL"] = url
-
-        return recipe_data
-
-    @staticmethod
-    def category(category: str):
-        if isinstance(category, str) and category != "":
-            return [category]
-        else:
-            return []
-
-    @staticmethod
-    def html(raw_html):
-        cleanr = re.compile("<.*?>")
-        return re.sub(cleanr, "", raw_html)
-
-    @staticmethod
-    def image(image=None) -> str:
-        if not image:
-            return "no image"
-        if isinstance(image, list):
-            return image[0]
-        elif isinstance(image, dict):
-            return image["url"]
-        elif isinstance(image, str):
-            return image
-        else:
-            raise Exception(f"Unrecognised image URL format: {image}")
-
-    @staticmethod
-    def instructions(instructions) -> List[dict]:
-        if not instructions:
-            return []
-
-        if isinstance(instructions[0], list):
-            instructions = instructions[0]
-
-        # One long string split by (possibly multiple) new lines
-        if isinstance(instructions, str):
-            return [{"text": Cleaner._instruction(line)} for line in instructions.splitlines() if line]
-
-        # Plain strings in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], str):
-            return [{"text": Cleaner._instruction(step)} for step in instructions]
-
-        # Dictionaries (let's assume it's a HowToStep) in a list
-        elif isinstance(instructions, list) and isinstance(instructions[0], dict):
-            # Try List of Dictionary without "@type" or "type"
-            if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
-                return [{"text": Cleaner._instruction(step["text"])} for step in instructions]
-
-            try:
-                # If HowToStep is under HowToSection
-                sectionSteps = []
-                for step in instructions:
-                    if step["@type"] == "HowToSection":
-                        [sectionSteps.append(item) for item in step["itemListElement"]]
-
-                if len(sectionSteps) > 0:
-                    return [
-                        {"text": Cleaner._instruction(step["text"])}
-                        for step in sectionSteps
-                        if step["@type"] == "HowToStep"
-                    ]
-
-                return [
-                    {"text": Cleaner._instruction(step["text"])}
-                    for step in instructions
-                    if step["@type"] == "HowToStep"
-                ]
-            except Exception as e:
-                print(e)
-                # Not "@type", try "type"
-                try:
-                    return [
-                        {"text": Cleaner._instruction(step["properties"]["text"])}
-                        for step in instructions
-                        if step["type"].find("HowToStep") > -1
-                    ]
-                except Exception:
-                    pass
-
-        else:
-            raise Exception(f"Unrecognised instruction format: {instructions}")
-
-    @staticmethod
-    def _instruction(line) -> str:
-        clean_line = Cleaner.html(line.strip())
-        # Some sites erroneously escape their strings on multiple levels
-        while not clean_line == (clean_line := html.unescape(clean_line)):
-            pass
-        return clean_line
-
-    @staticmethod
-    def ingredient(ingredients: list) -> str:
-        if ingredients:
-            return [Cleaner.html(html.unescape(ing)) for ing in ingredients]
-        else:
-            return []
-
-    @staticmethod
-    def yield_amount(yld) -> str:
-        if isinstance(yld, list):
-            return yld[-1]
-        else:
-            return yld
-
-    @staticmethod
-    def time(time_entry):
-        if time_entry is None:
-            return None
-        elif isinstance(time_entry, timedelta):
-            pretty_print_timedelta(time_entry)
-        elif isinstance(time_entry, datetime):
-            print(time_entry)
-        elif isinstance(time_entry, str):
-            if re.match("PT.*H.*M", time_entry):
-                time_delta_object = parse_duration(time_entry)
-                return pretty_print_timedelta(time_delta_object)
-        else:
-            return str(time_entry)
+def clean(recipe_data: dict, url=None) -> dict:
+    """Main entrypoint to clean a recipe extracted from the web
+    and format the data into an accectable format for the database
+
+    Args:
+        recipe_data (dict): raw recipe dicitonary
+
+    Returns:
+        dict: cleaned recipe dictionary
+    """
+    recipe_data["description"] = clean_string(recipe_data.get("description", ""))
+
+    # Times
+    recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
+    recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
+    recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
+    recipe_data["recipeCategory"] = category(recipe_data.get("recipeCategory", []))
+
+    recipe_data["recipeYield"] = yield_amount(recipe_data.get("recipeYield"))
+    recipe_data["recipeIngredient"] = ingredient(recipe_data.get("recipeIngredient"))
+    recipe_data["recipeInstructions"] = instructions(recipe_data.get("recipeInstructions"))
+    recipe_data["image"] = image(recipe_data.get("image"))
+    recipe_data["slug"] = slugify(recipe_data.get("name"))
+    recipe_data["orgURL"] = url
+
+    return recipe_data
+
+
+def clean_string(text: str) -> str:
+    cleaned_text = html.unescape(text)
+    cleaned_text = re.sub("<[^<]+?>", "", cleaned_text)
+    cleaned_text = re.sub(" +", " ", cleaned_text)
+    cleaned_text = re.sub("</p>", "\n", cleaned_text)
+    cleaned_text = re.sub(r"\n\s*\n", "\n\n", cleaned_text)
+    cleaned_text = cleaned_text.replace("\xa0", " ").replace("\t", " ").strip()
+    return cleaned_text
+
+
+def category(category: str):
+    if isinstance(category, str) and category != "":
+        return [category]
+    else:
+        return []
+
+
+def clean_html(raw_html):
+    cleanr = re.compile("<.*?>")
+    return re.sub(cleanr, "", raw_html)
+
+
+def image(image=None) -> str:
+    if not image:
+        return "no image"
+    if isinstance(image, list):
+        return image[0]
+    elif isinstance(image, dict):
+        return image["url"]
+    elif isinstance(image, str):
+        return image
+    else:
+        raise Exception(f"Unrecognised image URL format: {image}")
+
+
+def instructions(instructions) -> List[dict]:
+    try:
+        instructions = json.loads(instructions)
+    except Exception:
+        pass
+
+    if not instructions:
+        return []
+
+    if isinstance(instructions, list) and isinstance(instructions[0], list):
+        instructions = instructions[0]
+
+    # One long string split by (possibly multiple) new lines
+    if isinstance(instructions, str):
+        return [{"text": _instruction(line)} for line in instructions.splitlines() if line]
+
+    # Plain strings in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], str):
+        return [{"text": _instruction(step)} for step in instructions]
+
+    # Dictionaries (let's assume it's a HowToStep) in a list
+    elif isinstance(instructions, list) and isinstance(instructions[0], dict):
+        # Try List of Dictionary without "@type" or "type"
+        if not instructions[0].get("@type", False) and not instructions[0].get("type", False):
+            return [{"text": _instruction(step["text"])} for step in instructions]
+
+        try:
+            # If HowToStep is under HowToSection
+            sectionSteps = []
+            for step in instructions:
+                if step["@type"] == "HowToSection":
+                    [sectionSteps.append(item) for item in step["itemListElement"]]
+
+            if len(sectionSteps) > 0:
+                return [{"text": _instruction(step["text"])} for step in sectionSteps if step["@type"] == "HowToStep"]
+
+            return [{"text": _instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"]
+        except Exception as e:
+            print(e)
+            # Not "@type", try "type"
+            try:
+                return [
+                    {"text": _instruction(step["properties"]["text"])}
+                    for step in instructions
+                    if step["type"].find("HowToStep") > -1
+                ]
+            except Exception:
+                pass
+
+    else:
+        raise Exception(f"Unrecognised instruction format: {instructions}")
+
+
+def _instruction(line) -> str:
+    clean_line = clean_string(line.strip())
+    # Some sites erroneously escape their strings on multiple levels
+    while not clean_line == (clean_line := clean_string(clean_line)):
+        pass
+    return clean_line
+
+
+def ingredient(ingredients: list) -> str:
+    if ingredients:
+        return [clean_string(ing) for ing in ingredients]
+    else:
+        return []
+
+
+def yield_amount(yld) -> str:
+    if isinstance(yld, list):
+        return yld[-1]
+    else:
+        return yld
+
+
+def clean_time(time_entry):
+    if time_entry is None:
+        return None
+    elif isinstance(time_entry, timedelta):
+        pretty_print_timedelta(time_entry)
+    elif isinstance(time_entry, datetime):
+        print(time_entry)
+    elif isinstance(time_entry, str):
+        if re.match("PT.*H.*M", time_entry):
+            time_delta_object = parse_duration(time_entry)
+            return pretty_print_timedelta(time_delta_object)
+    else:
+        return str(time_entry)
 
 
 # ! TODO: Cleanup Code Below
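
Aside: the refactor above turns Cleaner's static methods into plain module-level functions, so call sites shrink from Cleaner.time(...) to cleaner.clean_time(...). A minimal sketch of the new surface, using only functions visible in this diff (the example inputs are made up):

    from mealie.services.scraper import cleaner

    cleaner.clean_string("<p>Mix &amp; bake</p>")   # strips tags, unescapes entities -> "Mix & bake"
    cleaner.category("dinner")                      # wraps a bare string -> ["dinner"]
    cleaner.instructions(["Step one", "Step two"])  # -> [{"text": "Step one"}, {"text": "Step two"}]
    cleaner.clean_time("PT2H30M")                   # ISO 8601 duration -> "2 Hours 30 Minutes"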
@@ -1,17 +1,20 @@
 import json
+from enum import Enum
+from typing import Any, Callable
 
 import requests
-import scrape_schema_recipe
-from mealie.core import root_logger
+from fastapi import HTTPException, status
 from mealie.core.config import app_dirs
-from mealie.schema.recipe import Recipe
+from mealie.core.root_logger import get_logger
+from mealie.schema.recipe import Recipe, RecipeStep
 from mealie.services.image.image import scrape_image
-from mealie.services.scraper import open_graph
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner, open_graph
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
 
 LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")
 
-logger = root_logger.get_logger()
+logger = get_logger()
 
 
 def create_from_url(url: str) -> Recipe:
@@ -24,48 +27,130 @@ def create_from_url(url: str) -> Recipe:
     Returns:
         Recipe: Recipe Object
     """
-    r = requests.get(url)
-    new_recipe = extract_recipe_from_html(r.text, url)
-    new_recipe = Cleaner.clean(new_recipe, url)
-    new_recipe = download_image_for_recipe(new_recipe)
-
-    return Recipe(**new_recipe)
-
-
-def extract_recipe_from_html(html: str, url: str) -> dict:
-    scraped_recipes: list[dict]
-
-    try:
-        scraped_recipes = scrape_schema_recipe.scrape_url(url)
-    except Exception as e:
-        print(e)
-        scraped_recipes = scrape_schema_recipe.loads(html, python_objects=True)
-
-    dump_last_json(scraped_recipes)
-
-    if scraped_recipes:
-        new_recipe: dict = scraped_recipes[0]
-        logger.info(f"Recipe Scraped From Web: {new_recipe}")
-
-        if not new_recipe:
-            return "fail"  # TODO: Return Better Error Here
-
-        new_recipe = Cleaner.clean(new_recipe, url)
-    else:
-        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
-        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
+    new_recipe = scrape_from_url(url)
+    logger.info(f"Image {new_recipe.image}")
+    new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)
 
     return new_recipe
 
 
-def download_image_for_recipe(recipe: dict) -> dict:
-    try:
-        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
-        recipe["image"] = img_path.name
-    except Exception:
-        recipe["image"] = "no image"
-
-    return recipe
+class ParserErrors(str, Enum):
+    bad_recipe = "BAD_RECIPE_DATA"
+    no_recipe_data = "NO_RECIPE_DATA"
+    connection_error = "CONNECTION_ERROR"
+
+
+def extract_open_graph_values(url) -> Recipe:
+    r = requests.get(url)
+    recipe = open_graph.basic_recipe_from_opengraph(r.text, url)
+
+    return Recipe(**recipe)
+
+
+def scrape_from_url(url: str) -> Recipe:
+    """Entry function to generating are recipe obejct from a url
+    This will determine if a url can be parsed and raise an appropriate error keyword
+    This keyword is used on the frontend to reference a localized string to present on the UI.
+
+    Args:
+        url (str): String Representing the URL
+
+    Raises:
+        HTTPException: 400_BAD_REQUEST - See ParserErrors Class for Key Details
+
+    Returns:
+        Recipe: Recipe Model
+    """
+    try:
+        scraped_schema = scrape_me(url)
+    except (WebsiteNotImplementedError, AttributeError):
+        try:
+            scraped_schema = scrape_me(url, wild_mode=True)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            recipe = extract_open_graph_values(url)
+            if recipe.name != "":
+                return recipe
+            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.bad_recipe.value})
+
+    except ConnectionError:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.connection_error.value})
+
+    try:
+        instruct = scraped_schema.instructions()
+    except Exception:
+        instruct = []
+
+    try:
+        ing = scraped_schema.ingredients()
+    except Exception:
+        ing = []
+
+    if not instruct and not ing:
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.no_recipe_data.value})
+    else:
+        return clean_scraper(scraped_schema, url)
+
+
+def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
+    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
+        value = default
+        try:
+            value = func_call()
+        except Exception:
+            logger.error(f"Error parsing recipe func_call for '{get_attr}'")
+
+        if value == default:
+            try:
+                value = scraped_data.schema.data.get(get_attr)
+            except Exception:
+                logger.error(f"Error parsing recipe attribute '{get_attr}'")
+
+        if clean_func:
+            value = clean_func(value)
+
+        return value
+
+    def get_instructions() -> list[dict]:
+        instruction_as_text = try_get_default(
+            scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
+        )
+
+        logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        instruction_as_text = cleaner.instructions(instruction_as_text)
+
+        logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")
+
+        try:
+            return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
+        except TypeError:
+            return []
+
+    return Recipe(
+        name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
+        slug="",
+        image=try_get_default(scraped_data.image, "image", None),
+        description=try_get_default(None, "description", "", cleaner.clean_string),
+        recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
+        recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
+        recipe_instructions=get_instructions(),
+        total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
+        prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
+        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
+        org_url=url,
+    )
+
+
+def download_image_for_recipe(slug, image_url) -> dict:
+    img_name = None
+    try:
+        img_path = scrape_image(image_url, slug)
+        img_name = img_path.name
+    except Exception as e:
+        logger.error(f"Error Scraping Image: {e}")
+        img_name = None
+
+    return img_name or "no image"
 
 
 def dump_last_json(recipe_data: dict):
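
Aside: the rewritten scraper delegates parsing to the recipe-scrapers library and only falls back to OpenGraph metadata when both the site-specific scraper and wild mode fail. A standalone sketch of that fallback chain (the URL is a placeholder; scrape_me and wild_mode are the library's API as used in the diff above):

    from recipe_scrapers import NoSchemaFoundInWildMode, WebsiteNotImplementedError, scrape_me

    url = "https://example.com/some-recipe"  # placeholder
    try:
        scraped = scrape_me(url)  # site-specific scraper, if one exists
    except (WebsiteNotImplementedError, AttributeError):
        try:
            scraped = scrape_me(url, wild_mode=True)  # generic schema.org parsing
        except (NoSchemaFoundInWildMode, AttributeError):
            scraped = None  # Mealie falls back to OpenGraph values here

    if scraped:
        print(scraped.title(), len(scraped.ingredients()), "ingredients")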
@@ -2,7 +2,7 @@ import json
 import re
 
 import pytest
-from mealie.services.scraper.cleaner import Cleaner
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraper import extract_recipe_from_html
 from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES
 
@@ -39,23 +39,23 @@ url_validation_regex = re.compile(
     ],
 )
 def test_cleaner_clean(json_file, num_steps):
-    recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+    recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
     assert len(recipe_data["recipeInstructions"]) == num_steps
 
 
 def test_clean_category():
-    assert Cleaner.category("my-category") == ["my-category"]
+    assert cleaner.category("my-category") == ["my-category"]
 
 
-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"
 
 
 def test_clean_image():
-    assert Cleaner.image(None) == "no image"
-    assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
-    assert Cleaner.image({"url": "My URL!"}) == "My URL!"
-    assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+    assert cleaner.image(None) == "no image"
+    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+    assert cleaner.image({"url": "My URL!"}) == "My URL!"
+    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
 
 
 @pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
     ],
 )
 def test_cleaner_instructions(instructions):
-    assert Cleaner.instructions(instructions) == [
+    assert cleaner.instructions(instructions) == [
         {"text": "A"},
         {"text": "B"},
         {"text": "C"},
@@ -94,6 +94,6 @@ def test_html_with_recipe_data():
 def test_time_cleaner():
 
     my_time_delta = "PT2H30M"
-    return_delta = Cleaner.time(my_time_delta)
+    return_delta = cleaner.clean_time(my_time_delta)
 
     assert return_delta == "2 Hours 30 Minutes"
tests/unit_tests/test_recipe_parser.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+import pytest
+from mealie.services.scraper import scraper
+
+
+@dataclass
+class RecipeSiteTestCase:
+    url: str
+    expected_slug: str
+    num_ingredients: int
+    num_steps: int
+
+
+test_cases = [
+    RecipeSiteTestCase(
+        url="https://www.seriouseats.com/taiwanese-three-cup-chicken-san-bei-gi-recipe",
+        expected_slug="taiwanese-three-cup-chicken-san-bei-ji-recipe",
+        num_ingredients=10,
+        num_steps=3,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.rezeptwelt.de/backen-herzhaft-rezepte/schinken-kaese-waffeln-ohne-viel-schnickschnack/4j0bkiig-94d4d-106529-cfcd2-is97x2ml",
+        expected_slug="schinken-kase-waffeln-ohne-viel-schnickschnack",
+        num_ingredients=7,
+        num_steps=1,  # Malformed JSON Data, can't parse steps just get one string
+    ),
+    RecipeSiteTestCase(
+        url="https://cookpad.com/us/recipes/5544853-sous-vide-smoked-beef-ribs",
+        expected_slug="sous-vide-smoked-beef-ribs",
+        num_ingredients=7,
+        num_steps=12,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.greatbritishchefs.com/recipes/jam-roly-poly-recipe",
+        expected_slug="jam-roly-poly-with-custard",
+        num_ingredients=13,
+        num_steps=9,
+    ),
+    RecipeSiteTestCase(
+        url="https://recipes.anovaculinary.com/recipe/sous-vide-shrimp",
+        expected_slug="sous-vide-shrimp",
+        num_ingredients=5,
+        num_steps=0,
+    ),
+    RecipeSiteTestCase(
+        url="https://www.bonappetit.com/recipe/detroit-style-pepperoni-pizza",
+        expected_slug="detroit-style-pepperoni-pizza",
+        num_ingredients=8,
+        num_steps=5,
+    ),
+]
+
+
+@pytest.mark.parametrize("recipe_test_data", test_cases)
+def test_recipe_parser(recipe_test_data: RecipeSiteTestCase):
+    recipe = scraper.create_from_url(recipe_test_data.url)
+
+    assert recipe.slug == recipe_test_data.expected_slug
+    assert len(recipe.recipe_instructions) == recipe_test_data.num_steps
+    assert len(recipe.recipe_ingredient) == recipe_test_data.num_ingredients
+    assert recipe.org_url == recipe_test_data.url