refactor/scrapper (#175)

* API Endpoint

* shopping list added to the UI

* fixed category sidebar on mobile

* fix category sidebar hidden all the time

* adjust mobile view on times

* remove console.logs

* actually remove console.logs

* Fixed varying card height on smaller screens

* change style of meal planner categories

* bug/fixed open search on '/' when on input

* Improved import summary dialog

* url validation

* refactor/split recipe scraper into separate files

Co-authored-by: hay-kot <hay-kot@pm.me>
This commit is contained in:
Hayden 2021-02-20 13:58:06 -09:00 committed by GitHub
commit a48547126a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 593 additions and 295 deletions

View file

@ -63,7 +63,7 @@ export default {
}, },
created() { created() {
window.addEventListener("keyup", e => { window.addEventListener("keyup", e => {
if (e.key == "/") { if (e.key == "/" && !document.activeElement.id.startsWith('input') ) {
this.search = !this.search; this.search = !this.search;
} }
}); });

View file

@ -8,9 +8,9 @@ const backupURLs = {
// Backup // Backup
available: `${backupBase}available`, available: `${backupBase}available`,
createBackup: `${backupBase}export/database`, createBackup: `${backupBase}export/database`,
importBackup: (fileName) => `${backupBase}${fileName}/import`, importBackup: fileName => `${backupBase}${fileName}/import`,
deleteBackup: (fileName) => `${backupBase}${fileName}/delete`, deleteBackup: fileName => `${backupBase}${fileName}/delete`,
downloadBackup: (fileName) => `${backupBase}${fileName}/download`, downloadBackup: fileName => `${backupBase}${fileName}/download`,
}; };
export default { export default {

View file

@ -69,10 +69,9 @@ export default {
this.$emit("loading"); this.$emit("loading");
let response = await api.backups.import(data.name, data); let response = await api.backups.import(data.name, data);
let failed = response.data.failed; let importData = response.data;
let succesful = response.data.successful;
this.$emit("finished", succesful, failed); this.$emit("finished", importData);
}, },
deleteBackup(data) { deleteBackup(data) {
this.$emit("loading"); this.$emit("loading");

View file

@ -0,0 +1,47 @@
<template>
  <div>
    <v-data-table
      dense
      :headers="dataHeaders"
      :items="dataSet"
      item-key="name"
      class="elevation-1 mt-2"
      show-expand
      :expanded.sync="expanded"
      :footer-props="{
        'items-per-page-options': [100, 200, 300, 400, -1],
      }"
      :items-per-page="100"
    >
      <!-- Color the status cell and translate the boolean to a label -->
      <template v-slot:item.status="{ item }">
        <div :class="item.status ? 'success--text' : 'error--text'">
          {{ item.status ? "Imported" : "Failed" }}
        </div>
      </template>
      <!-- Expanded row shows the exception text for failed imports -->
      <template v-slot:expanded-item="{ headers, item }">
        <td :colspan="headers.length">
          <div class="ma-2">
            {{ item.exception }}
          </div>
        </td>
      </template>
    </v-data-table>
  </div>
</template>

<script>
/**
 * Generic import-report table: rows of { name, status, exception } with an
 * expandable row revealing the exception text for failed items.
 */
export default {
  props: {
    dataSet: Array, // report rows to display
    dataHeaders: Array, // v-data-table header definitions
  },
  data: () => ({
    // Rows currently expanded; kept in sync with the table via :expanded.sync.
    // (Removed unused `singleExpand` - it was never bound in the template.)
    expanded: [],
  }),
};
</script>

<style>
</style>

View file

@ -0,0 +1,152 @@
<template>
  <div class="text-center">
    <v-dialog v-model="dialog" width="70%">
      <v-card>
        <v-card-title> Import Summary </v-card-title>
        <v-card-text>
          <!-- Headline success/failure counts for each import category -->
          <v-row class="mb-n9">
            <v-card flat>
              <v-card-text>
                <div>
                  <h3>Recipes</h3>
                </div>
                <div class="success--text">
                  Success: {{ recipeNumbers.success }}
                </div>
                <div class="error--text">
                  Failed: {{ recipeNumbers.failure }}
                </div>
              </v-card-text>
            </v-card>
            <v-card flat>
              <v-card-text>
                <div>
                  <h3>Themes</h3>
                </div>
                <div class="success--text">
                  Success: {{ themeNumbers.success }}
                </div>
                <div class="error--text">
                  Failed: {{ themeNumbers.failure }}
                </div>
              </v-card-text>
            </v-card>
            <v-card flat>
              <v-card-text>
                <div>
                  <h3>Settings</h3>
                </div>
                <div class="success--text">
                  Success: {{ settingsNumbers.success }}
                </div>
                <div class="error--text">
                  Failed: {{ settingsNumbers.failure }}
                </div>
              </v-card-text>
            </v-card>
          </v-row>
        </v-card-text>
        <!-- Per-item detail tables, one tab per category -->
        <v-tabs v-model="tab">
          <v-tab>Recipes</v-tab>
          <v-tab>Themes</v-tab>
          <v-tab>Settings</v-tab>
        </v-tabs>
        <v-tabs-items v-model="tab">
          <v-tab-item>
            <v-card flat>
              <DataTable :data-headers="recipeHeaders" :data-set="recipeData" />
            </v-card>
          </v-tab-item>
          <v-tab-item>
            <v-card>
              <DataTable :data-headers="recipeHeaders" :data-set="themeData" />
            </v-card>
          </v-tab-item>
          <v-tab-item>
            <v-card>
              <DataTable :data-headers="recipeHeaders" :data-set="settingsData" />
            </v-card>
          </v-tab-item>
        </v-tabs-items>
      </v-card>
    </v-dialog>
  </div>
</template>

<script>
import DataTable from "./DataTable";

/**
 * Modal dialog summarizing a backup import: success/failure counts for
 * recipes, themes, and settings, plus per-item detail tables.
 * Opened imperatively by the parent via this.$refs.<ref>.open(importData).
 */
export default {
  components: {
    DataTable,
  },
  data: () => ({
    tab: null,
    dialog: false,
    recipeData: [],
    themeData: [],
    settingsData: [],
    // Shared by all three detail tables - the columns are identical.
    // (Removed unused `allDataTables` data property.)
    recipeHeaders: [
      {
        text: "Status",
        value: "status",
      },
      {
        text: "Name",
        align: "start",
        sortable: true,
        value: "name",
      },
      { text: "Exception", value: "data-table-expand", align: "center" },
    ],
  }),
  computed: {
    recipeNumbers() {
      return this.tallyStatus(this.recipeData);
    },
    themeNumbers() {
      return this.tallyStatus(this.themeData);
    },
    settingsNumbers() {
      return this.tallyStatus(this.settingsData);
    },
  },
  methods: {
    // Count how many report rows succeeded / failed.
    // Replaces three copy-pasted computed bodies with one shared helper.
    tallyStatus(dataSet) {
      let numbers = { success: 0, failure: 0 };
      dataSet.forEach(element => {
        if (element.status) {
          numbers.success++;
        } else numbers.failure++;
      });
      return numbers;
    },
    // Called by the parent through $refs; loads the report and shows the dialog.
    open(importData) {
      this.recipeData = importData.recipeImports;
      this.themeData = importData.themeReport;
      this.settingsData = importData.settingsReport;
      this.dialog = true;
    },
  },
};
</script>

<style>
</style>

View file

@ -41,6 +41,7 @@
:failed-header="$t('settings.backup.failed-imports')" :failed-header="$t('settings.backup.failed-imports')"
:failed="failedImports" :failed="failedImports"
/> />
<ImportSummaryDialog ref="report" :import-data="importData" />
</v-card-text> </v-card-text>
</v-card> </v-card>
</template> </template>
@ -48,6 +49,7 @@
<script> <script>
import api from "@/api"; import api from "@/api";
import SuccessFailureAlert from "../../UI/SuccessFailureAlert"; import SuccessFailureAlert from "../../UI/SuccessFailureAlert";
import ImportSummaryDialog from "./ImportSummaryDialog";
import UploadBtn from "../../UI/UploadBtn"; import UploadBtn from "../../UI/UploadBtn";
import AvailableBackupCard from "./AvailableBackupCard"; import AvailableBackupCard from "./AvailableBackupCard";
import NewBackupCard from "./NewBackupCard"; import NewBackupCard from "./NewBackupCard";
@ -58,6 +60,7 @@ export default {
UploadBtn, UploadBtn,
AvailableBackupCard, AvailableBackupCard,
NewBackupCard, NewBackupCard,
ImportSummaryDialog,
}, },
data() { data() {
return { return {
@ -65,6 +68,7 @@ export default {
successfulImports: [], successfulImports: [],
backupLoading: false, backupLoading: false,
availableBackups: [], availableBackups: [],
importData: [],
}; };
}, },
mounted() { mounted() {
@ -87,12 +91,10 @@ export default {
this.backupLoading = false; this.backupLoading = false;
} }
}, },
processFinished(successful = null, failed = null) { processFinished(data) {
this.getAvailableBackups(); this.getAvailableBackups();
this.backupLoading = false; this.backupLoading = false;
this.successfulImports = successful; this.$refs.report.open(data);
this.failedImports = failed;
this.$refs.report.open();
}, },
}, },
}; };

View file

@ -7,10 +7,13 @@
</v-card-title> </v-card-title>
<v-card-text> <v-card-text>
<v-form> <v-form ref="urlForm">
<v-text-field <v-text-field
v-model="recipeURL" v-model="recipeURL"
:label="$t('new-recipe.recipe-url')" :label="$t('new-recipe.recipe-url')"
required
validate-on-blur
:rules="[isValidWebUrl]"
></v-text-field> ></v-text-field>
</v-form> </v-form>
@ -64,18 +67,20 @@ export default {
methods: { methods: {
async createRecipe() { async createRecipe() {
this.processing = true; if (this.$refs.urlForm.validate()) {
let response = await api.recipes.createByURL(this.recipeURL); this.processing = true;
if (response.status !== 201) { let response = await api.recipes.createByURL(this.recipeURL);
this.error = true; if (response.status !== 201) {
this.processing = false; this.error = true;
return; this.processing = false;
} return;
}
this.addRecipe = false; this.addRecipe = false;
this.processing = false; this.processing = false;
this.recipeURL = ""; this.recipeURL = "";
this.$router.push(`/recipe/${response.data}`); this.$router.push(`/recipe/${response.data}`);
}
}, },
navCreate() { navCreate() {
@ -89,6 +94,10 @@ export default {
this.recipeURL = ""; this.recipeURL = "";
this.processing = false; this.processing = false;
}, },
isValidWebUrl(url) {
let regEx = /^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)$/gm;
return regEx.test(url) ? true : "Must be a Valid URL";
},
}, },
}; };
</script> </script>

View file

@ -0,0 +1,20 @@
from typing import Optional
from pydantic.main import BaseModel
class RecipeImport(BaseModel):
    """Per-recipe result row reported after a backup import."""

    name: Optional[str]  # display name; may be absent in older backups
    slug: str  # recipe slug, used as the import key
    status: bool  # True when the recipe was written to the database
    exception: Optional[str]  # stringified exception when status is False
class ThemeImport(BaseModel):
    """Per-theme result row reported after a backup import."""

    name: str  # theme name
    status: bool  # True when the theme was created in the database
    exception: Optional[str]  # stringified exception when status is False
class SettingsImport(BaseModel):
    """Result row for the site-settings portion of a backup import."""

    name: str  # settings entry name
    status: bool  # True when the settings update succeeded
    exception: Optional[str]  # stringified exception when status is False

View file

@ -1,10 +1,11 @@
from db.db_setup import generate_session from db.db_setup import generate_session
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query from fastapi import APIRouter, Depends, File, Form, HTTPException
from fastapi.logger import logger
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from models.recipe_models import RecipeURLIn from models.recipe_models import RecipeURLIn
from services.image_services import read_image, write_image from services.image_services import read_image, write_image
from services.recipe_services import Recipe from services.recipe_services import Recipe
from services.scrape_services import create_from_url from services.scraper.scraper import create_from_url
from sqlalchemy.orm.session import Session from sqlalchemy.orm.session import Session
from utils.snackbar import SnackResponse from utils.snackbar import SnackResponse
@ -27,6 +28,7 @@ def parse_recipe_url(url: RecipeURLIn, db: Session = Depends(generate_session)):
""" Takes in a URL and attempts to scrape data and load it into the database """ """ Takes in a URL and attempts to scrape data and load it into the database """
recipe = create_from_url(url.url) recipe = create_from_url(url.url)
recipe.save_to_db(db) recipe.save_to_db(db)
return recipe.slug return recipe.slug

View file

@ -1,12 +1,13 @@
import json import json
import shutil import shutil
import zipfile import zipfile
from logging import error from logging import error, exception
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from app_config import BACKUP_DIR, IMG_DIR, TEMP_DIR from app_config import BACKUP_DIR, IMG_DIR, TEMP_DIR
from db.database import db from db.database import db
from models.import_models import RecipeImport, SettingsImport, ThemeImport
from models.theme_models import SiteTheme from models.theme_models import SiteTheme
from services.recipe_services import Recipe from services.recipe_services import Recipe
from services.settings_services import SiteSettings from services.settings_services import SiteSettings
@ -57,23 +58,29 @@ class ImportDatabase:
raise Exception("Import file does not exist") raise Exception("Import file does not exist")
def run(self): def run(self):
report = {} recipe_report = []
settings_report = []
theme_report = []
if self.imp_recipes: if self.imp_recipes:
report = self.import_recipes() recipe_report = self.import_recipes()
if self.imp_settings: if self.imp_settings:
self.import_settings() settings_report = self.import_settings()
if self.imp_themes: if self.imp_themes:
self.import_themes() theme_report = self.import_themes()
self.clean_up() self.clean_up()
return report if report else None return {
"recipeImports": recipe_report,
"settingsReport": settings_report,
"themeReport": theme_report,
}
def import_recipes(self): def import_recipes(self):
recipe_dir: Path = self.import_dir.joinpath("recipes") recipe_dir: Path = self.import_dir.joinpath("recipes")
imports = []
successful_imports = [] successful_imports = []
failed_imports = []
for recipe in recipe_dir.glob("*.json"): for recipe in recipe_dir.glob("*.json"):
with open(recipe, "r") as f: with open(recipe, "r") as f:
@ -82,16 +89,27 @@ class ImportDatabase:
try: try:
recipe_obj = Recipe(**recipe_dict) recipe_obj = Recipe(**recipe_dict)
recipe_obj.save_to_db(self.session) recipe_obj.save_to_db(self.session)
import_status = RecipeImport(
name=recipe_obj.name, slug=recipe_obj.slug, status=True
)
imports.append(import_status)
successful_imports.append(recipe.stem) successful_imports.append(recipe.stem)
logger.info(f"Imported: {recipe.stem}") logger.info(f"Imported: {recipe.stem}")
except Exception as inst: except Exception as inst:
logger.error(inst) logger.error(inst)
logger.info(f"Failed Import: {recipe.stem}") logger.info(f"Failed Import: {recipe.stem}")
failed_imports.append(recipe.stem) import_status = RecipeImport(
name=recipe.stem,
slug=recipe.stem,
status=False,
exception=str(inst),
)
imports.append(import_status)
self._import_images(successful_imports) self._import_images(successful_imports)
return {"successful": successful_imports, "failed": failed_imports} return imports
@staticmethod @staticmethod
def _recipe_migration(recipe_dict: dict) -> dict: def _recipe_migration(recipe_dict: dict) -> dict:
@ -130,7 +148,7 @@ class ImportDatabase:
def import_themes(self): def import_themes(self):
themes_file = self.import_dir.joinpath("themes", "themes.json") themes_file = self.import_dir.joinpath("themes", "themes.json")
theme_imports = []
with open(themes_file, "r") as f: with open(themes_file, "r") as f:
themes: list[dict] = json.loads(f.read()) themes: list[dict] = json.loads(f.read())
for theme in themes: for theme in themes:
@ -138,17 +156,38 @@ class ImportDatabase:
continue continue
new_theme = SiteTheme(**theme) new_theme = SiteTheme(**theme)
try: try:
db.themes.create(self.session, new_theme.dict()) db.themes.create(self.session, new_theme.dict())
except: theme_imports.append(ThemeImport(name=new_theme.name, status=True))
except Exception as inst:
logger.info(f"Unable Import Theme {new_theme.name}") logger.info(f"Unable Import Theme {new_theme.name}")
theme_imports.append(
ThemeImport(name=new_theme.name, status=False, exception=str(inst))
)
return theme_imports
def import_settings(self): def import_settings(self):
settings_file = self.import_dir.joinpath("settings", "settings.json") settings_file = self.import_dir.joinpath("settings", "settings.json")
settings_imports = []
with open(settings_file, "r") as f: with open(settings_file, "r") as f:
settings: dict = json.loads(f.read()) settings: dict = json.loads(f.read())
db.settings.update(self.session, settings.get("name"), settings) name = settings.get("name")
try:
db.settings.update(self.session, name, settings)
import_status = SettingsImport(name=name, status=True)
except Exception as inst:
import_status = SettingsImport(
name=name, status=False, exception=str(inst)
)
settings_imports.append(import_status)
return settings_imports
def clean_up(self): def clean_up(self):
shutil.rmtree(TEMP_DIR) shutil.rmtree(TEMP_DIR)

View file

@ -6,7 +6,7 @@ from pathlib import Path
from app_config import IMG_DIR, MIGRATION_DIR, TEMP_DIR from app_config import IMG_DIR, MIGRATION_DIR, TEMP_DIR
from services.recipe_services import Recipe from services.recipe_services import Recipe
from services.scrape_services import normalize_data, process_recipe_data from services.scraper.cleaner import Cleaner
from app_config import IMG_DIR, TEMP_DIR from app_config import IMG_DIR, TEMP_DIR
@ -34,8 +34,7 @@ def import_recipes(recipe_dir: Path) -> Recipe:
with open(recipe_file, "r") as f: with open(recipe_file, "r") as f:
recipe_dict = json.loads(f.read()) recipe_dict = json.loads(f.read())
recipe_dict = process_recipe_data(recipe_dict) recipe_data = Cleaner.clean(recipe_dict)
recipe_data = normalize_data(recipe_dict)
image_name = None image_name = None
if image: if image:

View file

@ -38,8 +38,8 @@ class Recipe(BaseModel):
tags: Optional[List[str]] = [] tags: Optional[List[str]] = []
dateAdded: Optional[datetime.date] dateAdded: Optional[datetime.date]
notes: Optional[List[RecipeNote]] = [] notes: Optional[List[RecipeNote]] = []
rating: Optional[int] rating: Optional[int] = 0
orgURL: Optional[str] orgURL: Optional[str] = ""
extras: Optional[dict] = {} extras: Optional[dict] = {}
class Config: class Config:

View file

@ -1,246 +0,0 @@
import html
import json
import re
from typing import List, Tuple
import extruct
import requests
import scrape_schema_recipe
from app_config import DEBUG_DIR
from slugify import slugify
from utils.logger import logger
from w3lib.html import get_base_url
from services.image_services import scrape_image
from services.recipe_services import Recipe
LAST_JSON = DEBUG_DIR.joinpath("last_recipe.json")
def cleanhtml(raw_html):
cleanr = re.compile("<.*?>")
cleantext = re.sub(cleanr, "", raw_html)
return cleantext
def normalize_image_url(image) -> str:
if type(image) == list:
return image[0]
elif type(image) == dict:
return image["url"]
elif type(image) == str:
return image
else:
raise Exception(f"Unrecognised image URL format: {image}")
def normalize_instructions(instructions) -> List[dict]:
if not instructions:
return []
# One long string split by (possibly multiple) new lines
if type(instructions) == str:
return [
{"text": normalize_instruction(line)}
for line in instructions.splitlines()
if line
]
# Plain strings in a list
elif type(instructions) == list and type(instructions[0]) == str:
return [{"text": normalize_instruction(step)} for step in instructions]
# Dictionaries (let's assume it's a HowToStep) in a list
elif type(instructions) == list and type(instructions[0]) == dict:
try:
# If HowToStep is under HowToSection
sectionSteps = []
for step in instructions:
if step["@type"] == "HowToSection":
for item in step["itemListElement"]:
sectionSteps.append(item)
if len(sectionSteps) > 0:
return [
{"text": normalize_instruction(step["text"])}
for step in sectionSteps
if step["@type"] == "HowToStep"
]
return [
{"text": normalize_instruction(step["text"])}
for step in instructions
if step["@type"] == "HowToStep"
]
except Exception as e:
# Not "@type", try "type"
return [
{"text": normalize_instruction(step["properties"]["text"])}
for step in instructions
if step["type"].find("HowToStep") > -1
]
else:
raise Exception(f"Unrecognised instruction format: {instructions}")
def normalize_instruction(line) -> str:
l = cleanhtml(line.strip())
# Some sites erroneously escape their strings on multiple levels
while not l == (l := html.unescape(l)):
pass
return l
def normalize_ingredient(ingredients: list) -> str:
return [cleanhtml(html.unescape(ing)) for ing in ingredients]
def normalize_yield(yld) -> str:
if type(yld) == list:
return yld[-1]
else:
return yld
def normalize_time(time_entry) -> str:
if type(time_entry) == type(None):
return None
elif type(time_entry) != str:
return str(time_entry)
def normalize_data(recipe_data: dict) -> dict:
recipe_data["totalTime"] = normalize_time(recipe_data.get("totalTime"))
recipe_data["description"] = cleanhtml(recipe_data.get("description", ""))
recipe_data["prepTime"] = normalize_time(recipe_data.get("prepTime"))
recipe_data["performTime"] = normalize_time(recipe_data.get("performTime"))
recipe_data["recipeYield"] = normalize_yield(recipe_data.get("recipeYield"))
recipe_data["recipeIngredient"] = normalize_ingredient(
recipe_data.get("recipeIngredient")
)
recipe_data["recipeInstructions"] = normalize_instructions(
recipe_data["recipeInstructions"]
)
recipe_data["image"] = normalize_image_url(recipe_data["image"])
return recipe_data
def process_recipe_data(new_recipe: dict, url=None) -> dict:
slug = slugify(new_recipe["name"])
mealie_tags = {
"slug": slug,
"orgURL": url,
"categories": [],
"tags": [],
"dateAdded": None,
"notes": [],
"extras": [],
}
new_recipe.update(mealie_tags)
return new_recipe
def extract_recipe_from_html(html: str, url: str) -> dict:
try:
scraped_recipes: List[dict] = scrape_schema_recipe.loads(
html, python_objects=True
)
dump_last_json(scraped_recipes)
if not scraped_recipes:
scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(
url, python_objects=True
)
except Exception as e:
# trying without python_objects
scraped_recipes: List[dict] = scrape_schema_recipe.loads(html)
dump_last_json(scraped_recipes)
if not scraped_recipes:
scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url)
if scraped_recipes:
new_recipe: dict = scraped_recipes[0]
logger.info(f"Recipe Scraped From Web: {new_recipe}")
if not new_recipe:
return "fail" # TODO: Return Better Error Here
new_recipe = process_recipe_data(new_recipe, url=url)
new_recipe = normalize_data(new_recipe)
else:
new_recipe = basic_recipe_from_opengraph(html, url)
logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
return new_recipe
def download_image_for_recipe(recipe: dict) -> dict:
try:
img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
recipe["image"] = img_path.name
except:
recipe["image"] = "no image"
return recipe
def og_field(properties: dict, field_name: str) -> str:
return next((val for name, val in properties if name == field_name), None)
def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
return list({val for name, val in properties if name == field_name})
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
base_url = get_base_url(html, url)
data = extruct.extract(html, base_url=base_url)
try:
properties = data["opengraph"][0]["properties"]
except:
return
return {
"name": og_field(properties, "og:title"),
"description": og_field(properties, "og:description"),
"image": og_field(properties, "og:image"),
"recipeYield": "",
# FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
"recipeIngredient": ["Could not detect ingredients"],
# FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity.
"recipeInstructions": [{"text": "Could not detect instructions"}],
"slug": slugify(og_field(properties, "og:title")),
"orgURL": og_field(properties, "og:url"),
"categories": [],
"tags": og_fields(properties, "og:article:tag"),
"dateAdded": None,
"notes": [],
"extras": [],
}
def dump_last_json(recipe_data: dict):
with open(LAST_JSON, "w") as f:
f.write(json.dumps(recipe_data, indent=4, default=str))
return
def process_recipe_url(url: str) -> dict:
r = requests.get(url)
new_recipe = extract_recipe_from_html(r.text, url)
new_recipe = download_image_for_recipe(new_recipe)
return new_recipe
def create_from_url(url: str) -> Recipe:
recipe_data = process_recipe_url(url)
recipe = Recipe(**recipe_data)
return recipe

View file

@ -0,0 +1,151 @@
import html
import re
from typing import List
from slugify import slugify
class Cleaner:
    """A namespace of utility functions that clean recipe data extracted
    from a url and return a dictionary that is ready for import into
    the database. Cleaner.clean is the main entrypoint.
    """

    @staticmethod
    def clean(recipe_data: dict, url=None) -> dict:
        """Main entrypoint to clean a recipe extracted from the web
        and format the data into an acceptable format for the database.

        Args:
            recipe_data (dict): raw recipe dictionary
            url (str, optional): source URL; stored under "orgURL"

        Returns:
            dict: cleaned recipe dictionary (the same dict, mutated in place)
        """
        # BUG FIX: removed a leftover debug print() that also sat *before*
        # the docstring, turning the docstring into a dead expression.
        recipe_data["totalTime"] = Cleaner.time(recipe_data.get("totalTime"))
        recipe_data["description"] = Cleaner.html(recipe_data.get("description", ""))
        recipe_data["prepTime"] = Cleaner.time(recipe_data.get("prepTime"))
        recipe_data["performTime"] = Cleaner.time(recipe_data.get("performTime"))
        recipe_data["recipeYield"] = Cleaner.yield_amount(
            recipe_data.get("recipeYield")
        )
        recipe_data["recipeIngredient"] = Cleaner.ingredient(
            recipe_data.get("recipeIngredient")
        )
        recipe_data["recipeInstructions"] = Cleaner.instructions(
            recipe_data["recipeInstructions"]
        )
        recipe_data["image"] = Cleaner.image(recipe_data["image"])
        recipe_data["slug"] = slugify(recipe_data["name"])
        recipe_data["orgURL"] = url

        return recipe_data

    @staticmethod
    def html(raw_html):
        """Strip all HTML tags from a string (non-greedy tag match)."""
        cleanr = re.compile("<.*?>")
        cleantext = re.sub(cleanr, "", raw_html)
        return cleantext

    @staticmethod
    def image(image) -> str:
        """Normalize the schema.org image field (list, dict, or str) to a URL string."""
        if type(image) == list:
            return image[0]
        elif type(image) == dict:
            return image["url"]
        elif type(image) == str:
            return image
        else:
            raise Exception(f"Unrecognised image URL format: {image}")

    @staticmethod
    def instructions(instructions) -> List[dict]:
        """Normalize the many shapes of schema.org recipeInstructions into
        a list of {"text": ...} dictionaries.

        Accepts: a newline-separated string, a list of strings, or a list
        of HowToStep / HowToSection dictionaries.
        """
        if not instructions:
            return []

        # One long string split by (possibly multiple) new lines
        if type(instructions) == str:
            return [
                {"text": Cleaner._instruction(line)}
                for line in instructions.splitlines()
                if line
            ]

        # Plain strings in a list
        elif type(instructions) == list and type(instructions[0]) == str:
            return [{"text": Cleaner._instruction(step)} for step in instructions]

        # Dictionaries (let's assume it's a HowToStep) in a list
        elif type(instructions) == list and type(instructions[0]) == dict:
            # Try List of Dictionary without "@type" or "type"
            if not instructions[0].get("@type", False) and not instructions[0].get(
                "type", False
            ):
                return [
                    {"text": Cleaner._instruction(step["text"])}
                    for step in instructions
                ]

            try:
                # If HowToStep is under HowToSection, flatten the sections first.
                # BUG FIX: the key was misspelled "itemListELement", which raised
                # KeyError and silently discarded all sectioned instructions.
                section_steps = []
                for step in instructions:
                    if step["@type"] == "HowToSection":
                        for item in step["itemListElement"]:
                            section_steps.append(item)

                if len(section_steps) > 0:
                    return [
                        {"text": Cleaner._instruction(step["text"])}
                        for step in section_steps
                        if step["@type"] == "HowToStep"
                    ]

                return [
                    {"text": Cleaner._instruction(step["text"])}
                    for step in instructions
                    if step["@type"] == "HowToStep"
                ]
            except Exception:
                # Not "@type", try "type"
                try:
                    return [
                        {"text": Cleaner._instruction(step["properties"]["text"])}
                        for step in instructions
                        if step["type"].find("HowToStep") > -1
                    ]
                except Exception:
                    pass
        else:
            raise Exception(f"Unrecognised instruction format: {instructions}")

    @staticmethod
    def _instruction(line) -> str:
        """Strip tags and repeatedly HTML-unescape one instruction line."""
        l = Cleaner.html(line.strip())
        # Some sites erroneously escape their strings on multiple levels
        while not l == (l := html.unescape(l)):
            pass
        return l

    @staticmethod
    def ingredient(ingredients: list) -> str:
        """Unescape and strip tags from every ingredient string."""
        return [Cleaner.html(html.unescape(ing)) for ing in ingredients]

    @staticmethod
    def yield_amount(yld) -> str:
        """Pick the last entry when recipeYield is a list (usually the servings count)."""
        if type(yld) == list:
            return yld[-1]
        else:
            return yld

    @staticmethod
    def time(time_entry) -> str:
        """Coerce a time entry (e.g. a timedelta) to a string; None passes through."""
        if time_entry is None:
            return None
        elif type(time_entry) != str:
            return str(time_entry)
        # BUG FIX: string inputs previously fell off the end of the function
        # and were silently replaced with None.
        return time_entry

View file

@ -0,0 +1,43 @@
from typing import Tuple
import extruct
from app_config import DEBUG_DIR
from slugify import slugify
from w3lib.html import get_base_url
LAST_JSON = DEBUG_DIR.joinpath("last_recipe.json")
def og_field(properties: dict, field_name: str) -> str:
    """Return the value of the first (name, value) property whose name
    matches field_name, or None when no such property exists."""
    for prop_name, prop_value in properties:
        if prop_name == field_name:
            return prop_value
    return None
def og_fields(properties: list[Tuple[str, str]], field_name: str) -> list[str]:
    """Collect the distinct values of every property matching field_name.

    Order is not guaranteed (values are deduplicated through a set).
    """
    distinct_values = {val for name, val in properties if name == field_name}
    return list(distinct_values)
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    """Fallback recipe builder using a page's OpenGraph metadata, for pages
    that expose no schema.org recipe data.

    Args:
        html (str): raw page HTML
        url (str): page URL, used to resolve the base url

    Returns:
        dict: a recipe-shaped dictionary, or None when the page exposes no
        OpenGraph properties.
    """
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    try:
        properties = data["opengraph"][0]["properties"]
    except (KeyError, IndexError):
        # BUG FIX: was a bare "except:" with an implicit return - narrowed so
        # unrelated errors surface, and the None return is now explicit.
        return None
    return {
        "name": og_field(properties, "og:title"),
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(og_field(properties, "og:title")),
        "orgURL": og_field(properties, "og:url"),
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }

View file

@ -0,0 +1,84 @@
import json
from typing import List
import requests
import scrape_schema_recipe
from app_config import DEBUG_DIR
from services.image_services import scrape_image
from services.recipe_services import Recipe
from services.scraper import open_graph
from services.scraper.cleaner import Cleaner
from utils.logger import logger
LAST_JSON = DEBUG_DIR.joinpath("last_recipe.json")
def create_from_url(url: str) -> Recipe:
    """Main entry point for generating a recipe from a URL. Pass in a URL and
    a Recipe object will be returned if successful.

    Args:
        url (str): a valid string representing a URL

    Returns:
        Recipe: Recipe Object
    """
    r = requests.get(url)
    new_recipe = extract_recipe_from_html(r.text, url)
    # BUG FIX: pass the source url through. The previous bare Cleaner.clean()
    # call defaulted url to None and reset "orgURL", losing the link back to
    # the original site that extract_recipe_from_html had just set.
    # NOTE(review): schema-scraped recipes are already cleaned inside
    # extract_recipe_from_html, so this pass is redundant for that path, but
    # it is still required for the opengraph fallback - confirm before removing.
    new_recipe = Cleaner.clean(new_recipe, url)
    new_recipe = download_image_for_recipe(new_recipe)

    recipe = Recipe(**new_recipe)

    return recipe
def extract_recipe_from_html(html: str, url: str) -> dict:
    """Extract raw recipe data from a page's HTML.

    Tries schema.org structured data first (with python_objects, then
    without, then re-fetching by URL), and falls back to OpenGraph
    metadata when no schema recipe is found.

    Returns:
        dict: raw recipe data (cleaned via Cleaner.clean on the schema
        path only), or the string "fail" when a scraped entry is empty.
    """
    try:
        # Prefer parsing the HTML we already fetched, converting ISO durations
        # etc. into Python objects.
        scraped_recipes: List[dict] = scrape_schema_recipe.loads(
            html, python_objects=True
        )
        dump_last_json(scraped_recipes)

        if not scraped_recipes:
            # Nothing found in the supplied HTML - let the library fetch the
            # URL itself (it may follow redirects / use different headers).
            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(
                url, python_objects=True
            )
    except Exception as e:
        # trying without python_objects
        scraped_recipes: List[dict] = scrape_schema_recipe.loads(html)
        dump_last_json(scraped_recipes)

        if not scraped_recipes:
            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url)

    if scraped_recipes:
        # Only the first recipe on the page is imported.
        new_recipe: dict = scraped_recipes[0]
        logger.info(f"Recipe Scraped From Web: {new_recipe}")

        if not new_recipe:
            return "fail"  # TODO: Return Better Error Here

        new_recipe = Cleaner.clean(new_recipe, url)
    else:
        # No structured recipe data - build a minimal recipe from OpenGraph tags.
        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")

    return new_recipe
def download_image_for_recipe(recipe: dict) -> dict:
    """Download the recipe's image and swap the remote URL for the local
    file name. Best-effort: any failure leaves the placeholder "no image".

    Args:
        recipe (dict): cleaned recipe dictionary with "image" and "slug" keys

    Returns:
        dict: the same dictionary, with "image" replaced in place
    """
    try:
        img_path = scrape_image(recipe.get("image"), recipe.get("slug"))
        recipe["image"] = img_path.name
    except Exception:
        # Deliberate best-effort: a missing image must not fail the import.
        # Narrowed from a bare "except:" which also swallowed
        # KeyboardInterrupt / SystemExit.
        recipe["image"] = "no image"

    return recipe
def dump_last_json(recipe_data: dict):
    """Write the most recent scrape result to the debug directory so the
    raw payload can be inspected after a failed import."""
    with open(LAST_JSON, "w") as f:
        json.dump(recipe_data, f, indent=4, default=str)

View file

@ -2,11 +2,8 @@ import json
import re import re
import pytest import pytest
from services.scrape_services import ( from services.scraper.cleaner import Cleaner
extract_recipe_from_html, from services.scraper.scraper import extract_recipe_from_html
normalize_data,
normalize_instructions,
)
from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45 # https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
@ -42,7 +39,7 @@ url_validation_regex = re.compile(
], ],
) )
def test_normalize_data(json_file, num_steps): def test_normalize_data(json_file, num_steps):
recipe_data = normalize_data(json.load(open(TEST_RAW_RECIPES.joinpath(json_file)))) recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
assert len(recipe_data["recipeInstructions"]) == num_steps assert len(recipe_data["recipeInstructions"]) == num_steps
@ -58,7 +55,7 @@ def test_normalize_data(json_file, num_steps):
], ],
) )
def test_normalize_instructions(instructions): def test_normalize_instructions(instructions):
assert normalize_instructions(instructions) == [ assert Cleaner.instructions(instructions) == [
{"text": "A"}, {"text": "A"},
{"text": "B"}, {"text": "B"},
{"text": "C"}, {"text": "C"},