From ce48ae61c71db587de7ba238bc274675d3d944fb Mon Sep 17 00:00:00 2001 From: dekvall Date: Sat, 30 Jan 2021 01:01:37 +0100 Subject: [PATCH] scraper: unescape html in instructions (#129) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some urls erroneously deliver escaped html in their instructions; sometimes they are even escaped on multiple levels, like here: https://www.ica.se/recept/kladdig-kladdkaka-722982/ ``` >>> normalize_instruction("Sätt ugnen på 200°C.") 'Sätt ugnen på 200°C.' ``` --- mealie/services/scrape_services.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mealie/services/scrape_services.py b/mealie/services/scrape_services.py index 1980384e2..8e70155af 100644 --- a/mealie/services/scrape_services.py +++ b/mealie/services/scrape_services.py @@ -5,6 +5,7 @@ from typing import List, Tuple import extruct import requests import scrape_schema_recipe +import html from app_config import DEBUG_DIR from slugify import slugify from utils.logger import logger @@ -32,17 +33,17 @@ def normalize_instructions(instructions) -> List[dict]: # One long string split by (possibly multiple) new lines if type(instructions) == str: return [ - {"text": line.strip()} for line in filter(None, instructions.splitlines()) + {"text": normalize_instruction(line)} for line in instructions.splitlines() if line ] # Plain strings in a list elif type(instructions) == list and type(instructions[0]) == str: - return [{"text": step.strip()} for step in instructions] + return [{"text": normalize_instruction(step)} for step in instructions] # Dictionaries (let's assume it's a HowToStep) in a list elif type(instructions) == list and type(instructions[0]) == dict: return [ - {"text": step["text"].strip()} + {"text": normalize_instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep" ] @@ -51,6 +52,14 @@ def normalize_instructions(instructions) -> List[dict]: raise Exception(f"Unrecognised 
instruction format: {instructions}") +def normalize_instruction(line) -> str: + l = line.strip() + # Some sites erroneously escape their strings on multiple levels + while not l == (l := html.unescape(l)): + pass + return l + + def normalize_yield(yld) -> str: if type(yld) == list: return yld[-1]