scraper: unescape html in instructions (#129)

Some URLs erroneously deliver escaped HTML in their instructions; sometimes the text is even escaped on multiple levels, as on this page: https://www.ica.se/recept/kladdig-kladdkaka-722982/ ``` >>> normalize_instruction("S&auml;tt ugnen p&aring; 200&deg;C.") 'Sätt ugnen på 200°C.' ```
2025-08-22 14:33:33 -07:00 · 2021-01-30 01:01:37 +01:00 · 2021-01-30 01:01:37 +01:00 · ce48ae61c7
commit ce48ae61c7
parent c746f7f4f8
1 changed files with 12 additions and 3 deletions
--- a/mealie/services/scrape_services.py
+++ b/mealie/services/scrape_services.py
@ -5,6 +5,7 @@ from typing import List, Tuple
 import extruct
 import requests
 import scrape_schema_recipe
 import html
 from app_config import DEBUG_DIR
 from slugify import slugify
 from utils.logger import logger
@ -32,17 +33,17 @@ def normalize_instructions(instructions) -> List[dict]:
    # One long string split by (possibly multiple) new lines
    if type(instructions) == str:
        return [
-            {"text": line.strip()} for line in filter(None, instructions.splitlines())
+            {"text": normalize_instruction(line)} for line in instructions.splitlines() if line
        ]
    # Plain strings in a list
    elif type(instructions) == list and type(instructions[0]) == str:
-        return [{"text": step.strip()} for step in instructions]
+        return [{"text": normalize_instruction(step)} for step in instructions]
    # Dictionaries (let's assume it's a HowToStep) in a list
    elif type(instructions) == list and type(instructions[0]) == dict:
        return [
-            {"text": step["text"].strip()}
+            {"text": normalize_instruction(step["text"])}
            for step in instructions
            if step["@type"] == "HowToStep"
        ]
@ -51,6 +52,14 @@ def normalize_instructions(instructions) -> List[dict]:
        raise Exception(f"Unrecognised instruction format: {instructions}")
 def normalize_instruction(line) -> str:
    """Return one instruction line, trimmed and with HTML entities decoded.

    Some sites erroneously escape their strings on multiple levels
    (e.g. ``&amp;auml;`` instead of ``&auml;``), so the text is unescaped
    repeatedly until a pass leaves it unchanged.

    Args:
        line: The raw instruction text (a ``str``).

    Returns:
        The fully unescaped text without leading or trailing whitespace.

    >>> normalize_instruction("S&auml;tt ugnen p&aring; 200&deg;C.")
    'Sätt ugnen på 200°C.'
    """
    text = line.strip()
    # Unescape until a fixed point is reached to undo multi-level escaping.
    while True:
        unescaped = html.unescape(text)
        if unescaped == text:
            break
        text = unescaped
    # Strip again: entities such as "&nbsp;" or "&#10;" at either end decode
    # to whitespace that the initial strip could not see.
    return text.strip()
 def normalize_yield(yld) -> str:
    if type(yld) == list:
        return yld[-1]