From ce48ae61c71db587de7ba238bc274675d3d944fb Mon Sep 17 00:00:00 2001
From: dekvall <dkvldev@gmail.com>
Date: Sat, 30 Jan 2021 01:01:37 +0100
Subject: [PATCH] scraper: unescape html in instructions (#129)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some URLs erroneously deliver escaped HTML in their instructions;
sometimes they are even escaped on multiple levels, like here:

https://www.ica.se/recept/kladdig-kladdkaka-722982/

```
>>> normalize_instruction("S&amp;auml;tt ugnen p&amp;aring; 200&amp;deg;C.")
'Sätt ugnen på 200°C.'
```
---
 mealie/services/scrape_services.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/mealie/services/scrape_services.py b/mealie/services/scrape_services.py
index 1980384e2..8e70155af 100644
--- a/mealie/services/scrape_services.py
+++ b/mealie/services/scrape_services.py
@@ -5,6 +5,7 @@ from typing import List, Tuple
 import extruct
 import requests
 import scrape_schema_recipe
+import html
 from app_config import DEBUG_DIR
 from slugify import slugify
 from utils.logger import logger
@@ -32,17 +33,17 @@ def normalize_instructions(instructions) -> List[dict]:
     # One long string split by (possibly multiple) new lines
     if type(instructions) == str:
         return [
-            {"text": line.strip()} for line in filter(None, instructions.splitlines())
+            {"text": normalize_instruction(line)} for line in instructions.splitlines() if line
         ]
 
     # Plain strings in a list
     elif type(instructions) == list and type(instructions[0]) == str:
-        return [{"text": step.strip()} for step in instructions]
+        return [{"text": normalize_instruction(step)} for step in instructions]
 
     # Dictionaries (let's assume it's a HowToStep) in a list
     elif type(instructions) == list and type(instructions[0]) == dict:
         return [
-            {"text": step["text"].strip()}
+            {"text": normalize_instruction(step["text"])}
             for step in instructions
             if step["@type"] == "HowToStep"
         ]
@@ -51,6 +52,14 @@ def normalize_instructions(instructions) -> List[dict]:
         raise Exception(f"Unrecognised instruction format: {instructions}")
 
 
+def normalize_instruction(line) -> str:
+    l = line.strip()
+    # Some sites escape HTML entities several times over; unescape until the text stops changing
+    while not l == (l := html.unescape(l)):
+        pass
+    return l
+
+
 def normalize_yield(yld) -> str:
     if type(yld) == list:
         return yld[-1]