scraper: unescape html in instructions (#129)

Some URLs erroneously deliver escaped HTML in their instructions;
sometimes they are even escaped on multiple levels, like here:

https://www.ica.se/recept/kladdig-kladdkaka-722982/

```
>>> normalize_instruction("S&amp;auml;tt ugnen p&amp;aring; 200&amp;deg;C.")
'Sätt ugnen på 200°C.'
```
This commit is contained in:
dekvall 2021-01-30 01:01:37 +01:00 committed by GitHub
commit ce48ae61c7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -5,6 +5,7 @@ from typing import List, Tuple
import extruct import extruct
import requests import requests
import scrape_schema_recipe import scrape_schema_recipe
import html
from app_config import DEBUG_DIR from app_config import DEBUG_DIR
from slugify import slugify from slugify import slugify
from utils.logger import logger from utils.logger import logger
@ -32,17 +33,17 @@ def normalize_instructions(instructions) -> List[dict]:
# One long string split by (possibly multiple) new lines # One long string split by (possibly multiple) new lines
if type(instructions) == str: if type(instructions) == str:
return [ return [
{"text": line.strip()} for line in filter(None, instructions.splitlines()) {"text": normalize_instruction(line)} for line in instructions.splitlines() if line
] ]
# Plain strings in a list # Plain strings in a list
elif type(instructions) == list and type(instructions[0]) == str: elif type(instructions) == list and type(instructions[0]) == str:
return [{"text": step.strip()} for step in instructions] return [{"text": normalize_instruction(step)} for step in instructions]
# Dictionaries (let's assume it's a HowToStep) in a list # Dictionaries (let's assume it's a HowToStep) in a list
elif type(instructions) == list and type(instructions[0]) == dict: elif type(instructions) == list and type(instructions[0]) == dict:
return [ return [
{"text": step["text"].strip()} {"text": normalize_instruction(step["text"])}
for step in instructions for step in instructions
if step["@type"] == "HowToStep" if step["@type"] == "HowToStep"
] ]
@ -51,6 +52,14 @@ def normalize_instructions(instructions) -> List[dict]:
raise Exception(f"Unrecognised instruction format: {instructions}") raise Exception(f"Unrecognised instruction format: {instructions}")
def normalize_instruction(line) -> str:
    """Return a single instruction step as clean, human-readable text.

    HTML character references in ``line`` are decoded repeatedly until the
    text stops changing, so input that was escaped several times over
    (e.g. ``&amp;amp;lt;``) is fully decoded. Leading and trailing
    whitespace is removed afterwards.

    Args:
        line: The raw instruction text.

    Returns:
        The decoded text with surrounding whitespace stripped.
    """
    text = line
    # Some sites erroneously escape their strings on multiple levels, so keep
    # decoding until a pass leaves the text unchanged.
    while (decoded := html.unescape(text)) != text:
        text = decoded
    # Strip last: decoding can itself produce edge whitespace (e.g. &nbsp;)
    # that a strip performed before decoding would miss.
    return text.strip()
def normalize_yield(yld) -> str: def normalize_yield(yld) -> str:
if type(yld) == list: if type(yld) == list:
return yld[-1] return yld[-1]