mirror of
https://github.com/hay-kot/mealie.git
synced 2025-08-22 14:33:33 -07:00
scraper: unescape html in instructions (#129)
Some URLs erroneously deliver escaped HTML in their instructions; sometimes the text is even escaped on multiple levels, as here: https://www.ica.se/recept/kladdig-kladdkaka-722982/
```
>>> normalize_instruction("Sätt ugnen på 200&amp;deg;C.")
'Sätt ugnen på 200°C.'
```
This commit is contained in:
parent
c746f7f4f8
commit
ce48ae61c7
1 changed files with 12 additions and 3 deletions
|
@ -5,6 +5,7 @@ from typing import List, Tuple
|
||||||
import extruct
|
import extruct
|
||||||
import requests
|
import requests
|
||||||
import scrape_schema_recipe
|
import scrape_schema_recipe
|
||||||
|
import html
|
||||||
from app_config import DEBUG_DIR
|
from app_config import DEBUG_DIR
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
|
@ -32,17 +33,17 @@ def normalize_instructions(instructions) -> List[dict]:
|
||||||
# One long string split by (possibly multiple) new lines
|
# One long string split by (possibly multiple) new lines
|
||||||
if type(instructions) == str:
|
if type(instructions) == str:
|
||||||
return [
|
return [
|
||||||
{"text": line.strip()} for line in filter(None, instructions.splitlines())
|
{"text": normalize_instruction(line)} for line in instructions.splitlines() if line
|
||||||
]
|
]
|
||||||
|
|
||||||
# Plain strings in a list
|
# Plain strings in a list
|
||||||
elif type(instructions) == list and type(instructions[0]) == str:
|
elif type(instructions) == list and type(instructions[0]) == str:
|
||||||
return [{"text": step.strip()} for step in instructions]
|
return [{"text": normalize_instruction(step)} for step in instructions]
|
||||||
|
|
||||||
# Dictionaries (let's assume it's a HowToStep) in a list
|
# Dictionaries (let's assume it's a HowToStep) in a list
|
||||||
elif type(instructions) == list and type(instructions[0]) == dict:
|
elif type(instructions) == list and type(instructions[0]) == dict:
|
||||||
return [
|
return [
|
||||||
{"text": step["text"].strip()}
|
{"text": normalize_instruction(step["text"])}
|
||||||
for step in instructions
|
for step in instructions
|
||||||
if step["@type"] == "HowToStep"
|
if step["@type"] == "HowToStep"
|
||||||
]
|
]
|
||||||
|
@ -51,6 +52,14 @@ def normalize_instructions(instructions) -> List[dict]:
|
||||||
raise Exception(f"Unrecognised instruction format: {instructions}")
|
raise Exception(f"Unrecognised instruction format: {instructions}")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_instruction(line) -> str:
    """Return a single instruction line with HTML entities decoded and trimmed.

    Some sites erroneously escape their strings on multiple levels
    (e.g. ``&amp;amp;deg;``), so the text is unescaped repeatedly until a
    pass no longer changes it.

    Surrounding whitespace is stripped *after* unescaping, so whitespace that
    was itself encoded as an entity (``&nbsp;``, ``&#10;``, ...) is removed
    as well instead of leaking into the stored instruction.

    Args:
        line: The raw instruction text.

    Returns:
        The fully unescaped text without leading or trailing whitespace.
    """
    text = line
    while True:
        unescaped = html.unescape(text)
        if unescaped == text:
            # Fixed point reached: no entities left to decode.
            break
        text = unescaped
    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def normalize_yield(yld) -> str:
|
def normalize_yield(yld) -> str:
|
||||||
if type(yld) == list:
|
if type(yld) == list:
|
||||||
return yld[-1]
|
return yld[-1]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue