fix scrape recipe error

- fixes #138
- fixes #145
This commit is contained in:
Weng Tad 2021-02-13 17:06:12 +08:00
commit a44663b61e

View file

@@ -40,11 +40,7 @@ def normalize_instructions(instructions) -> List[dict]:
# One long string split by (possibly multiple) new lines # One long string split by (possibly multiple) new lines
if type(instructions) == str: if type(instructions) == str:
return [ return [{"text": normalize_instruction(line)} for line in instructions.splitlines() if line]
{"text": normalize_instruction(line)}
for line in instructions.splitlines()
if line
]
# Plain strings in a list # Plain strings in a list
elif type(instructions) == list and type(instructions[0]) == str: elif type(instructions) == list and type(instructions[0]) == str:
@@ -52,11 +48,31 @@ def normalize_instructions(instructions) -> List[dict]:
# Dictionaries (let's assume it's a HowToStep) in a list # Dictionaries (let's assume it's a HowToStep) in a list
elif type(instructions) == list and type(instructions[0]) == dict: elif type(instructions) == list and type(instructions[0]) == dict:
return [ try:
{"text": normalize_instruction(step["text"])} # If HowToStep is under HowToSection
for step in instructions sectionSteps = []
if step["@type"] == "HowToStep" for step in instructions:
] if step["@type"] == "HowToSection":
for item in step["itemListElement"]:
sectionSteps.append(item)
if len(sectionSteps) > 0:
return [
{"text": normalize_instruction(step["text"])}
for step in sectionSteps
if step["@type"] == "HowToStep"
]
return [
{"text": normalize_instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"
]
except Exception as e:
# Not "@type", try "type"
return [
{"text": normalize_instruction(step["properties"]["text"])}
for step in instructions
if step["type"].find("HowToStep") > -1
]
else: else:
raise Exception(f"Unrecognised instruction format: {instructions}") raise Exception(f"Unrecognised instruction format: {instructions}")
@@ -95,12 +111,8 @@ def normalize_data(recipe_data: dict) -> dict:
recipe_data["prepTime"] = normalize_time(recipe_data.get("prepTime")) recipe_data["prepTime"] = normalize_time(recipe_data.get("prepTime"))
recipe_data["performTime"] = normalize_time(recipe_data.get("performTime")) recipe_data["performTime"] = normalize_time(recipe_data.get("performTime"))
recipe_data["recipeYield"] = normalize_yield(recipe_data.get("recipeYield")) recipe_data["recipeYield"] = normalize_yield(recipe_data.get("recipeYield"))
recipe_data["recipeIngredient"] = normalize_ingredient( recipe_data["recipeIngredient"] = normalize_ingredient(recipe_data.get("recipeIngredient"))
recipe_data.get("recipeIngredient") recipe_data["recipeInstructions"] = normalize_instructions(recipe_data["recipeInstructions"])
)
recipe_data["recipeInstructions"] = normalize_instructions(
recipe_data["recipeInstructions"]
)
recipe_data["image"] = normalize_image_url(recipe_data["image"]) recipe_data["image"] = normalize_image_url(recipe_data["image"])
return recipe_data return recipe_data
@@ -123,13 +135,19 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
def extract_recipe_from_html(html: str, url: str) -> dict: def extract_recipe_from_html(html: str, url: str) -> dict:
scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True) try:
dump_last_json(scraped_recipes) scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
dump_last_json(scraped_recipes)
if not scraped_recipes: if not scraped_recipes:
scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url( scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url, python_objects=True)
url, python_objects=True except Exception as e:
) # trying without python_objects
scraped_recipes: List[dict] = scrape_schema_recipe.loads(html)
dump_last_json(scraped_recipes)
if not scraped_recipes:
scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url)
if scraped_recipes: if scraped_recipes:
new_recipe: dict = scraped_recipes[0] new_recipe: dict = scraped_recipes[0]