From d8f226092ea6a5c6b8f3d600f3f60a64588ddec4 Mon Sep 17 00:00:00 2001
From: wengtad <wengtad93@gmail.com>
Date: Sun, 14 Feb 2021 05:04:10 +0800
Subject: [PATCH] fix scrape recipe error (#166)

- fixes #138
- fixes #145
---
 mealie/services/scrape_services.py | 62 +++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/mealie/services/scrape_services.py b/mealie/services/scrape_services.py
index f5d644c4d..f31604cfe 100644
--- a/mealie/services/scrape_services.py
+++ b/mealie/services/scrape_services.py
@@ -40,11 +40,7 @@ def normalize_instructions(instructions) -> List[dict]:
 
     # One long string split by (possibly multiple) new lines
     if type(instructions) == str:
-        return [
-            {"text": normalize_instruction(line)}
-            for line in instructions.splitlines()
-            if line
-        ]
+        return [{"text": normalize_instruction(line)} for line in instructions.splitlines() if line]
 
     # Plain strings in a list
     elif type(instructions) == list and type(instructions[0]) == str:
@@ -52,11 +48,31 @@ def normalize_instructions(instructions) -> List[dict]:
 
     # Dictionaries (let's assume it's a HowToStep) in a list
     elif type(instructions) == list and type(instructions[0]) == dict:
-        return [
-            {"text": normalize_instruction(step["text"])}
-            for step in instructions
-            if step["@type"] == "HowToStep"
-        ]
+        try:
+            # Collect steps nested under HowToSection entries (in "itemListElement")
+            sectionSteps = []
+            for step in instructions:
+                if step["@type"] == "HowToSection":
+                    for item in step["itemListElement"]:
+                        sectionSteps.append(item)
+
+            if len(sectionSteps) > 0:
+                return [
+                    {"text": normalize_instruction(step["text"])}
+                    for step in sectionSteps
+                    if step["@type"] == "HowToStep"
+                ]
+
+            return [
+                {"text": normalize_instruction(step["text"])} for step in instructions if step["@type"] == "HowToStep"
+            ]
+        except Exception as e:
+            # "@type" lookup failed; fall back to items keyed by "type" with text under "properties"
+            return [
+                {"text": normalize_instruction(step["properties"]["text"])}
+                for step in instructions
+                if step["type"].find("HowToStep") > -1
+            ]
 
     else:
         raise Exception(f"Unrecognised instruction format: {instructions}")
@@ -95,12 +111,8 @@ def normalize_data(recipe_data: dict) -> dict:
     recipe_data["prepTime"] = normalize_time(recipe_data.get("prepTime"))
     recipe_data["performTime"] = normalize_time(recipe_data.get("performTime"))
     recipe_data["recipeYield"] = normalize_yield(recipe_data.get("recipeYield"))
-    recipe_data["recipeIngredient"] = normalize_ingredient(
-        recipe_data.get("recipeIngredient")
-    )
-    recipe_data["recipeInstructions"] = normalize_instructions(
-        recipe_data["recipeInstructions"]
-    )
+    recipe_data["recipeIngredient"] = normalize_ingredient(recipe_data.get("recipeIngredient"))
+    recipe_data["recipeInstructions"] = normalize_instructions(recipe_data["recipeInstructions"])
     recipe_data["image"] = normalize_image_url(recipe_data["image"])
     return recipe_data
 
@@ -123,13 +135,19 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
 
 
 def extract_recipe_from_html(html: str, url: str) -> dict:
-    scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
-    dump_last_json(scraped_recipes)
+    try:
+        scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
+        dump_last_json(scraped_recipes)
 
-    if not scraped_recipes:
-        scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(
-            url, python_objects=True
-        )
+        if not scraped_recipes:
+            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url, python_objects=True)
+    except Exception as e:
+        # The attempt with python_objects=True raised; retry without python_objects
+        scraped_recipes: List[dict] = scrape_schema_recipe.loads(html)
+        dump_last_json(scraped_recipes)
+
+        if not scraped_recipes:
+            scraped_recipes: List[dict] = scrape_schema_recipe.scrape_url(url)
 
     if scraped_recipes:
         new_recipe: dict = scraped_recipes[0]