From 9818d567b9575a4968ba9cff833160c96a18fb2b Mon Sep 17 00:00:00 2001 From: Richard Mitic Date: Sun, 10 Jan 2021 20:15:43 +0100 Subject: [PATCH 1/3] Use opengraph metadata to make basic recipe cards when full recipe metadata is not available --- dev/scripts/scrape_recipe.py | 11 +- mealie/services/scrape_services.py | 103 +- ...s-rapps-with-rice-and-sunflower-seeds.html | 1318 +++++++++++++++++ .../html-raw/healthy_pasta_bake_60759.html | 500 +++++++ mealie/test/test_scraper.py | 41 +- 5 files changed, 1944 insertions(+), 29 deletions(-) create mode 100644 mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html create mode 100644 mealie/test/data/html-raw/healthy_pasta_bake_60759.html diff --git a/dev/scripts/scrape_recipe.py b/dev/scripts/scrape_recipe.py index de18e64b2..ce5f119ef 100644 --- a/dev/scripts/scrape_recipe.py +++ b/dev/scripts/scrape_recipe.py @@ -3,8 +3,11 @@ Helper script to download raw recipe data from a URL and dump it to disk. The resulting files can be used as test input data. """ -import sys, json +import sys, json, pprint +import requests +import extruct from scrape_schema_recipe import scrape_url +from w3lib.html import get_base_url for url in sys.argv[1:]: try: @@ -16,3 +19,9 @@ for url in sys.argv[1:]: print(f"Saved {filename}") except Exception as e: print(f"Error for {url}: {e}") + print("Trying extruct instead") + pp = pprint.PrettyPrinter(indent=2) + r = requests.get(url) + base_url = get_base_url(r.text, r.url) + data = extruct.extract(r.text, base_url=base_url) + pp.pprint(data) diff --git a/mealie/services/scrape_services.py b/mealie/services/scrape_services.py index a42e16792..5a6b45c72 100644 --- a/mealie/services/scrape_services.py +++ b/mealie/services/scrape_services.py @@ -1,8 +1,13 @@ +from typing import List, Tuple + import json from pathlib import Path from typing import List -from scrape_schema_recipe import scrape_url +import extruct +import requests +from w3lib.html import get_base_url +import scrape_schema_recipe from slugify import slugify from utils.logger import logger @@ -59,21 +64,10 @@ def normalize_data(recipe_data: dict) -> dict: recipe_data["recipeInstructions"] = normalize_instructions( recipe_data["recipeInstructions"] ) + recipe_data["image"] = normalize_image_url(recipe_data["image"]) return recipe_data -def create_from_url(url: str) -> dict: - recipe_data = process_recipe_url(url) - - with open(TEMP_FILE, "w") as f: - f.write(json.dumps(recipe_data, indent=4, default=str)) - - recipe_data = normalize_data(recipe_data) - recipe = Recipe(**recipe_data) - - return recipe.save_to_db() - - def process_recipe_data(new_recipe: dict, url=None) -> dict: slug = slugify(new_recipe["name"]) mealie_tags = { @@ -91,21 +85,76 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict: return new_recipe -def process_recipe_url(url: str) -> dict: - new_recipe: dict = scrape_url(url, python_objects=True)[0] - logger.info(f"Recipe Scraped From Web: {new_recipe}") +def extract_recipe_from_html(html:str, url: str) -> dict: + scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True) + if scraped_recipes: + new_recipe: dict = scraped_recipes[0] + logger.info(f"Recipe Scraped From Web: {new_recipe}") - if not new_recipe: - return "fail" # TODO: Return Better Error Here + if not new_recipe: + return "fail" # TODO: Return Better Error Here - new_recipe = process_recipe_data(new_recipe, url) - - try: - img_path = scrape_image( - normalize_image_url(new_recipe.get("image")), new_recipe.get("slug") - ) - new_recipe["image"] = img_path.name - except: - new_recipe["image"] = None + new_recipe = process_recipe_data(new_recipe, url=url) + new_recipe = normalize_data(new_recipe) + else: + new_recipe = basic_recipe_from_opengraph(html, url) + logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}") return new_recipe + + +def download_image_for_recipe(recipe: dict) -> dict: + try: + img_path = scrape_image(recipe.get("image"), recipe.get("slug")) + recipe["image"] = img_path.name + except: + recipe["image"] = None + + return recipe + + +def og_field(properties: dict, field_name: str) -> str: + return next((val for name, val in properties if name == field_name), None) + +def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]: + return list({val for name, val in properties if name == field_name}) + +def basic_recipe_from_opengraph(html: str, url: str) -> dict: + base_url = get_base_url(html, url) + data = extruct.extract(html, base_url=base_url) + properties = data["opengraph"][0]['properties'] + return { + "name": og_field(properties, "og:title"), + "description": og_field(properties, "og:description"), + "image": og_field(properties, "og:image"), + "recipeYield": "", + # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails. + "recipeIngredient": ["Could not detect ingredients"], + # FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity. + "recipeInstructions": ["Could not detect instructions"], + "slug": slugify(og_field(properties, "og:title")), + "orgURL": og_field(properties, "og:url"), + "categories": [], + "tags": og_fields(properties, "og:article:tag"), + "dateAdded": None, + "notes": [], + "extras": [], + } + + +def process_recipe_url(url: str) -> dict: + r = requests.get(url) + new_recipe = extract_recipe_from_html(r.text, url) + new_recipe = download_image_for_recipe(new_recipe) + return new_recipe + + +def create_from_url(url: str) -> dict: + recipe_data = process_recipe_url(url) + + with open(TEMP_FILE, "w") as f: + f.write(json.dumps(recipe_data, indent=4, default=str)) + + recipe = Recipe(**recipe_data) + + return recipe.save_to_db() diff --git a/mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html b/mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html new file mode 100644 index 000000000..b3fb0b94b --- /dev/null +++ b/mealie/test/data/html-raw/carottes-rapps-with-rice-and-sunflower-seeds.html @@ -0,0 +1,1318 @@ + + + + + + + + +Carottes Rappés with Rice and Sunflower Seeds — FEED THE SWIMMERS + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ + + + + + + + + + + + +
+ + + + +
+ + + + + + + +
+ +
+ + + + + + +
+ + +
+ + + + + + +

Jill Fergus

I began cooking as a child and feeding family and friends has always been my passion. My kitchen is a busy one. I love to experiment and embrace the kitchen successes along with the accidents. I love to cook and collaborate with friends. I am seasonally driven (I love the farmer's market!), avoid processed foods and focus on whole and organic (mostly plant-based, but not exclusively) choices. In my home, my family has a variety of eating preferences from plant-based, gluten free, refined sugar free to full on omnivore. My goal is to create dishes to please all, either as is or with minor adjustments to the recipe. Where did "Feed the Swimmers" come from? When my kids began swimming competitively and growing into young adults, I realized, even more, how important nutrition is to performance, growth and overall health and emotional well being. Everyone (including the coach during travel meets) would ask "what are you feeding the swimmers?" This has become my mantra whenever I'm in my kitchen cooking for family and the friends I love.

 

 

+
+
+
+ +
+ +
+ + + +
+ +
+ + + +
+ + + +
+ +
+ +
+
Carottes Rappés with Rice and Sunflower Seeds
+
+ + +
+ +
+ +
+ +

Carottes Rappés with Rice and Sunflower Seeds

+ +
+ + + +
+
+ +
+ +
+
+
+
+
+
+

 Carottes Râpées with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes Râpées is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots are a fine julienne which allows their juice to meld with the gentle vinaigrette brilliantly. This simple classic gets an update with U.S.-Grown long grain white rice and crunchy sunflower seeds and for me is loaded with the fondest of food memories! When I was a student, I did a semester abroad in Rennes, France, and lived with a family who had three very young children. My eating preferences then were very similar to now- mostly vegetarian with the occasional serving of fish. My French “mom” (who also taught me the art of fresh mayonnaise along with many other wonders) wasn’t quite used to this, but was also looking to add more veggies to their three toddlers diet. I’m not sure if she added rice to make this more child friendly or to stretch their food budget, but it was magical. For protein, I would add hard boiled eggs (from the chickens in the back yard) at lunch and was happy eating this several times per week! Some evenings, my “dad” would cook fresh sardines in the fireplace for me and those dinners were sublime. I didn’t leave with her recipe and we’ve lost touch, but I’ve recreated this based on memory, added the seeds for crunch and used U.S.-Grown rice to support American farmers. I’m delighted to share this with you! With summer looming, this also makes a great picnic salad served with crusty bread😋 for sopping up the dressing and is a great addition to sandwiches. If your prefer vegan, simply omit the egg. #feedfeed #thinkrice #sponsored

RECIPE

Carottes Râpées with Rice and Sunflower Seeds

Notes-

• The finer you’re able to julienne the carrots the better. I recommend using the finest setting of a food processor. If using a spiralizer, or similar, using the finest option works well, too. You’ll simply need to cut your carrots noodles into shorter pieces. I have an old Moulinex grater (not to be confused with their food Mill) I use for these which I love. A box grater is also just fine, as it releases the juice. You’ll simply have a slightly different texture.

• I prefer seasoned rice vinegar for this vinaigrette for its added sweetness.

• Tossing this by hand allows the vinaigrette to blend beautifully with the natural juice of the carrots and coats the grains of rice allowing for even distribution of the flavors.

• The rice should be prepared “al dente” as it will absorb the the vinaigrette.

• If you prefer a vegan option, simply omit the egg

Ingredients-

1/2 lb carrots, finely julienned or grated

1 cup long grain white rice

Large handful flat leaf parsley

1/4 cup seasoned rice vinegar

1/4 cup extra virgin olive oil, plus 1 tsp, separated

1 tsp Dijon mustard

Flaky sea salt

Fresh ground pepper

3 eggs

1/2 cup raw sunflower seeds

Method-

1- Prepare rice per package directions cutting back the water by 1/2 cup. Fluff with a fork and set aside to allow to cool a bit. You don’t want to toss the rice while it’s hot, but slightly warm to the touch best.


2- While the rice is cooking, prepare the vinaigrette, grate the carrots and chop the parsley. Boil the eggs for 10 minutes and add to an ice bath to stop the cooking. You don’t want gray edged yolks. Mash eggs with a fork so you have a course finish. Set aside.


3- Preheat a heavy bottomed pan over medium heat. Add 1 tsp of olive oil and sunflower seeds. Toast until edges just begin to turn golden. Add sea salt to taste and remove from heat to a small bowl.


3- Using your hands, toss carrots with rice to distribute and add vinaigrette, starting with about half (1/4 cup).  Taste and add more to suit your preference being careful not to overdress. Gently incorporate parsley and season with sea salt and fresh ground pepper. Top with chopped egg and sunflower seeds. Serve and enjoy!!!

+ + + + + + + + + + + + +
+ + + + +
+ + + + + + + +
+ IMG_0681.JPG +
+ + + + + + +
+ + +
+ + + + + + +

If you happen to own a moulinex, use the finest disc. If you have a spiralizer, also choose the finest noodle option and loosely chop. Serving this as a carrot noodle dish, isn’t a bad idea! 

+ + + + + + + + + + + + +
+ + + + +
+ + + + + + + +
+ IMG_0682.JPG +
+ + + + + + +
+ + +
+ + + + + + +

The dressing couldn’t be simpler! I never tire of rice wine.  

+ + + + + + + + + + + + +
+ + + + +
+ + + + + + + +
+ IMG_0683.JPG +
+ + + + + + +
+ + +
+ + + + + + +

I would toast way more than you need. These are delicious, especially when warm, and disappear quickly.  

 

+ + + + + + + + + + + + +
+ + + + +
+ + + + + + + +
+ IMG_0684.JPG +
+ + + + + + +
+ + +
+ + + + + + +

Enjoy! 

+ + + + + +
+
+ +
+ +
+ +
+ + +
+ +
+
+
+
+ + +
+
+ + + +
+ +
+ + + +
+ + Spiced Cauliflower, Sweet Potato and Chick Pea Stew with Greens +

Spiced Cauliflower, Sweet Potato and Chick Pea Stew with Greens

+
+
+
+ + + +
+ + Crispy Sushi Rice with Smoked Salmon and Avocado +

Crispy Sushi Rice with Smoked Salmon and Avocado

+
+
+
+ + + + +
+ +
+ + + +
+ +
+ + + + + + +
+ + + + + + diff --git a/mealie/test/data/html-raw/healthy_pasta_bake_60759.html b/mealie/test/data/html-raw/healthy_pasta_bake_60759.html new file mode 100644 index 000000000..fc98bdfa5 --- /dev/null +++ b/mealie/test/data/html-raw/healthy_pasta_bake_60759.html @@ -0,0 +1,500 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Healthy pasta bake recipe - BBC Food + + + + + +
+
+ +

Healthy pasta bake

Loading
Healthy pasta bake

A fuss-free and healthy pasta bake that you can make in one dish. It's packed with the flavours and colours of Mediterranean vegetables and topped with pine nuts for crunch.

+Each serving provides 375 kcal, 12g protein, 27g carbohydrates (of which 11g sugars), 22g fat (of which 5g saturates), 8g fibre and 0.95g salt.

Ingredients

  • 2 tbsp olive oil
  • 1 red onion, roughly chopped
  • 1 yellow or red pepper, deseeded and cut into roughly 1.5cm/⅝in chunks
  • 1 courgette (approx. 225g/8oz), quartered lengthways and cut into roughly 1.5cm/⅝in chunks
  • 1 small aubergine (approx. 250g/9oz), cut into roughly 1.5cm/⅝in chunks
  • 2 large garlic cloves, crushed
  • 4 tbsp sun-dried tomato or red pepper pesto
  • 400g tin chopped tomatoes
  • 100g/3½oz wholewheat penne pasta
  • 75g/2¾oz young spinach leaves, tough stalks removed
  • 50g/1¾oz mature cheddar, grated
  • 2 tbsp pine nuts
  • salt and freshly ground black pepper

Method

  1. Preheat the oven to 200C/180 Fan/Gas 6.

  2. Heat the oil in a flameproof casserole. Add the onion, pepper, courgette and aubergine, season with a little salt and lots of pepper and fry over a medium heat for about 8 minutes, or until softened and lightly browned, stirring regularly. Add the garlic and cook for 1 minute, then stir in the pesto and cook for 1 minute more, stirring constantly.

  3. Tip the tomatoes into the casserole, then fill the tomato tin with water and pour into the pan. Add the pasta and bring to a simmer, stirring regularly. Cover with a lid, or if you prefer tip into a baking dish and cover with foil, and bake for 20 minutes.

  4. Take the casserole out of the oven and stir in the spinach, a handful at a time. Once all the spinach has been mixed in, sprinkle the cheese and pine nuts over the top. Return to the oven and cook, uncovered, for a further 10 minutes, until the pasta is tender.

Recipe Tips

If you don’t have wholewheat penne, use another shape, such as fusilli. You can also use traditional pasta, but the overall fibre content won’t be quite as high.

The red pesto adds lots of flavour; green basil pesto also works well.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mealie/test/test_scraper.py b/mealie/test/test_scraper.py index af5cbfae9..665ab8eb9 100644 --- a/mealie/test/test_scraper.py +++ b/mealie/test/test_scraper.py @@ -1,12 +1,22 @@ import json +import re from pathlib import Path import pytest -from services.scrape_services import normalize_data, normalize_instructions +from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html CWD = Path(__file__).parent RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw") +RAW_HTML_DIR = CWD.joinpath("data", "html-raw") +# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45 +url_validation_regex = re.compile( + r'^(?:http|ftp)s?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... + r'localhost|' #localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) @pytest.mark.parametrize("json_file,num_steps", [ ("best-homemade-salsa-recipe.json", 2), @@ -37,3 +47,32 @@ def test_normalize_data(json_file, num_steps): ]) def test_normalize_instructions(instructions): assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}] + + +def test_html_no_recipe_data(): + path = RAW_HTML_DIR.joinpath("carottes-rapps-with-rice-and-sunflower-seeds.html") + url = "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds" + recipe_data = extract_recipe_from_html(open(path).read(), url) + + assert len(recipe_data["name"]) > 10 + assert len(recipe_data["slug"]) > 10 + assert recipe_data["orgURL"] == url + assert len(recipe_data["description"]) > 100 + assert url_validation_regex.match(recipe_data["image"]) + assert recipe_data["recipeIngredient"] == [] + assert recipe_data["recipeInstructions"] == [] + + +def test_html_with_recipe_data(): + path = RAW_HTML_DIR.joinpath("healthy_pasta_bake_60759.html") + url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759" + recipe_data = extract_recipe_from_html(open(path).read(), url) + + assert len(recipe_data["name"]) > 10 + assert len(recipe_data["slug"]) > 10 + assert recipe_data["orgURL"] == url + assert len(recipe_data["description"]) > 100 + assert url_validation_regex.match(recipe_data["image"]) + assert len(recipe_data["recipeIngredient"]) == 13 + assert len(recipe_data["recipeInstructions"]) == 4 + From 9efd9399d9a0e62a2f889cae0a5b614b81150cd8 Mon Sep 17 00:00:00 2001 From: Hayden Date: Sun, 10 Jan 2021 11:04:52 -0900 Subject: [PATCH 2/3] fixed instrucitons on parse --- mealie/data/debug/last_recipe.json | 91 +++--------------------------- mealie/services/scrape_services.py | 14 ++--- mealie/test/test_scraper.py | 87 +++++++++++++++++----------- 3 files changed, 69 insertions(+), 123 deletions(-) diff --git a/mealie/data/debug/last_recipe.json b/mealie/data/debug/last_recipe.json index a37d9e28c..4c23198e9 100644 --- a/mealie/data/debug/last_recipe.json +++ b/mealie/data/debug/last_recipe.json @@ -1,91 +1,16 @@ { - "@context": "http://schema.org/", - "@type": "Recipe", - "name": "Jalape\u00f1o Popper Dip", - "author": { - "@type": "Person", - "name": "Michelle" - }, - "description": "Jalapeno Popper Dip is creamy, cheesy and has just the perfect amount of kick. Great appetizer for your next party or watching the big game!", - "datePublished": "2016-02-22 00:01:37+00:00", - "image": "jalapeno-popper-dip.jpg", - "recipeYield": [ - "10", - "10 to 12 servings" - ], - "prepTime": "0:15:00", - "cookTime": "0:30:00", - "totalTime": "0:45:00", + "name": "Carottes Rapp\u00e9s with Rice and Sunflower Seeds \u2014 FEED THE SWIMMERS", + "description": " Carottes R\u00e2p\u00e9es with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes R\u00e2p\u00e9es is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots ar", + "image": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.JPG?format=1500w", + "recipeYield": "", "recipeIngredient": [ - "16 ounces cream cheese (at room temperature)", - "1 cup mayonnaise", - "8 pieces of bacon (cooked and chopped)", - "6 jalape\u00f1os (seeded and minced (if you can't get fresh, substitute a 4-ounce can diced jalape\u00f1o peppers, drained))", - "2 cloves garlic (minced)", - "\u00bd teaspoon cumin", - "6 ounces cheddar cheese (shredded (about 1\u00bd cups))", - "1 cup panko breadcrumbs", - "1 cup grated Parmesan cheese", - "4 tablespoons unsalted butter, melted" + "Could not detect ingredients" ], "recipeInstructions": [ - { - "@type": "HowToStep", - "text": "Preheat oven to 375 degrees F.", - "name": "Preheat oven to 375 degrees F.", - "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-0" - }, - { - "@type": "HowToStep", - "text": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.", - "name": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.", - "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-1" - }, - { - "@type": "HowToStep", - "text": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.", - "name": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.", - "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-2" - }, - { - "@type": "HowToStep", - "text": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.", - "name": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.", - "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-3" - } + "Could not detect instructions" ], - "aggregateRating": { - "@type": "AggregateRating", - "ratingValue": "4.34", - "ratingCount": "15" - }, - "recipeCategory": [ - "Appetizer" - ], - "recipeCuisine": [ - "American" - ], - "keywords": "cheese dip, game day food, party food", - "nutrition": { - "@type": "NutritionInformation", - "calories": "560 kcal", - "carbohydrateContent": "7 g", - "proteinContent": "14 g", - "fatContent": "52 g", - "saturatedFatContent": "21 g", - "cholesterolContent": "109 mg", - "sodiumContent": "707 mg", - "sugarContent": "2 g", - "servingSize": "1 serving" - }, - "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#recipe", - "isPartOf": { - "@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#article" - }, - "mainEntityOfPage": "https://www.browneyedbaker.com/jalapeno-popper-dip/#webpage", - "url": "https://www.browneyedbaker.com/jalapeno-popper-dip/", - "slug": "jalapeno-popper-dip", - "orgURL": "http://www.browneyedbaker.com/2011/08/03/jalapeno-popper-dip/", + "slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers", + "orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds", "categories": [], "tags": [], "dateAdded": null, diff --git a/mealie/services/scrape_services.py b/mealie/services/scrape_services.py index 5a6b45c72..3a78e0924 100644 --- a/mealie/services/scrape_services.py +++ b/mealie/services/scrape_services.py @@ -1,15 +1,13 @@ -from typing import List, Tuple - import json from pathlib import Path -from typing import List +from typing import List, Tuple import extruct import requests -from w3lib.html import get_base_url import scrape_schema_recipe from slugify import slugify from utils.logger import logger +from w3lib.html import get_base_url from services.image_services import scrape_image from services.recipe_services import Recipe @@ -85,7 +83,7 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict: return new_recipe -def extract_recipe_from_html(html:str, url: str) -> dict: +def extract_recipe_from_html(html: str, url: str) -> dict: scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True) if scraped_recipes: new_recipe: dict = scraped_recipes[0] @@ -116,13 +114,15 @@ def download_image_for_recipe(recipe: dict) -> dict: def og_field(properties: dict, field_name: str) -> str: return next((val for name, val in properties if name == field_name), None) + def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]: return list({val for name, val in properties if name == field_name}) + def basic_recipe_from_opengraph(html: str, url: str) -> dict: base_url = get_base_url(html, url) data = extruct.extract(html, base_url=base_url) - properties = data["opengraph"][0]['properties'] + properties = data["opengraph"][0]["properties"] return { "name": og_field(properties, "og:title"), "description": og_field(properties, "og:description"), @@ -131,7 +131,7 @@ def basic_recipe_from_opengraph(html: str, url: str) -> dict: # FIXME: If recipeIngredient is an empty list, mongodb's data verification fails. "recipeIngredient": ["Could not detect ingredients"], # FIXME: recipeInstructions is allowed to be empty but message this is added for user sanity. - "recipeInstructions": ["Could not detect instructions"], + "recipeInstructions": [{"text": "Could not detect instructions"}], "slug": slugify(og_field(properties, "og:title")), "orgURL": og_field(properties, "og:url"), "categories": [], diff --git a/mealie/test/test_scraper.py b/mealie/test/test_scraper.py index 665ab8eb9..ab635dc4d 100644 --- a/mealie/test/test_scraper.py +++ b/mealie/test/test_scraper.py @@ -3,7 +3,11 @@ import re from pathlib import Path import pytest -from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html +from services.scrape_services import ( + extract_recipe_from_html, + normalize_data, + normalize_instructions, +) CWD = Path(__file__).parent RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw") @@ -11,42 +15,58 @@ RAW_HTML_DIR = CWD.joinpath("data", "html-raw") # https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45 url_validation_regex = re.compile( - r'^(?:http|ftp)s?://' # http:// or https:// - r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... - r'localhost|' #localhost... - r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip - r'(?::\d+)?' # optional port - r'(?:/?|[/?]\S+)$', re.IGNORECASE) + r"^(?:http|ftp)s?://" # http:// or https:// + r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain... + r"localhost|" # localhost... + r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip + r"(?::\d+)?" # optional port + r"(?:/?|[/?]\S+)$", + re.IGNORECASE, +) -@pytest.mark.parametrize("json_file,num_steps", [ - ("best-homemade-salsa-recipe.json", 2), - ("blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", 3), - ("bon_appetit.json", 8), - ("chunky-apple-cake.json", 4), - ("dairy-free-impossible-pumpkin-pie.json", 7), - ("how-to-make-instant-pot-spaghetti.json", 8), - ("instant-pot-chicken-and-potatoes.json", 4), - ("instant-pot-kerala-vegetable-stew.json", 13), - ("jalapeno-popper-dip.json", 4), - ("microwave_sweet_potatoes_04783.json", 4), - ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4), - ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3), -]) + +@pytest.mark.parametrize( + "json_file,num_steps", + [ + ("best-homemade-salsa-recipe.json", 2), + ( + "blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", + 3, + ), + ("bon_appetit.json", 8), + ("chunky-apple-cake.json", 4), + ("dairy-free-impossible-pumpkin-pie.json", 7), + ("how-to-make-instant-pot-spaghetti.json", 8), + ("instant-pot-chicken-and-potatoes.json", 4), + ("instant-pot-kerala-vegetable-stew.json", 13), + ("jalapeno-popper-dip.json", 4), + ("microwave_sweet_potatoes_04783.json", 4), + ("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4), + ("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3), + ], +) def test_normalize_data(json_file, num_steps): recipe_data = normalize_data(json.load(open(RAW_RECIPE_DIR.joinpath(json_file)))) assert len(recipe_data["recipeInstructions"]) == num_steps -@pytest.mark.parametrize("instructions", [ - "A\n\nB\n\nC\n\n", - "A\nB\nC\n", - "A\r\n\r\nB\r\n\r\nC\r\n\r\n", - "A\r\nB\r\nC\r\n", - ["A","B","C"], - [{"@type": "HowToStep", "text": x} for x in ["A","B","C"]] -]) +@pytest.mark.parametrize( + "instructions", + [ + "A\n\nB\n\nC\n\n", + "A\nB\nC\n", + "A\r\n\r\nB\r\n\r\nC\r\n\r\n", + "A\r\nB\r\nC\r\n", + ["A", "B", "C"], + [{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]], + ], +) def test_normalize_instructions(instructions): - assert normalize_instructions(instructions) == [{"text": "A"}, {"text": "B"}, {"text": "C"}] + assert normalize_instructions(instructions) == [ + {"text": "A"}, + {"text": "B"}, + {"text": "C"}, + ] def test_html_no_recipe_data(): @@ -59,8 +79,10 @@ def test_html_no_recipe_data(): assert recipe_data["orgURL"] == url assert len(recipe_data["description"]) > 100 assert url_validation_regex.match(recipe_data["image"]) - assert recipe_data["recipeIngredient"] == [] - assert recipe_data["recipeInstructions"] == [] + assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"] + assert recipe_data["recipeInstructions"] == [ + {"text": "Could not detect instructions"} + ] def test_html_with_recipe_data(): @@ -75,4 +97,3 @@ def test_html_with_recipe_data(): assert url_validation_regex.match(recipe_data["image"]) assert len(recipe_data["recipeIngredient"]) == 13 assert len(recipe_data["recipeInstructions"]) == 4 - From 80468d0c47e34e5733f8bfc58caffa29e4fde7f1 Mon Sep 17 00:00:00 2001 From: Hayden Date: Sun, 10 Jan 2021 11:08:19 -0900 Subject: [PATCH 3/3] add last_recipe --- .gitignore | 1 + mealie/data/debug/last_recipe.json | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4ec39805f..72fca36ef 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ ENV/ # Node Modules node_modules/ +mealie/data/debug/last_recipe.json diff --git a/mealie/data/debug/last_recipe.json b/mealie/data/debug/last_recipe.json index 4c23198e9..e154a0864 100644 --- a/mealie/data/debug/last_recipe.json +++ b/mealie/data/debug/last_recipe.json @@ -7,7 +7,9 @@ "Could not detect ingredients" ], "recipeInstructions": [ - "Could not detect instructions" + { + "text": "Could not detect instructions" + } ], "slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers", "orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds",