fixed instructions on parse

This commit is contained in:
Hayden 2021-01-10 11:04:52 -09:00
commit 9efd9399d9
3 changed files with 69 additions and 123 deletions

View file

@ -1,91 +1,16 @@
{
"@context": "http://schema.org/",
"@type": "Recipe",
"name": "Jalape\u00f1o Popper Dip",
"author": {
"@type": "Person",
"name": "Michelle"
},
"description": "Jalapeno Popper Dip is creamy, cheesy and has just the perfect amount of kick. Great appetizer for your next party or watching the big game!",
"datePublished": "2016-02-22 00:01:37+00:00",
"image": "jalapeno-popper-dip.jpg",
"recipeYield": [
"10",
"10 to 12 servings"
],
"prepTime": "0:15:00",
"cookTime": "0:30:00",
"totalTime": "0:45:00",
"name": "Carottes Rapp\u00e9s with Rice and Sunflower Seeds \u2014 FEED THE SWIMMERS",
"description": " Carottes R\u00e2p\u00e9es with Rice and Sunflower Seeds thanks to @think_rice and @thefeedfeed. Carottes R\u00e2p\u00e9es is a classic French Salad found ready to go (think picnic) at every charcuterie and on most cafe menus. This is one of those insanely simple salads that explode with flavor! The carrots ar",
"image": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers.JPG?format=1500w",
"recipeYield": "",
"recipeIngredient": [
"16 ounces cream cheese (at room temperature)",
"1 cup mayonnaise",
"8 pieces of bacon (cooked and chopped)",
"6 jalape\u00f1os (seeded and minced (if you can't get fresh, substitute a 4-ounce can diced jalape\u00f1o peppers, drained))",
"2 cloves garlic (minced)",
"\u00bd teaspoon cumin",
"6 ounces cheddar cheese (shredded (about 1\u00bd cups))",
"1 cup panko breadcrumbs",
"1 cup grated Parmesan cheese",
"4 tablespoons unsalted butter, melted"
"Could not detect ingredients"
],
"recipeInstructions": [
{
"@type": "HowToStep",
"text": "Preheat oven to 375 degrees F.",
"name": "Preheat oven to 375 degrees F.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-0"
},
{
"@type": "HowToStep",
"text": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
"name": "Combine the cream cheese, mayonnaise, bacon, jalapenos, garlic, cumin and cheddar cheese in a mixing bowl. Transfer the mixture into 2-quart baking dish.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-1"
},
{
"@type": "HowToStep",
"text": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
"name": "Combine the panko breadcrumbs, Parmesan cheese and melted butter in a small bowl, tossing with a fork until the mixture is evenly moistened. Sprinkle evenly over the cream cheese mixture.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-2"
},
{
"@type": "HowToStep",
"text": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
"name": "Bake in the preheated oven for 25 to 30 minutes, until the top is golden brown and the dip is bubbling. Let rest for 5 minutes before serving. Serve with your favorite tortilla chips, crackers, vegetables, etc.",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/#wprm-recipe-44993-step-0-3"
}
"Could not detect instructions"
],
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.34",
"ratingCount": "15"
},
"recipeCategory": [
"Appetizer"
],
"recipeCuisine": [
"American"
],
"keywords": "cheese dip, game day food, party food",
"nutrition": {
"@type": "NutritionInformation",
"calories": "560 kcal",
"carbohydrateContent": "7 g",
"proteinContent": "14 g",
"fatContent": "52 g",
"saturatedFatContent": "21 g",
"cholesterolContent": "109 mg",
"sodiumContent": "707 mg",
"sugarContent": "2 g",
"servingSize": "1 serving"
},
"@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#recipe",
"isPartOf": {
"@id": "https://www.browneyedbaker.com/jalapeno-popper-dip/#article"
},
"mainEntityOfPage": "https://www.browneyedbaker.com/jalapeno-popper-dip/#webpage",
"url": "https://www.browneyedbaker.com/jalapeno-popper-dip/",
"slug": "jalapeno-popper-dip",
"orgURL": "http://www.browneyedbaker.com/2011/08/03/jalapeno-popper-dip/",
"slug": "carottes-rappes-with-rice-and-sunflower-seeds-feed-the-swimmers",
"orgURL": "https://www.feedtheswimmers.com/blog/2019/6/5/carottes-rapps-with-rice-and-sunflower-seeds",
"categories": [],
"tags": [],
"dateAdded": null,

View file

@ -1,15 +1,13 @@
from typing import List, Tuple
import json
from pathlib import Path
from typing import List
from typing import List, Tuple
import extruct
import requests
from w3lib.html import get_base_url
import scrape_schema_recipe
from slugify import slugify
from utils.logger import logger
from w3lib.html import get_base_url
from services.image_services import scrape_image
from services.recipe_services import Recipe
@ -85,7 +83,7 @@ def process_recipe_data(new_recipe: dict, url=None) -> dict:
return new_recipe
def extract_recipe_from_html(html:str, url: str) -> dict:
def extract_recipe_from_html(html: str, url: str) -> dict:
scraped_recipes: List[dict] = scrape_schema_recipe.loads(html, python_objects=True)
if scraped_recipes:
new_recipe: dict = scraped_recipes[0]
@ -116,13 +114,15 @@ def download_image_for_recipe(recipe: dict) -> dict:
def og_field(properties: List[Tuple[str, str]], field_name: str) -> str:
    """Return the first OpenGraph value whose property name equals ``field_name``.

    Args:
        properties: ``(name, value)`` pairs, scanned in order.
        field_name: Property name to look up, e.g. ``"og:title"``.

    Returns:
        The value of the first matching pair, or ``None`` when no pair
        matches (the ``str`` annotation does not cover that case).
    """
    return next((val for name, val in properties if name == field_name), None)
def og_fields(properties: List[Tuple[str, str]], field_name: str) -> List[str]:
    """Return every distinct OpenGraph value stored under ``field_name``.

    Args:
        properties: ``(name, value)`` pairs, scanned in order.
        field_name: Property name to collect, e.g. ``"og:image"``.

    Returns:
        The matching values with duplicates removed, in the order they
        first appear in ``properties``; an empty list when nothing matches.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order; a plain
    # set would also de-duplicate but hand the values back in arbitrary order.
    return list(
        dict.fromkeys(val for name, val in properties if name == field_name)
    )
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
base_url = get_base_url(html, url)
data = extruct.extract(html, base_url=base_url)
properties = data["opengraph"][0]['properties']
properties = data["opengraph"][0]["properties"]
return {
"name": og_field(properties, "og:title"),
"description": og_field(properties, "og:description"),
@ -131,7 +131,7 @@ def basic_recipe_from_opengraph(html: str, url: str) -> dict:
# FIXME: If recipeIngredient is an empty list, mongodb's data verification fails.
"recipeIngredient": ["Could not detect ingredients"],
# FIXME: recipeInstructions is allowed to be empty, but this message is added for user sanity.
"recipeInstructions": ["Could not detect instructions"],
"recipeInstructions": [{"text": "Could not detect instructions"}],
"slug": slugify(og_field(properties, "og:title")),
"orgURL": og_field(properties, "og:url"),
"categories": [],

View file

@ -3,7 +3,11 @@ import re
from pathlib import Path
import pytest
from services.scrape_services import normalize_data, normalize_instructions, extract_recipe_from_html
from services.scrape_services import (
extract_recipe_from_html,
normalize_data,
normalize_instructions,
)
CWD = Path(__file__).parent
RAW_RECIPE_DIR = CWD.joinpath("data", "recipes-raw")
@ -11,42 +15,58 @@ RAW_HTML_DIR = CWD.joinpath("data", "html-raw")
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
url_validation_regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
r'localhost|' #localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
r"^(?:http|ftp)s?://" # http:// or https://
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
r"localhost|" # localhost...
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$",
re.IGNORECASE,
)
@pytest.mark.parametrize("json_file,num_steps", [
("best-homemade-salsa-recipe.json", 2),
("blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json", 3),
("bon_appetit.json", 8),
("chunky-apple-cake.json", 4),
("dairy-free-impossible-pumpkin-pie.json", 7),
("how-to-make-instant-pot-spaghetti.json", 8),
("instant-pot-chicken-and-potatoes.json", 4),
("instant-pot-kerala-vegetable-stew.json", 13),
("jalapeno-popper-dip.json", 4),
("microwave_sweet_potatoes_04783.json", 4),
("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
])
@pytest.mark.parametrize(
"json_file,num_steps",
[
("best-homemade-salsa-recipe.json", 2),
(
"blue-cheese-stuffed-turkey-meatballs-with-raspberry-balsamic-glaze-2.json",
3,
),
("bon_appetit.json", 8),
("chunky-apple-cake.json", 4),
("dairy-free-impossible-pumpkin-pie.json", 7),
("how-to-make-instant-pot-spaghetti.json", 8),
("instant-pot-chicken-and-potatoes.json", 4),
("instant-pot-kerala-vegetable-stew.json", 13),
("jalapeno-popper-dip.json", 4),
("microwave_sweet_potatoes_04783.json", 4),
("moroccan-skirt-steak-with-roasted-pepper-couscous.json", 4),
("Pizza-Knoblauch-Champignon-Paprika-vegan.html.json", 3),
],
)
def test_normalize_data(json_file, num_steps):
    """Check that normalize_data yields the expected number of instruction steps.

    ``json_file`` is a file name inside RAW_RECIPE_DIR and ``num_steps`` is the
    expected length of the normalized ``recipeInstructions`` list.
    """
    # Read the fixture inside a context manager so the handle is always closed.
    with open(RAW_RECIPE_DIR.joinpath(json_file), encoding="utf-8") as raw_file:
        raw_recipe = json.load(raw_file)
    recipe_data = normalize_data(raw_recipe)
    assert len(recipe_data["recipeInstructions"]) == num_steps
@pytest.mark.parametrize("instructions", [
"A\n\nB\n\nC\n\n",
"A\nB\nC\n",
"A\r\n\r\nB\r\n\r\nC\r\n\r\n",
"A\r\nB\r\nC\r\n",
["A","B","C"],
[{"@type": "HowToStep", "text": x} for x in ["A","B","C"]]
])
@pytest.mark.parametrize(
"instructions",
[
"A\n\nB\n\nC\n\n",
"A\nB\nC\n",
"A\r\n\r\nB\r\n\r\nC\r\n\r\n",
"A\r\nB\r\nC\r\n",
["A", "B", "C"],
[{"@type": "HowToStep", "text": x} for x in ["A", "B", "C"]],
],
)
def test_normalize_instructions(instructions):
    """Check that normalize_instructions returns steps A, B, C as ``{"text": ...}`` dicts."""
    # A single assertion is enough: the one-line and multi-line forms that
    # appeared here compared against the same expected list.
    assert normalize_instructions(instructions) == [
        {"text": "A"},
        {"text": "B"},
        {"text": "C"},
    ]
def test_html_no_recipe_data():
@ -59,8 +79,10 @@ def test_html_no_recipe_data():
assert recipe_data["orgURL"] == url
assert len(recipe_data["description"]) > 100
assert url_validation_regex.match(recipe_data["image"])
assert recipe_data["recipeIngredient"] == []
assert recipe_data["recipeInstructions"] == []
assert recipe_data["recipeIngredient"] == ["Could not detect ingredients"]
assert recipe_data["recipeInstructions"] == [
{"text": "Could not detect instructions"}
]
def test_html_with_recipe_data():
@ -75,4 +97,3 @@ def test_html_with_recipe_data():
assert url_validation_regex.match(recipe_data["image"])
assert len(recipe_data["recipeIngredient"]) == 13
assert len(recipe_data["recipeInstructions"]) == 4