Bump beautifulsoup4 from 4.10.0 to 4.11.1 (#1717)

* Bump beautifulsoup4 from 4.10.0 to 4.11.1

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.10.0 to 4.11.1.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.11.1

* Update soupsieve==2.3.2.post1

* Update requirements.txt

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in:
dependabot[bot] 2022-05-16 20:46:21 -07:00 committed by GitHub
commit 467ae352f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 4846 additions and 2609 deletions

View file

@ -3,10 +3,14 @@ __license__ = "MIT"
from collections import defaultdict
import itertools
import re
import warnings
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
RubyParenthesisString,
RubyTextString,
Stylesheet,
Script,
TemplateString,
@ -28,6 +32,12 @@ XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class XMLParsedAsHTMLWarning(UserWarning):
"""The warning issued when an HTML parser is used to parse
XML that is not XHTML.
"""
MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
class TreeBuilderRegistry(object):
"""A way of looking up TreeBuilder subclasses by their name or by desired
@ -319,7 +329,7 @@ class TreeBuilder(object):
values = value
attrs[attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events.
@ -390,17 +400,25 @@ class HTMLTreeBuilder(TreeBuilder):
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
# The HTML standard defines an unusual content model for these tags.
# We represent this by using a string class other than NavigableString
# inside these tags.
# These HTML tags need special treatment so they can be
# represented by a string class other than NavigableString.
#
# I made this list by going through the HTML spec
# For some of these tags, it's because the HTML standard defines
# an unusual content model for them. I made this list by going
# through the HTML spec
# (https://html.spec.whatwg.org/#metadata-content) and looking for
# "metadata content" elements that can contain strings.
#
# The Ruby tags (<rt> and <rp>) are here despite being normal
# "phrasing content" tags, because the content they contain is
# qualitatively different from other text in the document, and it
# can be useful to be able to distinguish it.
#
# TODO: Arguably <noscript> could go here but it seems
# qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS = {
'rt' : RubyTextString,
'rp' : RubyParenthesisString,
'style': Stylesheet,
'script': Script,
'template': TemplateString,
@ -431,7 +449,7 @@ class HTMLTreeBuilder(TreeBuilder):
}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
"""Replace the declared encoding in a <meta> tag with a placeholder,
to be substituted when the tag is output to a string.
@ -475,6 +493,99 @@ class HTMLTreeBuilder(TreeBuilder):
return (meta_encoding is not None)
class DetectsXMLParsedAsHTML(object):
"""A mixin class for any class (a TreeBuilder, or some class used by a
TreeBuilder) that's in a position to detect whether an XML
document is being incorrectly parsed as HTML, and issue an
appropriate warning.
This requires being able to observe an incoming processing
instruction that might be an XML declaration, and also able to
observe tags as they're opened. If you can't do that for a given
TreeBuilder, there's a less reliable implementation based on
examining the raw markup.
"""
# Regular expression for seeing if markup has an <html> tag.
LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
XML_PREFIX = '<?xml'
XML_PREFIX_B = b'<?xml'
@classmethod
def warn_if_markup_looks_like_xml(cls, markup):
"""Perform a check on some markup to see if it looks like XML
that's not XHTML. If so, issue a warning.
This is much less reliable than doing the check while parsing,
but some of the tree builders can't do that.
:return: True if the markup looks like non-XHTML XML, False
otherwise.
"""
if isinstance(markup, bytes):
prefix = cls.XML_PREFIX_B
looks_like_html = cls.LOOKS_LIKE_HTML_B
else:
prefix = cls.XML_PREFIX
looks_like_html = cls.LOOKS_LIKE_HTML
if (markup is not None
and markup.startswith(prefix)
and not looks_like_html.search(markup[:500])
):
cls._warn()
return True
return False
@classmethod
def _warn(cls):
"""Issue a warning about XML being parsed as HTML."""
warnings.warn(
XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
)
def _initialize_xml_detector(self):
"""Call this method before parsing a document."""
self._first_processing_instruction = None
self._root_tag = None
def _document_might_be_xml(self, processing_instruction):
"""Call this method when encountering an XML declaration, or a
"processing instruction" that might be an XML declaration.
"""
if (self._first_processing_instruction is not None
or self._root_tag is not None):
# The document has already started. Don't bother checking
# anymore.
return
self._first_processing_instruction = processing_instruction
# We won't know until we encounter the first tag whether or
# not this is actually a problem.
def _root_tag_encountered(self, name):
"""Call this when you encounter the document's root tag.
This is where we actually check whether an XML document is
being incorrectly parsed as HTML, and issue the warning.
"""
if self._root_tag is not None:
# This method was incorrectly called multiple times. Do
# nothing.
return
self._root_tag = name
if (name != 'html' and self._first_processing_instruction is not None
and self._first_processing_instruction.lower().startswith('xml ')):
# We encountered an XML declaration and then a tag other
# than 'html'. This is a reliable indicator that a
# non-XHTML document is being parsed as XML.
self._warn()
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
this_module = sys.modules[__name__]