Bump beautifulsoup4 from 4.11.1 to 4.11.2 (#1987)

* Bump beautifulsoup4 from 4.11.1 to 4.11.2

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.1 to 4.11.2.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.11.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in:
dependabot[bot] 2023-03-02 20:56:24 -08:00 committed by GitHub
commit 8e42757b2d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 449 additions and 537 deletions

View file

@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
Beautiful Soup works with Python 3.5 and up. It works better if lxml
Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.11.1"
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
__version__ = "4.11.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@ -211,7 +211,7 @@ class BeautifulSoup(Tag):
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name),
DeprecationWarning
DeprecationWarning, stacklevel=3
)
return kwargs.pop(old_name)
return None
@ -405,7 +405,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning,
stacklevel=3
)
return True
return False
@ -436,7 +437,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into'
' Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning, stacklevel=3
)
return True
return False
@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)

View file

@ -122,7 +122,7 @@ class TreeBuilder(object):
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
# Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

View file

@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
warnings.warn(
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
stacklevel=3
)
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4
)
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
@ -249,9 +255,9 @@ class AttrList(object):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = self.element.cdata_list_attributes or {}
if (name in list_attr.get('*')
if (name in list_attr.get('*', [])
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
and name in list_attr.get(self.element.name, []))):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):

View file

@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import (
CData,
Comment,
@ -90,20 +69,7 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self.already_closed_empty_element = []
self._initialize_xml_detector()
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() by raising an exception,
which we don't want to do.
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal.
"""
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
# TODO: This was originally a workaround for a bug in
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
# been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@ -333,10 +300,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
parser.feed(markup)
parser.close()
parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

View file

@ -496,13 +496,16 @@ class PageElement(object):
def extend(self, tags):
"""Appends the given PageElements to this one's contents.
:param tags: A list of PageElements.
:param tags: A list of PageElements. If a single Tag is
provided instead, this PageElement's contents will be extended
with that Tag's contents.
"""
if isinstance(tags, Tag):
# Calling self.append() on another tag's contents will change
# the list we're iterating over. Make a list that won't
# change.
tags = list(tags.contents)
tags = tags.contents
if isinstance(tags, list):
# Moving items around the tree may change their position in
# the original list. Make a list that won't change.
tags = list(tags)
for tag in tags:
self.append(tag)
@ -586,8 +589,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements.
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +628,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.next_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
@ -663,8 +670,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit, self.previous_elements,
**kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit, self.previous_elements,
_stacklevel=_stacklevel+1, **kwargs
)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
@ -702,8 +712,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.previous_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
@ -724,7 +737,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
l = self.find_parents(name, attrs, 1, **kwargs)
l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
@ -744,8 +757,9 @@ class PageElement(object):
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@ -771,19 +785,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs):
r = None
l = method(name, attrs, string, 1, **kwargs)
l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
_stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=_stacklevel
)
if isinstance(name, SoupStrainer):
@ -1306,7 +1321,8 @@ class Tag(PageElement):
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
preserve_whitespace_tags=self.preserve_whitespace_tags,
interesting_string_types=self.interesting_string_types
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
@ -1558,7 +1574,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.find(tag_name)
# We special case contents to avoid recursion.
@ -1862,7 +1878,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
**kwargs)
if l:
r = l[0]
return r
@ -1889,7 +1906,9 @@ class Tag(PageElement):
generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
@ -1993,7 +2012,7 @@ class Tag(PageElement):
"""
warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.has_attr(key)
@ -2024,7 +2043,7 @@ class SoupStrainer(object):
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=2
)
self.name = self._normalize_search_value(name)

View file

@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.

View file

@ -29,6 +29,29 @@ from bs4.builder import (
)
default_builder = HTMLParserTreeBuilder
# Some tests depend on specific third-party libraries. We use
# @pytest.mark.skipIf on the following conditionals to skip them
# if the libraries are not installed.
try:
from soupsieve import SelectorSyntaxError
SOUP_SIEVE_PRESENT = True
except ImportError:
SOUP_SIEVE_PRESENT = False
try:
import html5lib
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
LXML_PRESENT = False
LXML_VERSION = (0,)
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object):
@pytest.mark.parametrize(
"multi_valued_attributes",
[None, dict(b=['class']), {'*': ['notclass']}]
[None, {}, dict(b=['class']), {'*': ['notclass']}]
)
def test_attribute_not_multi_valued(self, multi_valued_attributes):
markup = '<a class="a b c">'
markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>'
soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
assert soup.a['class'] == 'a b c'
@ -820,26 +843,27 @@ Hello, world!
soup = self.soup(text)
assert soup.p.encode("utf-8") == expected
def test_real_iso_latin_document(self):
def test_real_iso_8859_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# Here it is in Unicode. Note that it claims to be in ISO-8859-1.
unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
# That's because we're going to encode it into ISO-8859-1,
# and use that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
# Parse the ISO-8859-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# UTF-8 instead of ISO-8859-1.
expected = unicode_html.replace("ISO-8859-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
assert isinstance(soup.contents[0], Comment)
assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?'
assert "html" == soup.contents[0].next_element.name
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator

View file

@ -10,22 +10,23 @@ from bs4.builder import (
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
from . import (
HTML5LIB_PRESENT,
LXML_PRESENT,
)
try:
if HTML5LIB_PRESENT:
from bs4.builder import HTML5TreeBuilder
if LXML_PRESENT:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
# TODO: Split out the lxml and html5lib tests into their own classes
# and gate with pytest.mark.skipIf.
class TestBuiltInRegistry(object):
"""Test the built-in registry with the default builders registered."""

View file

@ -17,26 +17,24 @@ class TestUnicodeDammit(object):
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == markup
def test_smart_quotes_to_unicode(self):
@pytest.mark.parametrize(
"smart_quotes_to,expect_converted",
[(None, "\u2018\u2019\u201c\u201d"),
("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
("ascii", "''" + '""'),
]
)
def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
"""Verify the functionality of the smart_quotes_to argument
to the UnicodeDammit constructor."""
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
assert dammit.unicode_markup == """<foo>''""</foo>"""
converted = UnicodeDammit(
markup, known_definite_encodings=["windows-1252"],
smart_quotes_to=smart_quotes_to
).unicode_markup
assert converted == "<foo>{}</foo>".format(expect_converted)
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
def setup_method(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites
# are substituted, and no others.
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
@pytest.mark.parametrize(
"original,substituted",
[
# Basic case. Unicode characters corresponding to named
# HTML entites are substituted; others are not.
("foo\u2200\N{SNOWMAN}\u00f5bar",
"foo&forall;\N{SNOWMAN}&otilde;bar"),
# MS smart quotes are a common source of frustration, so we
# give them a special test.
('foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
]
)
def test_substitute_html(self, original, substituted):
assert self.sub.substitute_html(original) == substituted
def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them

View file

@ -1,27 +1,26 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import pytest
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError as e:
HTML5LIB_PRESENT = False
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from . import (
HTML5LIB_PRESENT,
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
@pytest.mark.skipif(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
reason="html5lib seems not to be present, not testing its tree builder."
)
class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
from bs4.builder import HTML5TreeBuilder
return HTML5TreeBuilder
def test_soupstrainer(self):
@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
soup = BeautifulSoup(markup, "html5lib", parse_only=strainer)
assert soup.decode() == self.document_for(markup)
assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message)
[warning] = w
assert warning.filename == __file__
assert "the html5lib tree builder doesn't support parse_only" in str(warning.message)
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""

View file

@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
assert with_element == expect
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)

View file

@ -1,16 +1,10 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import pickle
import pytest
import re
import warnings
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
from . import LXML_PRESENT, LXML_VERSION
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer
from . import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SOUP_SIEVE_PRESENT,
SoupTest,
skipIf,
)
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
reason="Skipping doctype test for old version of lxml to avoid segfault."
)
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
assert "<b/>" == str(soup.b)
assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)
[warning] = w
assert warning.filename == __file__
assert "BeautifulStoneSoup class is deprecated" in str(warning.message)
def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from
@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
assert "sourceline" == soup.p.sourceline.name
assert "sourcepos" == soup.p.sourcepos.name
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
}
@pytest.mark.skipif(
not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
)
def test_namespace_interaction_with_select_and_find(self):
# Demonstrate how namespaces interact with select* and
# find* methods.

View file

@ -3,15 +3,18 @@ import copy
import pickle
import pytest
from soupsieve import SelectorSyntaxError
from bs4 import BeautifulSoup
from bs4.element import (
Comment,
SoupStrainer,
)
from . import SoupTest
from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
if SOUP_SIEVE_PRESENT:
from soupsieve import SelectorSyntaxError
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
@ -213,6 +216,7 @@ class TestFormatters(SoupTest):
assert soup.contents[0].name == 'pre'
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
@ -694,6 +698,7 @@ class TestPersistence(SoupTest):
assert tag.can_be_empty_element == copied.can_be_empty_element
assert tag.cdata_list_attributes == copied.cdata_list_attributes
assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
assert tag.interesting_string_types == copied.interesting_string_types
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.

View file

@ -30,19 +30,11 @@ from bs4.element import (
from . import (
default_builder,
LXML_PRESENT,
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
@ -139,7 +131,7 @@ class TestConstructor(SoupTest):
assert " an id " == a['id']
assert ["a", "class"] == a['class']
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
# TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
@ -219,10 +211,17 @@ class TestConstructor(SoupTest):
class TestWarnings(SoupTest):
# Note that some of the tests in this class create BeautifulSoup
# objects directly rather than using self.soup(). That's
# because SoupTest.soup is defined in a different file,
# which will throw off the assertion in _assert_warning
# that the code that triggered the warning is in the same
# file as the test.
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
assert w.filename == __file__
return w
raise Exception("%s warning not found in %r" % (cls, warnings))
@ -243,13 +242,17 @@ class TestWarnings(SoupTest):
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
soup = self.soup("<a><b></b></a>")
assert [] == w
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
soup = BeautifulSoup(
"<a><b></b></a>", "html.parser",
parseOnlyThese=SoupStrainer("b"),
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "parseOnlyThese" in msg
assert "parse_only" in msg
assert b"<b></b>" == soup.encode()
@ -257,8 +260,11 @@ class TestWarnings(SoupTest):
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
soup = BeautifulSoup(
utf8, "html.parser", fromEncoding="utf8"
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "fromEncoding" in msg
assert "from_encoding" in msg
assert "utf8" == soup.original_encoding
@ -276,7 +282,7 @@ class TestWarnings(SoupTest):
# A warning is issued if the "markup" looks like the name of
# an HTML or text file, or a full path to a file on disk.
with warnings.catch_warnings(record=True) as w:
soup = self.soup("markup" + extension)
soup = BeautifulSoup("markup" + extension, "html.parser")
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
assert "looks more like a filename" in str(warning.message)
@ -291,11 +297,11 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("markup" + extension)
assert [] == w
def test_url_warning_with_bytes_url(self):
url = b"http://www.crummybytes.com/"
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(url)
soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@ -307,7 +313,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(url)
soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@ -347,18 +353,22 @@ class TestNewTag(SoupTest):
assert "foo" == new_tag.name
assert dict(bar="baz", name="a name") == new_tag.attrs
assert None == new_tag.parent
@pytest.mark.skipif(
not LXML_PRESENT,
reason="lxml not installed, cannot parse XML document"
)
def test_xml_tag_inherits_self_closing_rules_from_builder(self):
xml_soup = BeautifulSoup("", "xml")
xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p")
# Both the <br> and <p> tag are empty-element, just because
# they have no contents.
assert b"<br/>" == xml_br.encode()
assert b"<p/>" == xml_p.encode()
def test_tag_inherits_self_closing_rules_from_builder(self):
if LXML_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p")
# Both the <br> and <p> tag are empty-element, just because
# they have no contents.
assert b"<br/>" == xml_br.encode()
assert b"<p/>" == xml_p.encode()
html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p")
@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
assert soup_from_unicode.encode('utf-8') == self.utf8_data
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")

View file

@ -33,7 +33,6 @@ from bs4.element import (
)
from . import (
SoupTest,
skipIf,
)
class TestFind(SoupTest):
@ -910,12 +909,16 @@ class TestTreeModification(SoupTest):
soup.a.extend(l)
assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode()
def test_extend_with_another_tags_contents(self):
@pytest.mark.parametrize(
"get_tags", [lambda tag: tag, lambda tag: tag.contents]
)
def test_extend_with_another_tags_contents(self, get_tags):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
d2.extend(d1)
tags = get_tags(d1)
d2.extend(tags)
assert '<div id="d1"></div>' == d1.decode()
assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode()
@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest):
class TestDeprecatedArguments(SoupTest):
def test_find_type_method_string(self):
@pytest.mark.parametrize(
"method_name", [
"find", "find_all", "find_parent", "find_parents",
"find_next", "find_all_next", "find_previous",
"find_all_previous", "find_next_sibling", "find_next_siblings",
"find_previous_sibling", "find_previous_siblings",
]
)
def test_find_type_method_string(self, method_name):
soup = self.soup("<a>some</a><b>markup</b>")
method = getattr(soup.b, method_name)
with warnings.catch_warnings(record=True) as w:
[result] = soup.find_all(text='markup')
assert result == 'markup'
assert result.parent.name == 'b'
msg = str(w[0].message)
method(text='markup')
[warning] = w
assert warning.filename == __file__
msg = str(warning.message)
assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."
def test_soupstrainer_constructor_string(self):
with warnings.catch_warnings(record=True) as w:
strainer = SoupStrainer(text="text")
assert strainer.text == 'text'
msg = str(w[0].message)
[warning] = w
msg = str(warning.message)
assert warning.filename == __file__
assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."