Bump beautifulsoup4 from 4.11.2 to 4.12.2 (#2037)

* Bump beautifulsoup4 from 4.11.2 to 4.12.2

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.2 to 4.12.2.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.12.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in:
dependabot[bot] 2023-08-23 21:38:49 -07:00 committed by GitHub
commit e70e08c3f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
32 changed files with 1439 additions and 755 deletions

View file

@ -297,37 +297,11 @@ class TreeBuilderSmokeTest(object):
markup, multi_valued_attributes=multi_valued_attributes
)
assert soup.a['class'] == ['a', 'b', 'c']
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\n<![\xff\xfe\xfe\xcd\x00',
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
# https://bugs.python.org/issue34480
#
#b'<![n\x00'
]
for markup in bad_markup:
with warnings.catch_warnings(record=False):
soup = self.soup(markup)
def test_invalid_doctype(self):
markup = '<![if word]>content<![endif]>'
markup = '<!DOCTYPE html]ff>'
soup = self.soup(markup)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
@ -577,8 +551,8 @@ Hello, world!
"""Whitespace must be preserved in <pre> and <textarea> tags,
even if that would mean not prettifying the markup.
"""
pre_markup = "<pre> </pre>"
textarea_markup = "<textarea> woo\nwoo </textarea>"
pre_markup = "<pre>a z</pre>\n"
textarea_markup = "<textarea> woo\nwoo </textarea>\n"
self.assert_soup(pre_markup)
self.assert_soup(textarea_markup)
@ -589,7 +563,7 @@ Hello, world!
assert soup.textarea.prettify() == textarea_markup
soup = self.soup("<textarea></textarea>")
assert soup.textarea.prettify() == "<textarea></textarea>"
assert soup.textarea.prettify() == "<textarea></textarea>\n"
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""

View file

@ -0,0 +1 @@
˙<!DOCTyPEV PUBLIC'''Đ'

View file

@ -0,0 +1 @@
)<a><math><TR><a><mI><a><p><a>

View file

@ -0,0 +1 @@
-<math><sElect><mi><sElect><sElect>

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
ñ<table><svg><html>

487
lib/bs4/tests/test_css.py Normal file
View file

@ -0,0 +1,487 @@
import pytest
import types
from unittest.mock import MagicMock
from bs4 import (
CSS,
BeautifulSoup,
ResultSet,
)
from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
if SOUP_SIEVE_PRESENT:
from soupsieve import SelectorSyntaxError
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
This functionality is implemented in soupsieve, which has a much
more comprehensive test suite, so this is basically an extra check
that soupsieve works as expected.
"""
HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>
<div id="footer">
</div>
"""
def setup_method(self):
self.soup = BeautifulSoup(self.HTML, 'html.parser')
def assert_selects(self, selector, expected_ids, **kwargs):
results = self.soup.select(selector, **kwargs)
assert isinstance(results, ResultSet)
el_ids = [el['id'] for el in results]
el_ids.sort()
expected_ids.sort()
assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
selector, ', '.join(expected_ids), ', '.join(el_ids)
)
assertSelect = assert_selects
def assert_select_multiple(self, *tests):
for selector, expected_ids in tests:
self.assert_selects(selector, expected_ids)
def test_precompiled(self):
sel = self.soup.css.compile('div')
els = self.soup.select(sel)
assert len(els) == 4
for div in els:
assert div.name == 'div'
el = self.soup.select_one(sel)
assert 'main' == el['id']
def test_one_tag_one(self):
els = self.soup.select('title')
assert len(els) == 1
assert els[0].name == 'title'
assert els[0].contents == ['The title']
def test_one_tag_many(self):
els = self.soup.select('div')
assert len(els) == 4
for div in els:
assert div.name == 'div'
el = self.soup.select_one('div')
assert 'main' == el['id']
def test_select_one_returns_none_if_no_match(self):
match = self.soup.select_one('nonexistenttag')
assert None == match
def test_tag_in_tag_one(self):
els = self.soup.select('div div')
self.assert_selects('div div', ['inner', 'data1'])
def test_tag_in_tag_many(self):
for selector in ('html div', 'html body div', 'body div'):
self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
def test_limit(self):
self.assert_selects('html div', ['main'], limit=1)
self.assert_selects('html body div', ['inner', 'main'], limit=2)
self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
limit=10)
def test_tag_no_match(self):
assert len(self.soup.select('del')) == 0
def test_invalid_tag(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select('tag%t')
def test_select_dashed_tag_ids(self):
self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])
def test_select_dashed_by_id(self):
dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
assert dashed[0].name == 'custom-dashed-tag'
assert dashed[0]['id'] == 'dash2'
def test_dashed_tag_text(self):
assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'
def test_select_dashed_matches_find_all(self):
assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')
def test_header_tags(self):
self.assert_select_multiple(
('h1', ['header1']),
('h2', ['header2', 'header3']),
)
def test_class_one(self):
for selector in ('.onep', 'p.onep', 'html p.onep'):
els = self.soup.select(selector)
assert len(els) == 1
assert els[0].name == 'p'
assert els[0]['class'] == ['onep']
def test_class_mismatched_tag(self):
els = self.soup.select('div.onep')
assert len(els) == 0
def test_one_id(self):
for selector in ('div#inner', '#inner', 'div div#inner'):
self.assert_selects(selector, ['inner'])
def test_bad_id(self):
els = self.soup.select('#doesnotexist')
assert len(els) == 0
def test_items_in_id(self):
els = self.soup.select('div#inner p')
assert len(els) == 3
for el in els:
assert el.name == 'p'
assert els[1]['class'] == ['onep']
assert not els[0].has_attr('class')
def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
assert len(self.soup.select(selector)) == 0
def test_multi_class_support(self):
for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
self.assert_selects(selector, ['pmulti'])
def test_multi_class_selection(self):
for selector in ('.class1.class3', '.class3.class2',
'.class1.class2.class3'):
self.assert_selects(selector, ['pmulti'])
def test_child_selector(self):
self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
self.assert_selects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self):
self.assert_select_multiple(
('p[class="onep"]', ['p1']),
('p[id="p1"]', ['p1']),
('[class="onep"]', ['p1']),
('[id="p1"]', ['p1']),
('link[rel="stylesheet"]', ['l1']),
('link[type="text/css"]', ['l1']),
('link[href="blah.css"]', ['l1']),
('link[href="no-blah.css"]', []),
('[rel="stylesheet"]', ['l1']),
('[type="text/css"]', ['l1']),
('[href="blah.css"]', ['l1']),
('[href="no-blah.css"]', []),
('p[href="no-blah.css"]', []),
('[href="no-blah.css"]', []),
)
def test_attribute_tilde(self):
self.assert_select_multiple(
('p[class~="class1"]', ['pmulti']),
('p[class~="class2"]', ['pmulti']),
('p[class~="class3"]', ['pmulti']),
('[class~="class1"]', ['pmulti']),
('[class~="class2"]', ['pmulti']),
('[class~="class3"]', ['pmulti']),
('a[rel~="friend"]', ['bob']),
('a[rel~="met"]', ['bob']),
('[rel~="friend"]', ['bob']),
('[rel~="met"]', ['bob']),
)
def test_attribute_startswith(self):
self.assert_select_multiple(
('[rel^="style"]', ['l1']),
('link[rel^="style"]', ['l1']),
('notlink[rel^="notstyle"]', []),
('[rel^="notstyle"]', []),
('link[rel^="notstyle"]', []),
('link[href^="bla"]', ['l1']),
('a[href^="http://"]', ['bob', 'me']),
('[href^="http://"]', ['bob', 'me']),
('[id^="p"]', ['pmulti', 'p1']),
('[id^="m"]', ['me', 'main']),
('div[id^="m"]', ['main']),
('a[id^="m"]', ['me']),
('div[data-tag^="dashed"]', ['data1'])
)
def test_attribute_endswith(self):
self.assert_select_multiple(
('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']),
('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
('div[id$="1"]', ['data1']),
('[id$="noending"]', []),
)
def test_attribute_contains(self):
self.assert_select_multiple(
# From test_attribute_startswith
('[rel*="style"]', ['l1']),
('link[rel*="style"]', ['l1']),
('notlink[rel*="notstyle"]', []),
('[rel*="notstyle"]', []),
('link[rel*="notstyle"]', []),
('link[href*="bla"]', ['l1']),
('[href*="http://"]', ['bob', 'me']),
('[id*="p"]', ['pmulti', 'p1']),
('div[id*="m"]', ['main']),
('a[id*="m"]', ['me']),
# From test_attribute_endswith
('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']),
('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
('div[id*="1"]', ['data1']),
('[id*="noending"]', []),
# New for this test
('[href*="."]', ['bob', 'me', 'l1']),
('a[href*="."]', ['bob', 'me']),
('link[href*="."]', ['l1']),
('div[id*="n"]', ['main', 'inner']),
('div[id*="nn"]', ['inner']),
('div[data-tag*="edval"]', ['data1'])
)
def test_attribute_exact_or_hypen(self):
self.assert_select_multiple(
('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('p[lang|="fr"]', ['lang-fr']),
('p[lang|="gb"]', []),
)
def test_attribute_exists(self):
self.assert_select_multiple(
('[rel]', ['l1', 'bob', 'me']),
('link[rel]', ['l1']),
('a[rel]', ['bob', 'me']),
('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
('p[class]', ['p1', 'pmulti']),
('[blah]', []),
('p[blah]', []),
('div[data-tag]', ['data1'])
)
def test_quoted_space_in_selector_name(self):
html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
soup = BeautifulSoup(html, 'html.parser')
[chosen] = soup.select('div[style="display: right"]')
assert "yes" == chosen.string
def test_unsupported_pseudoclass(self):
with pytest.raises(NotImplementedError):
self.soup.select("a:no-such-pseudoclass")
with pytest.raises(SelectorSyntaxError):
self.soup.select("a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
assert len(els) == 1
assert els[0].string == 'Some text'
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
assert len(els) == 1
assert els[0].string == 'Another'
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
assert len(els) == 0
# Zero will select no tags.
els = self.soup.select('div p:nth-of-type(0)')
assert len(els) == 0
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
assert len(els) == 1
assert els[0].string == 'Some text'
def test_id_child_selector_nth_of_type(self):
self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
inner = self.soup.find("div", id="main")
selected = inner.select("div")
# The <div id="inner"> tag was selected. The <div id="footer">
# tag was not.
self.assert_selects_ids(selected, ['inner', 'data1'])
def test_overspecified_child_id(self):
self.assert_selects(".fancy #inner", ['inner'])
self.assert_selects(".normal #inner", [])
def test_adjacent_sibling_selector(self):
self.assert_selects('#p1 + h2', ['header2'])
self.assert_selects('#p1 + h2 + p', ['pmulti'])
self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
assert [] == self.soup.select('#p1 + p')
def test_general_sibling_selector(self):
self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
self.assert_selects('#p1 ~ #header2', ['header2'])
self.assert_selects('#p1 ~ h2 + a', ['me'])
self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
assert [] == self.soup.select('#inner ~ h2')
def test_dangling_combinator(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select('h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
# Test the selector grouping operator (the comma)
def test_multiple_select(self):
self.assert_selects('x, y', ['xid', 'yid'])
def test_multiple_select_with_no_space(self):
self.assert_selects('x,y', ['xid', 'yid'])
def test_multiple_select_with_more_space(self):
self.assert_selects('x, y', ['xid', 'yid'])
def test_multiple_select_duplicated(self):
self.assert_selects('x, x', ['xid'])
def test_multiple_select_sibling(self):
self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
def test_multiple_select_tag_and_direct_descendant(self):
self.assert_selects('x, y > z', ['xid', 'zidb'])
def test_multiple_select_direct_descendant_and_tags(self):
self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_multiple_select_indirect_descendant(self):
self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select(',x, y')
with pytest.raises(SelectorSyntaxError):
self.soup.select('x,,y')
def test_multiple_select_attrs(self):
self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
def test_multiple_select_ids(self):
self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
def test_multiple_select_nested(self):
self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])
def test_select_duplicate_elements(self):
# When markup contains duplicate elements, a multiple select
# will find all of them.
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
soup = BeautifulSoup(markup, 'html.parser')
selected = soup.select(".c1, .c2")
assert 3 == len(selected)
# Verify that find_all finds the same elements, though because
# of an implementation detail it finds them in a different
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected
def test_closest(self):
inner = self.soup.find("div", id="inner")
closest = inner.css.closest("div[id=main]")
assert closest == self.soup.find("div", id="main")
def test_match(self):
inner = self.soup.find("div", id="inner")
main = self.soup.find("div", id="main")
assert inner.css.match("div[id=main]") == False
assert main.css.match("div[id=main]") == True
def test_iselect(self):
gen = self.soup.css.iselect("h2")
assert isinstance(gen, types.GeneratorType)
[header2, header3] = gen
assert header2['id'] == 'header2'
assert header3['id'] == 'header3'
def test_filter(self):
inner = self.soup.find("div", id="inner")
results = inner.css.filter("h2")
assert len(inner.css.filter("h2")) == 2
results = inner.css.filter("h2[id=header3]")
assert isinstance(results, ResultSet)
[result] = results
assert result['id'] == 'header3'
def test_escape(self):
m = self.soup.css.escape
assert m(".foo#bar") == '\\.foo\\#bar'
assert m("()[]{}") == '\\(\\)\\[\\]\\{\\}'
assert m(".foo") == self.soup.css.escape(".foo")

View file

@ -80,20 +80,20 @@ class TestFormatter(SoupTest):
@pytest.mark.parametrize(
"indent,expect",
[
(None, '<a>\n<b>\ntext\n</b>\n</a>'),
(-1, '<a>\n<b>\ntext\n</b>\n</a>'),
(0, '<a>\n<b>\ntext\n</b>\n</a>'),
("", '<a>\n<b>\ntext\n</b>\n</a>'),
(None, '<a>\n<b>\ntext\n</b>\n</a>\n'),
(-1, '<a>\n<b>\ntext\n</b>\n</a>\n'),
(0, '<a>\n<b>\ntext\n</b>\n</a>\n'),
("", '<a>\n<b>\ntext\n</b>\n</a>\n'),
(1, '<a>\n <b>\n text\n </b>\n</a>'),
(2, '<a>\n <b>\n text\n </b>\n</a>'),
(1, '<a>\n <b>\n text\n </b>\n</a>\n'),
(2, '<a>\n <b>\n text\n </b>\n</a>\n'),
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'),
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'),
# Some invalid inputs -- the default behavior is used.
(object(), '<a>\n <b>\n text\n </b>\n</a>'),
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>'),
(object(), '<a>\n <b>\n text\n </b>\n</a>\n'),
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>\n'),
]
)
def test_indent(self, indent, expect):

View file

@ -0,0 +1,91 @@
"""This file contains test cases reported by third parties using
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
represent real problems with Beautiful Soup, but many are problems in
libraries that Beautiful Soup depends on, and many of the test cases
represent different ways of triggering the same problem.
Grouping these test cases together makes it easy to see which test
cases represent the same problem, and puts the test cases in close
proximity to code that can trigger the problems.
"""
import os
import pytest
from bs4 import (
BeautifulSoup,
ParserRejectedMarkup,
)
class TestFuzz(object):
# Test case markup files from fuzzers are given this extension so
# they can be included in builds.
TESTCASE_SUFFIX = ".testcase"
# This class of error has been fixed by catching a less helpful
# exception from html.parser and raising ParserRejectedMarkup
# instead.
@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
]
)
def test_rejected_markup(self, filename):
markup = self.__markup(filename)
with pytest.raises(ParserRejectedMarkup):
BeautifulSoup(markup, 'html.parser')
# This class of error has to do with very deeply nested documents
# which overflow the Python call stack when the tree is converted
# to a string. This is an issue with Beautiful Soup which was fixed
# as part of [bug=1471755].
@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
"clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
"clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
"clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
]
)
def test_deeply_nested_document(self, filename):
# Parsing the document and encoding it back to a string is
# sufficient to demonstrate that the overflow problem has
# been fixed.
markup = self.__markup(filename)
BeautifulSoup(markup, 'html.parser').encode()
# This class of error represents problems with html5lib's parser,
# not Beautiful Soup. I use
# https://github.com/html5lib/html5lib-python/issues/568 to notify
# the html5lib developers of these issues.
@pytest.mark.skip("html5lib problems")
@pytest.mark.parametrize(
"filename", [
# b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
"clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",
# b')<a><math><TR><a><mI><a><p><a>'
"clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",
# b'-<math><sElect><mi><sElect><sElect>'
"clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",
# b'ñ<table><svg><html>'
"clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",
# <TABLE>, some ^@ characters, some <math> tags.
"clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",
# Nested table
"crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
]
)
def test_html5lib_parse_errors(self, filename):
markup = self.__markup(filename)
print(BeautifulSoup(markup, 'html5lib').encode())
def __markup(self, filename):
if not filename.endswith(self.TESTCASE_SUFFIX):
filename += self.TESTCASE_SUFFIX
this_dir = os.path.split(__file__)[0]
path = os.path.join(this_dir, 'fuzz', filename)
return open(path, 'rb').read()

View file

@ -3,9 +3,11 @@ trees."""
from pdb import set_trace
import pickle
import pytest
import warnings
from bs4.builder import (
HTMLParserTreeBuilder,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
)
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
def test_rejected_input(self):
# Python's html.parser will occasionally reject markup,
# especially when there is a problem with the initial DOCTYPE
# declaration. Different versions of Python sound the alarm in
# different ways, but Beautiful Soup consistently raises
# errors as ParserRejectedMarkup exceptions.
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://github.com/python/cpython/issues/81928
b'\n<![\xff\xfe\xfe\xcd\x00',
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
# https://github.com/python/cpython/issues/78661
#
b'<![n\x00',
b"<![UNKNOWN[]]>",
]
for markup in bad_markup:
with pytest.raises(ParserRejectedMarkup):
soup = self.soup(markup)
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass

View file

@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
assert soup.find('prefix:tag3').name == 'tag3'
assert soup.subtag.find('prefix:tag3').name == 'tag3'
def test_pickle_removes_builder(self):
# The lxml TreeBuilder is not picklable, so it won't be
# preserved in a pickle/unpickle operation.
def test_pickle_restores_builder(self):
# The lxml TreeBuilder is not picklable, so when unpickling
# a document created with it, a new TreeBuilder of the
# appropriate class is created.
soup = self.soup("<a>some markup</a>")
assert isinstance(soup.builder, self.default_builder)
pickled = pickle.dumps(soup)
unpickled = pickle.loads(pickled)
assert "some markup" == unpickled.a.string
assert unpickled.builder is None
assert unpickled.builder != soup.builder
assert isinstance(unpickled.builder, self.default_builder)

View file

@ -2,20 +2,18 @@
import copy
import pickle
import pytest
import sys
from bs4 import BeautifulSoup
from bs4.element import (
Comment,
ResultSet,
SoupStrainer,
)
from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
if SOUP_SIEVE_PRESENT:
from soupsieve import SelectorSyntaxError
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
@ -51,10 +49,21 @@ class TestEncoding(SoupTest):
assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(
encoding="utf8"
)
def test_encode_deeply_nested_document(self):
# This test verifies that encoding a string doesn't involve
# any recursive function calls. If it did, this test would
# overflow the Python interpreter stack.
limit = sys.getrecursionlimit() + 1
markup = "<span>" * limit
soup = self.soup(markup)
encoded = soup.encode()
assert limit == encoded.count(b"<span>")
def test_deprecated_renderContents(self):
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
soup.renderContents()
assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
def test_repr(self):
@ -159,7 +168,31 @@ class TestFormatters(SoupTest):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
def test_prettify_handles_nested_string_literal_tags(self):
# Most of this markup is inside a <pre> tag, so prettify()
# only does three things to it:
# 1. Add a newline and a space between the <div> and the <pre>
# 2. Add a newline after the </pre>
# 3. Add a newline at the end.
#
# The contents of the <pre> tag are left completely alone. In
# particular, we don't start adding whitespace again once we
# encounter the first </pre> tag, because we know it's not
# the one that put us into string literal mode.
markup = """<div><pre><code>some
<script><pre>code</pre></script> for you
</code></pre></div>"""
expect = """<div>
<pre><code>some
<script><pre>code</pre></script> for you
</code></pre>
</div>
"""
soup = self.soup(markup)
assert expect == soup.div.prettify()
def test_prettify_accepts_formatter_function(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
@ -216,429 +249,6 @@ class TestFormatters(SoupTest):
assert soup.contents[0].name == 'pre'
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
This functionality is implemented in soupsieve, which has a much
more comprehensive test suite, so this is basically an extra check
that soupsieve works as expected.
"""
HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>
<div id="footer">
</div>
"""
def setup_method(self):
self.soup = BeautifulSoup(self.HTML, 'html.parser')
def assert_selects(self, selector, expected_ids, **kwargs):
el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
el_ids.sort()
expected_ids.sort()
assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
selector, ', '.join(expected_ids), ', '.join(el_ids)
)
assertSelect = assert_selects
def assert_select_multiple(self, *tests):
for selector, expected_ids in tests:
self.assert_selects(selector, expected_ids)
def test_one_tag_one(self):
els = self.soup.select('title')
assert len(els) == 1
assert els[0].name == 'title'
assert els[0].contents == ['The title']
def test_one_tag_many(self):
els = self.soup.select('div')
assert len(els) == 4
for div in els:
assert div.name == 'div'
el = self.soup.select_one('div')
assert 'main' == el['id']
def test_select_one_returns_none_if_no_match(self):
match = self.soup.select_one('nonexistenttag')
assert None == match
def test_tag_in_tag_one(self):
els = self.soup.select('div div')
self.assert_selects('div div', ['inner', 'data1'])
def test_tag_in_tag_many(self):
for selector in ('html div', 'html body div', 'body div'):
self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
def test_limit(self):
self.assert_selects('html div', ['main'], limit=1)
self.assert_selects('html body div', ['inner', 'main'], limit=2)
self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
limit=10)
def test_tag_no_match(self):
assert len(self.soup.select('del')) == 0
def test_invalid_tag(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select('tag%t')
def test_select_dashed_tag_ids(self):
self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])
def test_select_dashed_by_id(self):
dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
assert dashed[0].name == 'custom-dashed-tag'
assert dashed[0]['id'] == 'dash2'
def test_dashed_tag_text(self):
assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'
def test_select_dashed_matches_find_all(self):
assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')
def test_header_tags(self):
self.assert_select_multiple(
('h1', ['header1']),
('h2', ['header2', 'header3']),
)
def test_class_one(self):
for selector in ('.onep', 'p.onep', 'html p.onep'):
els = self.soup.select(selector)
assert len(els) == 1
assert els[0].name == 'p'
assert els[0]['class'] == ['onep']
def test_class_mismatched_tag(self):
els = self.soup.select('div.onep')
assert len(els) == 0
def test_one_id(self):
for selector in ('div#inner', '#inner', 'div div#inner'):
self.assert_selects(selector, ['inner'])
def test_bad_id(self):
els = self.soup.select('#doesnotexist')
assert len(els) == 0
def test_items_in_id(self):
els = self.soup.select('div#inner p')
assert len(els) == 3
for el in els:
assert el.name == 'p'
assert els[1]['class'] == ['onep']
assert not els[0].has_attr('class')
def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
assert len(self.soup.select(selector)) == 0
def test_multi_class_support(self):
for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
self.assert_selects(selector, ['pmulti'])
def test_multi_class_selection(self):
for selector in ('.class1.class3', '.class3.class2',
'.class1.class2.class3'):
self.assert_selects(selector, ['pmulti'])
def test_child_selector(self):
self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
self.assert_selects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self):
self.assert_select_multiple(
('p[class="onep"]', ['p1']),
('p[id="p1"]', ['p1']),
('[class="onep"]', ['p1']),
('[id="p1"]', ['p1']),
('link[rel="stylesheet"]', ['l1']),
('link[type="text/css"]', ['l1']),
('link[href="blah.css"]', ['l1']),
('link[href="no-blah.css"]', []),
('[rel="stylesheet"]', ['l1']),
('[type="text/css"]', ['l1']),
('[href="blah.css"]', ['l1']),
('[href="no-blah.css"]', []),
('p[href="no-blah.css"]', []),
('[href="no-blah.css"]', []),
)
def test_attribute_tilde(self):
self.assert_select_multiple(
('p[class~="class1"]', ['pmulti']),
('p[class~="class2"]', ['pmulti']),
('p[class~="class3"]', ['pmulti']),
('[class~="class1"]', ['pmulti']),
('[class~="class2"]', ['pmulti']),
('[class~="class3"]', ['pmulti']),
('a[rel~="friend"]', ['bob']),
('a[rel~="met"]', ['bob']),
('[rel~="friend"]', ['bob']),
('[rel~="met"]', ['bob']),
)
def test_attribute_startswith(self):
self.assert_select_multiple(
('[rel^="style"]', ['l1']),
('link[rel^="style"]', ['l1']),
('notlink[rel^="notstyle"]', []),
('[rel^="notstyle"]', []),
('link[rel^="notstyle"]', []),
('link[href^="bla"]', ['l1']),
('a[href^="http://"]', ['bob', 'me']),
('[href^="http://"]', ['bob', 'me']),
('[id^="p"]', ['pmulti', 'p1']),
('[id^="m"]', ['me', 'main']),
('div[id^="m"]', ['main']),
('a[id^="m"]', ['me']),
('div[data-tag^="dashed"]', ['data1'])
)
def test_attribute_endswith(self):
self.assert_select_multiple(
('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']),
('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
('div[id$="1"]', ['data1']),
('[id$="noending"]', []),
)
def test_attribute_contains(self):
self.assert_select_multiple(
# From test_attribute_startswith
('[rel*="style"]', ['l1']),
('link[rel*="style"]', ['l1']),
('notlink[rel*="notstyle"]', []),
('[rel*="notstyle"]', []),
('link[rel*="notstyle"]', []),
('link[href*="bla"]', ['l1']),
('[href*="http://"]', ['bob', 'me']),
('[id*="p"]', ['pmulti', 'p1']),
('div[id*="m"]', ['main']),
('a[id*="m"]', ['me']),
# From test_attribute_endswith
('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']),
('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
('div[id*="1"]', ['data1']),
('[id*="noending"]', []),
# New for this test
('[href*="."]', ['bob', 'me', 'l1']),
('a[href*="."]', ['bob', 'me']),
('link[href*="."]', ['l1']),
('div[id*="n"]', ['main', 'inner']),
('div[id*="nn"]', ['inner']),
('div[data-tag*="edval"]', ['data1'])
)
def test_attribute_exact_or_hypen(self):
self.assert_select_multiple(
('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('p[lang|="fr"]', ['lang-fr']),
('p[lang|="gb"]', []),
)
def test_attribute_exists(self):
self.assert_select_multiple(
('[rel]', ['l1', 'bob', 'me']),
('link[rel]', ['l1']),
('a[rel]', ['bob', 'me']),
('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
('p[class]', ['p1', 'pmulti']),
('[blah]', []),
('p[blah]', []),
('div[data-tag]', ['data1'])
)
def test_quoted_space_in_selector_name(self):
html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
soup = BeautifulSoup(html, 'html.parser')
[chosen] = soup.select('div[style="display: right"]')
assert "yes" == chosen.string
def test_unsupported_pseudoclass(self):
with pytest.raises(NotImplementedError):
self.soup.select("a:no-such-pseudoclass")
with pytest.raises(SelectorSyntaxError):
self.soup.select("a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
assert len(els) == 1
assert els[0].string == 'Some text'
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
assert len(els) == 1
assert els[0].string == 'Another'
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
assert len(els) == 0
# Zero will select no tags.
els = self.soup.select('div p:nth-of-type(0)')
assert len(els) == 0
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
assert len(els) == 1
assert els[0].string == 'Some text'
def test_id_child_selector_nth_of_type(self):
self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
inner = self.soup.find("div", id="main")
selected = inner.select("div")
# The <div id="inner"> tag was selected. The <div id="footer">
# tag was not.
self.assert_selects_ids(selected, ['inner', 'data1'])
def test_overspecified_child_id(self):
self.assert_selects(".fancy #inner", ['inner'])
self.assert_selects(".normal #inner", [])
def test_adjacent_sibling_selector(self):
self.assert_selects('#p1 + h2', ['header2'])
self.assert_selects('#p1 + h2 + p', ['pmulti'])
self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
assert [] == self.soup.select('#p1 + p')
def test_general_sibling_selector(self):
self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
self.assert_selects('#p1 ~ #header2', ['header2'])
self.assert_selects('#p1 ~ h2 + a', ['me'])
self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
assert [] == self.soup.select('#inner ~ h2')
def test_dangling_combinator(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select('h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
# Test the selector grouping operator (the comma)
def test_multiple_select(self):
self.assert_selects('x, y', ['xid', 'yid'])
def test_multiple_select_with_no_space(self):
self.assert_selects('x,y', ['xid', 'yid'])
def test_multiple_select_with_more_space(self):
self.assert_selects('x, y', ['xid', 'yid'])
def test_multiple_select_duplicated(self):
self.assert_selects('x, x', ['xid'])
def test_multiple_select_sibling(self):
self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
def test_multiple_select_tag_and_direct_descendant(self):
self.assert_selects('x, y > z', ['xid', 'zidb'])
def test_multiple_select_direct_descendant_and_tags(self):
self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_multiple_select_indirect_descendant(self):
self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
with pytest.raises(SelectorSyntaxError):
self.soup.select(',x, y')
with pytest.raises(SelectorSyntaxError):
self.soup.select('x,,y')
def test_multiple_select_attrs(self):
self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
def test_multiple_select_ids(self):
self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
def test_multiple_select_nested(self):
self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])
def test_select_duplicate_elements(self):
# When markup contains duplicate elements, a multiple select
# will find all of them.
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
soup = BeautifulSoup(markup, 'html.parser')
selected = soup.select(".c1, .c2")
assert 3 == len(selected)
# Verify that find_all finds the same elements, though because
# of an implementation detail it finds them in a different
# order.
for element in soup.find_all(class_=['c1', 'c2']):
assert element in selected
class TestPersistence(SoupTest):
"Testing features like pickle and deepcopy."
@ -668,12 +278,24 @@ class TestPersistence(SoupTest):
loaded = pickle.loads(dumped)
assert loaded.__class__ == BeautifulSoup
assert loaded.decode() == self.tree.decode()
def test_deepcopy_identity(self):
# Making a deepcopy of a tree yields an identical tree.
copied = copy.deepcopy(self.tree)
assert copied.decode() == self.tree.decode()
def test_copy_deeply_nested_document(self):
# This test verifies that copy and deepcopy don't involve any
# recursive function calls. If they did, this test would
# overflow the Python interpreter stack.
limit = sys.getrecursionlimit() + 1
markup = "<span>" * limit
soup = self.soup(markup)
copied = copy.copy(soup)
copied = copy.deepcopy(soup)
def test_copy_preserves_encoding(self):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding

View file

@ -24,6 +24,7 @@ from bs4.builder import (
from bs4.element import (
Comment,
SoupStrainer,
PYTHON_SPECIFIC_ENCODINGS,
Tag,
NavigableString,
)
@ -210,6 +211,47 @@ class TestConstructor(SoupTest):
assert [] == soup.string_container_stack
class TestOutput(SoupTest):
@pytest.mark.parametrize(
"eventual_encoding,actual_encoding", [
("utf-8", "utf-8"),
("utf-16", "utf-16"),
]
)
def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
# Most of the time, calling decode() on an XML document will
# give you a document declaration that mentions the encoding
# you intend to use when encoding the document as a
# bytestring.
soup = self.soup("<tag></tag>")
soup.is_xml = True
assert (f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
== soup.decode(eventual_encoding=eventual_encoding))
@pytest.mark.parametrize(
"eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
)
def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(self, eventual_encoding):
# But if you pass a Python internal encoding into decode(), or
# omit the eventual_encoding altogether, the document
# declaration won't mention any particular encoding.
soup = BeautifulSoup("<tag></tag>", "html.parser")
soup.is_xml = True
assert (f'<?xml version="1.0"?>\n<tag></tag>'
== soup.decode(eventual_encoding=eventual_encoding))
def test(self):
# BeautifulSoup subclasses Tag and extends the decode() method.
# Make sure the other Tag methods which call decode() call
# it correctly.
soup = self.soup("<tag></tag>")
assert b"<tag></tag>" == soup.encode(encoding="utf-8")
assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
assert "<tag></tag>" == soup.decode_contents()
assert "<tag>\n</tag>\n" == soup.prettify()
class TestWarnings(SoupTest):
# Note that some of the tests in this class create BeautifulSoup
# objects directly rather than using self.soup(). That's