Bump beautifulsoup4 from 4.11.1 to 4.11.2 (#1987)

* Bump beautifulsoup4 from 4.11.1 to 4.11.2 Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.1 to 4.11.2. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> * Update beautifulsoup4==4.11.2 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
2025-08-20 21:33:18 -07:00 · 2023-03-02 20:56:24 -08:00 · 2023-03-02 20:56:24 -08:00 · 8e42757b2d
commit 8e42757b2d
parent ded93ef2f5
23 changed files with 449 additions and 537 deletions
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.

-Beautiful Soup works with Python 3.5 and up. It works better if lxml
+Beautiful Soup works with Python 3.6 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.11.1"
-__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
+__version__ = "4.11.2"
+__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

@ -211,7 +211,7 @@ class BeautifulSoup(Tag):
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name),
-                    DeprecationWarning
+                    DeprecationWarning, stacklevel=3
                )
                return kwargs.pop(old_name)
            return None
@ -405,7 +405,8 @@ class BeautifulSoup(Tag):
                    'The input looks more like a URL than markup. You may want to use'
                    ' an HTTP client like requests to get the document behind'
                    ' the URL, and feed that document to Beautiful Soup.',
-                    MarkupResemblesLocatorWarning
+                    MarkupResemblesLocatorWarning,
+                    stacklevel=3
                )
                return True
        return False
@ -436,7 +437,7 @@ class BeautifulSoup(Tag):
                'The input looks more like a filename than markup. You may'
                ' want to open this file and pass the filehandle into'
                ' Beautiful Soup.',
-                MarkupResemblesLocatorWarning
+                MarkupResemblesLocatorWarning, stacklevel=3
            )
            return True
        return False
@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup):
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.',
-            DeprecationWarning
+            DeprecationWarning, stacklevel=2
        )
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)

--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@ -122,7 +122,7 @@ class TreeBuilder(object):
    
    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
-    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
-            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+            warnings.warn(
+                "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
+                stacklevel=3
+            )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
-            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+            warnings.warn(
+                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+                stacklevel=4
+            )
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        extra_kwargs = dict()
@ -249,9 +255,9 @@ class AttrList(object):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
-        if (name in list_attr.get('*')
+        if (name in list_attr.get('*', [])
            or (self.element.name in list_attr
-                and name in list_attr[self.element.name])):
+                and name in list_attr.get(self.element.name, []))):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -10,30 +10,9 @@ __all__ = [

 from html.parser import HTMLParser

-try:
-    from html.parser import HTMLParseError
-except ImportError as e:
-    # HTMLParseError is removed in Python 3.5. Since it can never be
-    # thrown in 3.5, we can just define our own class as a placeholder.
-    class HTMLParseError(Exception):
-        pass
-
 import sys
 import warnings

-# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
-# argument, which we'd like to set to False. Unfortunately,
-# http://bugs.python.org/issue13273 makes strict=True a better bet
-# before Python 3.2.3.
-#
-# At the end of this file, we monkeypatch HTMLParser so that
-# strict=True works well on Python 3.2.2.
-major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
-CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
-CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
-
-
 from bs4.element import (
    CData,
    Comment,
@ -90,20 +69,7 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
        self.already_closed_empty_element = []

        self._initialize_xml_detector()
-        
-    def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although
-        this requirement doesn't appear to be documented.

-        In Python 2, HTMLParser implements error() by raising an exception,
-        which we don't want to do.
-
-        In any event, this method is called only on very strange
-        markup and our best strategy is to pretend it didn't happen
-        and keep going.
-        """
-        warnings.warn(msg)
-        
    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):

        :param name: Character number, possibly in hexadecimal.
        """
-        # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed in all supported versions.
-        # http://bugs.python.org/issue13633
+        # TODO: This was originally a workaround for a bug in
+        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+        # been fixed, but removing this code still makes some
+        # Beautiful Soup tests fail. This needs investigation.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
@ -333,10 +300,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
-        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
-            parser_kwargs['strict'] = False
-        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
-            parser_kwargs['convert_charrefs'] = False
+        parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)
        
    def prepare_markup(self, markup, user_specified_encoding=None,
@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
-        try:
-            parser.feed(markup)
-            parser.close()
-        except HTMLParseError as e:
-            warnings.warn(RuntimeWarning(
-                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
-            raise e
+        parser.feed(markup)
+        parser.close()
        parser.already_closed_empty_element = []
-
-# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
-# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
-# string.
-#
-# XXX This code can be removed once most Python 3 users are on 3.2.3.
-if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
-    import re
-    attrfind_tolerant = re.compile(
-        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
-        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
-    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
-
-    locatestarttagend = re.compile(r"""
-  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
-      (?:\s*=\s*                     # value indicator
-        (?:'[^']*'                   # LITA-enclosed value
-          |\"[^\"]*\"                # LIT-enclosed value
-          |[^'\">\s]+                # bare value
-         )
-       )?
-     )
-   )*
-  \s*                                # trailing whitespace
-""", re.VERBOSE)
-    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
-
-    from html.parser import tagfind, attrfind
-
-    def parse_starttag(self, i):
-        self.__starttag_text = None
-        endpos = self.check_for_whole_start_tag(i)
-        if endpos < 0:
-            return endpos
-        rawdata = self.rawdata
-        self.__starttag_text = rawdata[i:endpos]
-
-        # Now parse the data between i+1 and j into a tag and attrs
-        attrs = []
-        match = tagfind.match(rawdata, i+1)
-        assert match, 'unexpected call to parse_starttag()'
-        k = match.end()
-        self.lasttag = tag = rawdata[i+1:k].lower()
-        while k < endpos:
-            if self.strict:
-                m = attrfind.match(rawdata, k)
-            else:
-                m = attrfind_tolerant.match(rawdata, k)
-            if not m:
-                break
-            attrname, rest, attrvalue = m.group(1, 2, 3)
-            if not rest:
-                attrvalue = None
-            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
-                 attrvalue[:1] == '"' == attrvalue[-1:]:
-                attrvalue = attrvalue[1:-1]
-            if attrvalue:
-                attrvalue = self.unescape(attrvalue)
-            attrs.append((attrname.lower(), attrvalue))
-            k = m.end()
-
-        end = rawdata[k:endpos].strip()
-        if end not in (">", "/>"):
-            lineno, offset = self.getpos()
-            if "\n" in self.__starttag_text:
-                lineno = lineno + self.__starttag_text.count("\n")
-                offset = len(self.__starttag_text) \
-                         - self.__starttag_text.rfind("\n")
-            else:
-                offset = offset + len(self.__starttag_text)
-            if self.strict:
-                self.error("junk characters in start tag: %r"
-                           % (rawdata[k:endpos][:20],))
-            self.handle_data(rawdata[i:endpos])
-            return endpos
-        if end.endswith('/>'):
-            # XHTML-style empty tag: <span attr="value" />
-            self.handle_startendtag(tag, attrs)
-        else:
-            self.handle_starttag(tag, attrs)
-            if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag)
-        return endpos
-
-    def set_cdata_mode(self, elem):
-        self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
-
-    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
-    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
-
-    CONSTRUCTOR_TAKES_STRICT = True
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -496,13 +496,16 @@ class PageElement(object):
    def extend(self, tags):
        """Appends the given PageElements to this one's contents.

-        :param tags: A list of PageElements.
+        :param tags: A list of PageElements. If a single Tag is
+            provided instead, this PageElement's contents will be extended
+            with that Tag's contents.
        """
        if isinstance(tags, Tag):
-            # Calling self.append() on another tag's contents will change
-            # the list we're iterating over. Make a list that won't
-            # change.
-            tags = list(tags.contents)
+            tags = tags.contents
+        if isinstance(tags, list):
+            # Moving items around the tree may change their position in
+            # the original list. Make a list that won't change.
+            tags = list(tags)
        for tag in tags:
            self.append(tag)

@ -586,8 +589,9 @@ class PageElement(object):
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet containing PageElements.
        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
        return self._find_all(name, attrs, string, limit, self.next_elements,
-                             **kwargs)
+                              _stacklevel=_stacklevel+1, **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +628,11 @@ class PageElement(object):
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
-        return self._find_all(name, attrs, string, limit,
-                              self.next_siblings, **kwargs)
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit,
+            self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
+        )
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

@ -663,8 +670,11 @@ class PageElement(object):
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
-        return self._find_all(name, attrs, string, limit, self.previous_elements,
-                           **kwargs)
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit, self.previous_elements,
+            _stacklevel=_stacklevel+1, **kwargs
+        )
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

@ -702,8 +712,11 @@ class PageElement(object):
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
-        return self._find_all(name, attrs, string, limit,
-                              self.previous_siblings, **kwargs)
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(
+            name, attrs, string, limit,
+            self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
+        )
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

@ -724,7 +737,7 @@ class PageElement(object):
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
-        l = self.find_parents(name, attrs, 1, **kwargs)
+        l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
        if l:
            r = l[0]
        return r
@ -744,8 +757,9 @@ class PageElement(object):
        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
+        _stacklevel = kwargs.pop('_stacklevel', 2)
        return self._find_all(name, attrs, None, limit, self.parents,
-                             **kwargs)
+                              _stacklevel=_stacklevel+1, **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

@ -771,19 +785,20 @@ class PageElement(object):

    def _find_one(self, method, name, attrs, string, **kwargs):
        r = None
-        l = method(name, attrs, string, 1, **kwargs)
+        l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, string, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."
+        _stacklevel = kwargs.pop('_stacklevel', 3)

        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
-                DeprecationWarning
+                DeprecationWarning, stacklevel=_stacklevel
            )

        if isinstance(name, SoupStrainer):
@ -1306,7 +1321,8 @@ class Tag(PageElement):
            sourceline=self.sourceline, sourcepos=self.sourcepos,
            can_be_empty_element=self.can_be_empty_element,
            cdata_list_attributes=self.cdata_list_attributes,
-            preserve_whitespace_tags=self.preserve_whitespace_tags
+            preserve_whitespace_tags=self.preserve_whitespace_tags,
+            interesting_string_types=self.interesting_string_types
        )
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
@ -1558,7 +1574,7 @@ class Tag(PageElement):
                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                    name=tag_name
                ),
-                DeprecationWarning
+                DeprecationWarning, stacklevel=2
            )
            return self.find(tag_name)
        # We special case contents to avoid recursion.
@ -1862,7 +1878,8 @@ class Tag(PageElement):
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        r = None
-        l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
+        l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
+                          **kwargs)
        if l:
            r = l[0]
        return r
@ -1889,7 +1906,9 @@ class Tag(PageElement):
        generator = self.descendants
        if not recursive:
            generator = self.children
-        return self._find_all(name, attrs, string, limit, generator, **kwargs)
+        _stacklevel = kwargs.pop('_stacklevel', 2)
+        return self._find_all(name, attrs, string, limit, generator,
+                              _stacklevel=_stacklevel+1, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2

@ -1993,7 +2012,7 @@ class Tag(PageElement):
        """
        warnings.warn(
            'has_key is deprecated. Use has_attr(key) instead.',
-            DeprecationWarning
+            DeprecationWarning, stacklevel=2
        )
        return self.has_attr(key)

@ -2024,7 +2043,7 @@ class SoupStrainer(object):
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
-                DeprecationWarning
+                DeprecationWarning, stacklevel=2
            )

        self.name = self._normalize_search_value(name)
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
-        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)

    
 class XMLFormatter(Formatter):
    """A generic Formatter for XML."""
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
-        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)


 # Set up aliases for the default formatters.
--- a/lib/bs4/tests/__init__.py
+++ b/lib/bs4/tests/__init__.py
@ -29,6 +29,29 @@ from bs4.builder import (
 )
 default_builder = HTMLParserTreeBuilder

+# Some tests depend on specific third-party libraries. We use
+# @pytest.mark.skipif on the following conditionals to skip them
+# if the libraries are not installed.
+try:
+    from soupsieve import SelectorSyntaxError
+    SOUP_SIEVE_PRESENT = True
+except ImportError:
+    SOUP_SIEVE_PRESENT = False
+
+try:
+    import html5lib
+    HTML5LIB_PRESENT = True
+except ImportError:
+    HTML5LIB_PRESENT = False
+
+try:
+    import lxml.etree
+    LXML_PRESENT = True
+    LXML_VERSION = lxml.etree.LXML_VERSION
+except ImportError:
+    LXML_PRESENT = False
+    LXML_VERSION = (0,)
+
 BAD_DOCUMENT = """A bare string
 <!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
 <!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object):

    @pytest.mark.parametrize(
        "multi_valued_attributes",
-        [None, dict(b=['class']), {'*': ['notclass']}]
+        [None, {}, dict(b=['class']), {'*': ['notclass']}]
    )
    def test_attribute_not_multi_valued(self, multi_valued_attributes):
-        markup = '<a class="a b c">'
+        markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>'
        soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
        assert soup.a['class'] == 'a b c'

@ -820,26 +843,27 @@ Hello, world!
        soup = self.soup(text)
        assert soup.p.encode("utf-8") == expected

-    def test_real_iso_latin_document(self):
+    def test_real_iso_8859_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

-        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        # Here it is in Unicode. Note that it claims to be in ISO-8859-1.
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

-        # That's because we're going to encode it into ISO-Latin-1, and use
-        # that to test.
+        # That's because we're going to encode it into ISO-8859-1,
+        # and use that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

-        # Parse the ISO-Latin-1 HTML.
+        # Parse the ISO-8859-1 HTML.
        soup = self.soup(iso_latin_html)
+
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
-        # UTF-8 instead of ISO-Latin-1.
-        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+        # UTF-8 instead of ISO-8859-1.
+        expected = unicode_html.replace("ISO-8859-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")
@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
        assert isinstance(soup.contents[0], Comment)
        assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?'
        assert "html" == soup.contents[0].next_element.name
-
-def skipIf(condition, reason):
-   def nothing(test, *args, **kwargs):
-       return None
-
-   def decorator(test_item):
-       if condition:
-           return nothing
-       else:
-           return test_item
-
-   return decorator
--- a/lib/bs4/tests/test_builder_registry.py
+++ b/lib/bs4/tests/test_builder_registry.py
@ -10,22 +10,23 @@ from bs4.builder import (
    TreeBuilderRegistry,
 )

-try:
-    from bs4.builder import HTML5TreeBuilder
-    HTML5LIB_PRESENT = True
-except ImportError:
-    HTML5LIB_PRESENT = False
+from . import (
+    HTML5LIB_PRESENT,
+    LXML_PRESENT,
+)

-try:
+if HTML5LIB_PRESENT:
+    from bs4.builder import HTML5TreeBuilder
+
+if LXML_PRESENT:
    from bs4.builder import (
        LXMLTreeBuilderForXML,
        LXMLTreeBuilder,
        )
-    LXML_PRESENT = True
-except ImportError:
-    LXML_PRESENT = False


+# TODO: Split out the lxml and html5lib tests into their own classes
+# and gate with pytest.mark.skipif.
 class TestBuiltInRegistry(object):
    """Test the built-in registry with the default builders registered."""

--- a/lib/bs4/tests/test_dammit.py
+++ b/lib/bs4/tests/test_dammit.py
@ -17,26 +17,24 @@ class TestUnicodeDammit(object):
        dammit = UnicodeDammit(markup)
        assert dammit.unicode_markup == markup

-    def test_smart_quotes_to_unicode(self):
+    @pytest.mark.parametrize(
+        "smart_quotes_to,expect_converted",
+        [(None, "\u2018\u2019\u201c\u201d"),
+         ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
+         ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
+         ("ascii", "''" + '""'),
+        ]
+    )
+    def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
+        """Verify the functionality of the smart_quotes_to argument
+        to the UnicodeDammit constructor."""
        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        assert dammit.unicode_markup == """<foo>''""</foo>"""
-
+        converted = UnicodeDammit(
+            markup, known_definite_encodings=["windows-1252"],
+            smart_quotes_to=smart_quotes_to
+        ).unicode_markup
+        assert converted == "<foo>{}</foo>".format(expect_converted)
+        
    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
    def setup_method(self):
        self.sub = EntitySubstitution

-    def test_simple_html_substitution(self):
-        # Unicode characters corresponding to named HTML entites
-        # are substituted, and no others.
-        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
-        assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"

-    def test_smart_quote_substitution(self):
-        # MS smart quotes are a common source of frustration, so we
-        # give them a special test.
-        quotes = b"\x91\x92foo\x93\x94"
-        dammit = UnicodeDammit(quotes)
-        assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
+    @pytest.mark.parametrize(
+        "original,substituted",
+        [
+            # Basic case. Unicode characters corresponding to named
+            # HTML entities are substituted; others are not.
+            ("foo\u2200\N{SNOWMAN}\u00f5bar",
+             "foo&forall;\N{SNOWMAN}&otilde;bar"),

+            # MS smart quotes are a common source of frustration, so we
+            # give them a special test.
+            ('‘’foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),           
+        ]
+    )
+    def test_substitute_html(self, original, substituted):
+        assert self.sub.substitute_html(original) == substituted
+        
    def test_html5_entity(self):
-        # Some HTML5 entities correspond to single- or multi-character
-        # Unicode sequences.
-
        for entity, u in (
            # A few spot checks of our ability to recognize
            # special character sequences and convert them
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@ -1,27 +1,26 @@
 """Tests to ensure that the html5lib tree builder generates good trees."""

+import pytest
 import warnings

-try:
-    from bs4.builder import HTML5TreeBuilder
-    HTML5LIB_PRESENT = True
-except ImportError as e:
-    HTML5LIB_PRESENT = False
+from bs4 import BeautifulSoup
 from bs4.element import SoupStrainer
 from . import (
+    HTML5LIB_PRESENT,
    HTML5TreeBuilderSmokeTest,
    SoupTest,
-    skipIf,
 )

-@skipIf(
+@pytest.mark.skipif(
    not HTML5LIB_PRESENT,
-    "html5lib seems not to be present, not testing its tree builder.")
+    reason="html5lib seems not to be present, not testing its tree builder."
+)
 class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
+        from bs4.builder import HTML5TreeBuilder
        return HTML5TreeBuilder

    def test_soupstrainer(self):
@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup(markup, parse_only=strainer)
+            soup = BeautifulSoup(markup, "html5lib", parse_only=strainer)
        assert soup.decode() == self.document_for(markup)

-        assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message)
+        [warning] = w
+        assert warning.filename == __file__
+        assert "the html5lib tree builder doesn't support parse_only" in str(warning.message)

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            assert with_element == expect
-
-class TestHTMLParserSubclass(SoupTest):
-    def test_error(self):
-        """Verify that our HTMLParser subclass implements error() in a way
-        that doesn't cause a crash.
-        """
-        parser = BeautifulSoupHTMLParser()
-        with warnings.catch_warnings(record=True) as warns:
-            parser.error("don't crash")
-        [warning] = warns
-        assert "don't crash" == str(warning.message)
-
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@ -1,16 +1,10 @@
 """Tests to ensure that the lxml tree builder generates good trees."""

 import pickle
+import pytest
 import re
 import warnings
-
-try:
-    import lxml.etree
-    LXML_PRESENT = True
-    LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError as e:
-    LXML_PRESENT = False
-    LXML_VERSION = (0,)
+from . import LXML_PRESENT, LXML_VERSION

 if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer
 from . import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
+    SOUP_SIEVE_PRESENT,
    SoupTest,
-    skipIf,
 )

-@skipIf(
+@pytest.mark.skipif(
    not LXML_PRESENT,
-    "lxml seems not to be present, not testing its tree builder.")
+    reason="lxml seems not to be present, not testing its tree builder."
+)
 class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

-    @skipIf(
+    @pytest.mark.skipif(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
-        "Skipping doctype test for old version of lxml to avoid segfault.")
+        reason="Skipping doctype test for old version of lxml to avoid segfault."
+    )
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
        assert "<b/>" == str(soup.b)
-        assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)
+        [warning] = w
+        assert warning.filename == __file__
+        assert "BeautifulStoneSoup class is deprecated" in str(warning.message)

    def test_tracking_line_numbers(self):
        # The lxml TreeBuilder cannot keep track of line numbers from
@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name
        
-@skipIf(
+@pytest.mark.skipif(
    not LXML_PRESENT,
-    "lxml seems not to be present, not testing its XML tree builder.")
+    reason="lxml seems not to be present, not testing its XML tree builder."
+)
 class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
        }


+    @pytest.mark.skipif(
+        not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
+    )
    def test_namespace_interaction_with_select_and_find(self):
        # Demonstrate how namespaces interact with select* and
        # find* methods.
--- a/lib/bs4/tests/test_pageelement.py
+++ b/lib/bs4/tests/test_pageelement.py
@ -3,15 +3,18 @@ import copy
 import pickle
 import pytest

-from soupsieve import SelectorSyntaxError
-
 from bs4 import BeautifulSoup
 from bs4.element import (
    Comment,
    SoupStrainer,
 )
-from . import SoupTest
+from . import (
+    SoupTest,
+    SOUP_SIEVE_PRESENT,
+)

+if SOUP_SIEVE_PRESENT:
+    from soupsieve import SelectorSyntaxError

 class TestEncoding(SoupTest):
    """Test the ability to encode objects into strings."""
@ -213,6 +216,7 @@ class TestFormatters(SoupTest):
        assert soup.contents[0].name == 'pre'


+@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
 class TestCSSSelectors(SoupTest):
    """Test basic CSS selector functionality.

@ -694,6 +698,7 @@ class TestPersistence(SoupTest):
        assert tag.can_be_empty_element == copied.can_be_empty_element
        assert tag.cdata_list_attributes == copied.cdata_list_attributes
        assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
+        assert tag.interesting_string_types == copied.interesting_string_types
        
    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@ -30,19 +30,11 @@ from bs4.element import (

 from . import (
    default_builder,
+    LXML_PRESENT,
    SoupTest,
-    skipIf,
 )
 import warnings
-
-try:
-    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
-    LXML_PRESENT = True
-except ImportError as e:
-    LXML_PRESENT = False
    
-PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
-
 class TestConstructor(SoupTest):

    def test_short_unicode_input(self):
@ -139,7 +131,7 @@ class TestConstructor(SoupTest):
        assert " an id " == a['id']
        assert ["a", "class"] == a['class']

-        # TreeBuilder takes an argument called 'mutli_valued_attributes'  which lets
+        # TreeBuilder takes an argument called 'multi_valued_attributes'  which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
@ -219,10 +211,17 @@ class TestConstructor(SoupTest):


 class TestWarnings(SoupTest):
+    # Note that some of the tests in this class create BeautifulSoup
+    # objects directly rather than using self.soup(). That's
+    # because SoupTest.soup is defined in a different file,
+    # which will throw off the assertion in _assert_warning
+    # that the code that triggered the warning is in the same
+    # file as the test.

    def _assert_warning(self, warnings, cls):
        for w in warnings:
            if isinstance(w.message, cls):
+                assert w.filename == __file__
                return w
        raise Exception("%s warning not found in %r" % (cls, warnings))
    
@ -243,13 +242,17 @@ class TestWarnings(SoupTest):

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
+            soup = self.soup("<a><b></b></a>")
        assert [] == w

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
-        msg = str(w[0].message)
+            soup = BeautifulSoup(
+                "<a><b></b></a>", "html.parser",
+                parseOnlyThese=SoupStrainer("b"),
+            )
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
        assert "parseOnlyThese" in msg
        assert "parse_only" in msg
        assert b"<b></b>" == soup.encode()
@ -257,8 +260,11 @@ class TestWarnings(SoupTest):
    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
-            soup = self.soup(utf8, fromEncoding="utf8")
-        msg = str(w[0].message)
+            soup = BeautifulSoup(
+                utf8, "html.parser", fromEncoding="utf8"
+            )
+        warning = self._assert_warning(w, DeprecationWarning)
+        msg = str(warning.message)
        assert "fromEncoding" in msg
        assert "from_encoding" in msg
        assert "utf8" == soup.original_encoding
@ -276,7 +282,7 @@ class TestWarnings(SoupTest):
        # A warning is issued if the "markup" looks like the name of
        # an HTML or text file, or a full path to a file on disk.
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("markup" + extension)
+            soup = BeautifulSoup("markup" + extension, "html.parser")
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            assert "looks more like a filename" in str(warning.message)

@ -291,11 +297,11 @@ class TestWarnings(SoupTest):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("markup" + extension)
        assert [] == w
-        
+
    def test_url_warning_with_bytes_url(self):
        url = b"http://www.crummybytes.com/"
        with warnings.catch_warnings(record=True) as warning_list:
-            soup = self.soup(url)
+            soup = BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
@ -307,7 +313,7 @@ class TestWarnings(SoupTest):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
-            soup = self.soup(url)
+            soup = BeautifulSoup(url, "html.parser")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
@ -347,18 +353,22 @@ class TestNewTag(SoupTest):
        assert "foo" == new_tag.name
        assert dict(bar="baz", name="a name") == new_tag.attrs
        assert None == new_tag.parent
-        
+
+    @pytest.mark.skipif(
+        not LXML_PRESENT,
+        reason="lxml not installed, cannot parse XML document"
+    )
+    def test_xml_tag_inherits_self_closing_rules_from_builder(self):
+        xml_soup = BeautifulSoup("", "xml")
+        xml_br = xml_soup.new_tag("br")
+        xml_p = xml_soup.new_tag("p")
+
+        # Both the <br> and <p> tag are empty-element, just because
+        # they have no contents.
+        assert b"<br/>" == xml_br.encode()
+        assert b"<p/>" == xml_p.encode()
+
    def test_tag_inherits_self_closing_rules_from_builder(self):
-        if LXML_PRESENT:
-            xml_soup = BeautifulSoup("", "lxml-xml")
-            xml_br = xml_soup.new_tag("br")
-            xml_p = xml_soup.new_tag("p")
-
-            # Both the <br> and <p> tag are empty-element, just because
-            # they have no contents.
-            assert b"<br/>" == xml_br.encode()
-            assert b"<p/>" == xml_p.encode()
-
        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")
@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        assert soup_from_unicode.encode('utf-8') == self.utf8_data
-
-    @skipIf(
-        PYTHON_3_PRE_3_2,
-        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
-    def test_attribute_name_containing_unicode_characters(self):
-        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
-        assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")
-
-
-
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@ -33,7 +33,6 @@ from bs4.element import (
 )
 from . import (
    SoupTest,
-    skipIf,
 )

 class TestFind(SoupTest):
@ -910,12 +909,16 @@ class TestTreeModification(SoupTest):
        soup.a.extend(l)
        assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode()

-    def test_extend_with_another_tags_contents(self):
+    @pytest.mark.parametrize(
+        "get_tags", [lambda tag: tag, lambda tag: tag.contents]
+    )
+    def test_extend_with_another_tags_contents(self, get_tags):
        data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
        soup = self.soup(data)
        d1 = soup.find('div', id='d1')
        d2 = soup.find('div', id='d2')
-        d2.extend(d1)
+        tags = get_tags(d1)
+        d2.extend(tags)
        assert '<div id="d1"></div>' == d1.decode()
        assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode()
        
@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest):

 class TestDeprecatedArguments(SoupTest):

-    def test_find_type_method_string(self):
+    @pytest.mark.parametrize(
+        "method_name", [
+            "find", "find_all", "find_parent", "find_parents",
+            "find_next", "find_all_next", "find_previous",
+            "find_all_previous", "find_next_sibling", "find_next_siblings",
+            "find_previous_sibling", "find_previous_siblings",
+        ]
+    )
+    def test_find_type_method_string(self, method_name):
        soup = self.soup("<a>some</a><b>markup</b>")
+        method = getattr(soup.b, method_name)
        with warnings.catch_warnings(record=True) as w:
-            [result] = soup.find_all(text='markup')
-            assert result == 'markup'
-            assert result.parent.name == 'b'
-            msg = str(w[0].message)
+            method(text='markup')
+            [warning] = w
+            assert warning.filename == __file__
+            msg = str(warning.message)
            assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."

    def test_soupstrainer_constructor_string(self):
        with warnings.catch_warnings(record=True) as w:
            strainer = SoupStrainer(text="text")
            assert strainer.text == 'text'
-            msg = str(w[0].message)
+            [warning] = w
+            msg = str(warning.message)
+            assert warning.filename == __file__
            assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."