Bump beautifulsoup4 from 4.12.2 to 4.12.3 (#2267)

* Bump beautifulsoup4 from 4.12.2 to 4.12.3

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.12.2 to 4.12.3.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.12.3

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
2025-08-20 05:13:21 -07:00 · 2024-03-24 15:26:22 -07:00 · 2024-03-24 15:26:22 -07:00 · a0170a6f3d
commit a0170a6f3d
parent faef9a94c4
25 changed files with 263 additions and 173 deletions
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.12.2"
-__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
+__version__ = "4.12.3"
+__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -514,15 +514,19 @@ class DetectsXMLParsedAsHTML(object):
    XML_PREFIX_B = b'<?xml'
    
    @classmethod
-    def warn_if_markup_looks_like_xml(cls, markup):
+    def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

+        :param stacklevel: The stacklevel of the code calling this
+        function.
+
        :return: True if the markup looks like non-XHTML XML, False
        otherwise.
+
        """
        if isinstance(markup, bytes):
            prefix = cls.XML_PREFIX_B
@@ -535,15 +539,16 @@ class DetectsXMLParsedAsHTML(object):
            and markup.startswith(prefix)
            and not looks_like_html.search(markup[:500])
        ):
-            cls._warn()
+            cls._warn(stacklevel=stacklevel+2)
            return True
        return False

    @classmethod
-    def _warn(cls):
+    def _warn(cls, stacklevel=5):
        """Issue a warning about XML being parsed as HTML."""
        warnings.warn(
-            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
+            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
+            stacklevel=stacklevel
        )
        
    def _initialize_xml_detector(self):
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -77,7 +77,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
-        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
+        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
+            markup, stacklevel=3
+        )

        yield (markup, None, None, False)

--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -378,10 +378,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser.soup = self.soup
        try:
            parser.feed(markup)
+            parser.close()
        except AssertionError as e:
            # html.parser raises AssertionError in rare cases to
            # indicate a fatal problem with the markup, especially
            # when there's an error in the doctype declaration.
            raise ParserRejectedMarkup(e)
-        parser.close()
        parser.already_closed_empty_element = []
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@@ -179,7 +179,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
-            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
+                markup, stacklevel=3
+            )
        else:
            self.processing_instruction_class = XMLProcessingInstruction

--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1356,7 +1356,7 @@ class Tag(PageElement):
        This is the first step in the deepcopy process.
        """
        clone = type(self)(
-            None, self.builder, self.name, self.namespace,
+            None, None, self.name, self.namespace,
            self.prefix, self.attrs, is_xml=self._is_xml,
            sourceline=self.sourceline, sourcepos=self.sourcepos,
            can_be_empty_element=self.can_be_empty_element,
@@ -1845,6 +1845,11 @@ class Tag(PageElement):
        return space_before + s + space_after

    def _format_tag(self, eventual_encoding, formatter, opening):
+        if self.hidden:
+            # A hidden tag is invisible, although its contents
+            # are visible.
+            return ''
+
        # A tag starts with the < character (see below).

        # Then the / character, if this is a closing tag.
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@@ -51,7 +51,7 @@ class Formatter(EntitySubstitution):
            void_element_close_prefix='/', cdata_containing_tags=None,
            empty_attributes_are_booleans=False, indent=1,
    ):
-        """Constructor.
+        r"""Constructor.

        :param language: This should be Formatter.XML if you are formatting
           XML markup and Formatter.HTML if you are formatting HTML markup.
@@ -76,7 +76,7 @@ class Formatter(EntitySubstitution):
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
-            is used to indent each level. The default behavior to
+            is used to indent each level. The default behavior is to
            indent one space per level.
        """
        self.language = language
--- a/lib/bs4/tests/__init__.py
+++ b/lib/bs4/tests/__init__.py
@@ -1105,7 +1105,7 @@ class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
        doc = """<?xml version="1.0" encoding="utf-8"?>
 <Document xmlns="http://example.com/ns0"
    xmlns:ns1="http://example.com/ns1"
-    xmlns:ns2="http://example.com/ns2"
+    xmlns:ns2="http://example.com/ns2">
    <ns1:tag>foo</ns1:tag>
    <ns1:tag>bar</ns1:tag>
    <ns2:tag key="value">baz</ns2:tag>
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256.testcase
@@ -0,0 +1 @@
+ <20><>      <20> <css
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016.testcase
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824.testcase
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000.testcase
@@ -0,0 +1 @@
+˙                 ><applet></applet><applet></applet><apple|><applet><applet><appl›„><applet><applet></applet></applet></applet></applet><applet></applet><apple>t<applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet>et><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><azplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><a
pplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><plet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><app
let><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet></applet></applet></applet></applet></appt></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet><<meta charset=utf-8>
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624.testcase
--- a/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640.testcase
+++ b/lib/bs4/tests/fuzz/clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640.testcase
@@ -0,0 +1 @@
+-      ˙˙  <math><select><mi><select><select>t
--- a/lib/bs4/tests/fuzz/crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a.testcase
+++ b/lib/bs4/tests/fuzz/crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a.testcase
--- a/lib/bs4/tests/test_fuzz.py
+++ b/lib/bs4/tests/test_fuzz.py
@@ -14,30 +14,75 @@ from bs4 import (
    BeautifulSoup,
    ParserRejectedMarkup,
 )
+try:
+    from soupsieve.util import SelectorSyntaxError
+    import lxml
+    import html5lib
+    fully_fuzzable = True
+except ImportError:
+    fully_fuzzable = False
+    

+@pytest.mark.skipif(not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed.")
 class TestFuzz(object):

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

+    # Copied 20230512 from
+    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
+    #
+    # Copying the code lets us precisely duplicate the behavior of
+    # oss-fuzz.  The downside is that this code changes over time, so
+    # multiple copies of the code must be kept around to run against
+    # older tests. I'm not sure what to do about this, but I may
+    # retire old tests after a time.
+    def fuzz_test_with_css(self, filename):
+        data = self.__markup(filename)
+        parsers = ['lxml-xml', 'html5lib', 'html.parser', 'lxml']
+        try:
+            idx = int(data[0]) % len(parsers)
+        except ValueError:
+            return
+
+        css_selector, data = data[1:10], data[10:]
+
+        try:
+            soup = BeautifulSoup(data[1:], features=parsers[idx])
+        except ParserRejectedMarkup:
+            return
+        except ValueError:
+            return
+
+        list(soup.find_all(True))
+        try:
+            soup.css.select(css_selector.decode('utf-8', 'replace'))
+        except SelectorSyntaxError:
+            return
+        soup.prettify()
+    
    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
+            "crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a",
        ]
    )
    def test_rejected_markup(self, filename):
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, 'html.parser')
-
+            
    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
@@ -46,18 +91,44 @@ class TestFuzz(object):
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
        ]
    )
-    def test_deeply_nested_document(self, filename):
+    def test_deeply_nested_document_without_css(self, filename):
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, 'html.parser').encode()

+    # This class of error has to do with very deeply nested documents
+    # which overflow the Python call stack when the tree is converted
+    # to a string. This is an issue with Beautiful Soup which was fixed
+    # as part of [bug=1471755].
+    @pytest.mark.parametrize(
+        "filename", [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
+        ]
+    )
+    def test_deeply_nested_document(self, filename): 
+       self.fuzz_test_with_css(filename)
+        
+    @pytest.mark.parametrize(
+        "filename", [
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
+        ]
+    )
+    def test_soupsieve_errors(self, filename):
+        self.fuzz_test_with_css(filename)
+        
    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
-    @pytest.mark.skip("html5lib problems")
+    #
+    # These test cases are in the older format that doesn't specify
+    # which parser to use or give a CSS selector.
+    @pytest.mark.skip(reason="html5lib-specific problems")
    @pytest.mark.parametrize(
        "filename", [
            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
@@ -68,7 +139,7 @@ class TestFuzz(object):

            # b'-<math><sElect><mi><sElect><sElect>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",
-
+           
            # b'ñ<table><svg><html>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",

@@ -79,10 +150,24 @@ class TestFuzz(object):
            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
        ]
    )
-    def test_html5lib_parse_errors(self, filename):
+    def test_html5lib_parse_errors_without_css(self, filename):
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, 'html5lib').encode())

+    # This class of error represents problems with html5lib's parser,
+    # not Beautiful Soup. I use
+    # https://github.com/html5lib/html5lib-python/issues/568 to notify
+    # the html5lib developers of these issues.
+    @pytest.mark.skip(reason="html5lib-specific problems")
+    @pytest.mark.parametrize(
+        "filename", [
+            # b'-      \xff\xff  <math>\x10<select><mi><select><select>t'
+            "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
+        ]
+    )
+    def test_html5lib_parse_errors(self, filename):
+        self.fuzz_test_with_css(filename)
+        
    def __markup(self, filename):
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
--- a/lib/bs4/tests/test_tag.py
+++ b/lib/bs4/tests/test_tag.py
@@ -219,3 +219,16 @@ class TestMultiValuedAttributes(SoupTest):
        )
        assert soup.a['class'] == 'foo'
        assert soup.a['id'] == ['bar']
+
+    def test_hidden_tag_is_invisible(self):
+        # Setting .hidden on a tag makes it invisible in output, but
+        # leaves its contents visible.
+        #
+        # This is not a documented or supported feature of Beautiful
+        # Soup (e.g. NavigableString doesn't support .hidden even
+        # though it could), but some people use it and it's not
+        # hurting anything to verify that it keeps working.
+        #
+        soup = self.soup('<div id="1"><span id="2">a string</span></div>')
+        soup.span.hidden = True
+        assert '<div id="1">a string</div>' == str(soup.div)
				`@ -0,0 +1 @@`
				˙ ><applet></applet><applet></applet><apple\|><applet><applet><appl›„><applet><applet></applet></applet></applet></applet><applet></applet><apple>t<applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet>et><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><azplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><apple
t><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><plet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet>
<applet><applet><applet><applet><applet><applet><applet><applet><applet><applet></applet></applet></applet></applet></appt></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet><<meta charset=utf-8>