Update vendored beautifulsoup4 to 4.11.1

Adds soupsieve 2.3.2.post1
2025-08-14 02:26:53 -07:00 · 2022-11-28 21:11:01 -05:00 · 2022-11-28 21:11:01 -05:00 · 968ec8a1d8
commit 968ec8a1d8
parent 2226a74ef8
33 changed files with 10852 additions and 3694 deletions
--- a/libs/common/bs4/tests/test_lxml.py
+++ b/libs/common/bs4/tests/test_lxml.py
@ -1,5 +1,6 @@
 """Tests to ensure that the lxml tree builder generates good trees."""

+import pickle
 import re
 import warnings

@ -19,9 +20,7 @@ from bs4 import (
    BeautifulStoneSoup,
    )
 from bs4.element import Comment, Doctype, SoupStrainer
-from bs4.testing import skipIf
-from bs4.tests import test_htmlparser
-from bs4.testing import (
+from . import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
@ -31,21 +30,21 @@ from bs4.testing import (
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
-class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
+class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
-        return LXMLTreeBuilder()
+        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
-        self.assertSoupEquals(
+        self.assert_soup(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
-        self.assertSoupEquals(
+        self.assert_soup(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
-        self.assertSoupEquals(
+        self.assert_soup(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
-
+        
    def test_entities_in_foreign_document_encoding(self):
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
@ -61,22 +60,140 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
-        self.assertEqual("", doctype.strip())
+        assert "" == doctype.strip()

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
-        self.assertEqual("<b/>", str(soup.b))
-        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
+        assert "<b/>" == str(soup.b)
+        assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)

+    def test_tracking_line_numbers(self):
+        # The lxml TreeBuilder cannot keep track of line numbers from
+        # the original markup. Even if you ask for line numbers, we
+        # don't have 'em.
+        #
+        # This means that if you have a tag like <sourceline> or
+        # <sourcepos>, attribute access will find it rather than
+        # giving you a numeric answer.
+        soup = self.soup(
+            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
+            store_line_numbers=True
+        )
+        assert "sourceline" == soup.p.sourceline.name
+        assert "sourcepos" == soup.p.sourcepos.name
+        
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
-class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
+class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
-        return LXMLTreeBuilderForXML()
+        return LXMLTreeBuilderForXML
+
+    def test_namespace_indexing(self):
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+            '<subtag xmlns="http://another-unprefixed-namespace.com">'
+            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+            '</prefix2:tag3>'
+            '</root>'
+        )
+
+        # The BeautifulSoup object includes every namespace prefix
+        # defined in the entire document. This is the default set of
+        # namespaces used by soupsieve.
+        #
+        # Un-prefixed namespaces are not included, and if a given
+        # prefix is defined twice, only the first prefix encountered
+        # in the document shows up here.
+        assert soup._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+            'prefix': 'http://prefixed-namespace.com',
+            'prefix2': 'http://another-namespace.com'
+        }
+
+        # A Tag object includes only the namespace prefixes
+        # that were in scope when it was parsed.
+
+        # We do not track un-prefixed namespaces as we can only hold
+        # one (the first one), and it will be recognized as the
+        # default namespace by soupsieve, even when operating from a
+        # tag with a different un-prefixed namespace.
+        assert soup.tag._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.tag2._namespaces == {
+            'prefix': 'http://prefixed-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subsubtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+
+    def test_namespace_interaction_with_select_and_find(self):
+        # Demonstrate how namespaces interact with select* and
+        # find* methods.
+        
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+             '<prefix:tag3>'
+            '</subtag>'
+            '</root>'
+        )
+
+        # soupselect uses namespace URIs.
+        assert soup.select_one('tag').name == 'tag'
+        assert soup.select_one('prefix|tag2').name == 'tag2'
+
+        # If a prefix is declared more than once, only the first usage
+        # is registered with the BeautifulSoup object.
+        assert soup.select_one('prefix|tag3') is None
+
+        # But you can always explicitly specify a namespace dictionary.
+        assert soup.select_one(
+            'prefix|tag3', namespaces=soup.subtag._namespaces
+        ).name == 'tag3'
+
+        # And a Tag (as opposed to the BeautifulSoup object) will
+        # have a set of default namespaces scoped to that Tag.
+        assert soup.subtag.select_one('prefix|tag3').name=='tag3'
+
+        # the find() methods aren't fully namespace-aware; they just
+        # look at prefixes.
+        assert soup.find('tag').name == 'tag'
+        assert soup.find('prefix:tag2').name == 'tag2'
+        assert soup.find('prefix:tag3').name == 'tag3'
+        assert soup.subtag.find('prefix:tag3').name == 'tag3'
+
+    def test_pickle_removes_builder(self):
+        # The lxml TreeBuilder is not picklable, so it won't be
+        # preserved in a pickle/unpickle operation.
+
+        soup = self.soup("<a>some markup</a>")
+        assert isinstance(soup.builder, self.default_builder)
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.a.string
+        assert unpickled.builder is None