Bump beautifulsoup4 from 4.11.2 to 4.12.2 (#2037)

* Bump beautifulsoup4 from 4.11.2 to 4.12.2 Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.2 to 4.12.2. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * Update beautifulsoup4==4.12.2 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
2025-08-14 02:26:58 -07:00 · 2023-08-23 21:38:49 -07:00 · 2023-08-23 21:38:49 -07:00 · e70e08c3f5
commit e70e08c3f5
parent 1798594569
32 changed files with 1439 additions and 755 deletions
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -8,14 +8,8 @@ except ImportError as e:
 import re
 import sys
 import warnings
-try:
-    import soupsieve
-except ImportError as e:
-    soupsieve = None
-    warnings.warn(
-        'The soupsieve package is not installed. CSS selectors cannot be used.'
-    )

+from bs4.css import CSS
 from bs4.formatter import (
    Formatter,
    HTMLFormatter,
@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
    "string-escape",
    "string_escape",
 ])
-    
+

 class NamespacedAttribute(str):
    """A namespaced string (e.g. 'xml:lang') that remembers the namespace
    ('xml') and the name ('lang') that were used to create it.
    """
-    
+
    def __new__(cls, prefix, name=None, namespace=None):
        if not name:
            # This is the default namespace. Its name "has no value"
@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)

-    
+
 class PageElement(object):
    """Contains the navigational information for some part of the page:
    that is, its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """
-   
+
+    # In general, we can't tell just by looking at an element whether
+    # it's contained in an XML document or an HTML document. But for
+    # Tags (q.v.) we can store this information at parse time.
+    known_xml = None
+
    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
@ -163,7 +162,7 @@ class PageElement(object):

        :param previous_element: The element parsed immediately before
            this one.
-        
+
        :param next_element: The element parsed immediately after
            this one.

@ -257,11 +256,11 @@ class PageElement(object):
    default = object()
    def _all_strings(self, strip=False, types=default):
        """Yield all strings of certain classes, possibly stripping them.
-        
+
        This is implemented differently in Tag and NavigableString.
        """
        raise NotImplementedError()
-   
+
    @property
    def stripped_strings(self):
        """Yield all strings in this PageElement, stripping them first.
@ -294,11 +293,11 @@ class PageElement(object):
                    strip, types=types)])
    getText = get_text
    text = property(get_text)
-    
+
    def replace_with(self, *args):
-        """Replace this PageElement with one or more PageElements, keeping the 
+        """Replace this PageElement with one or more PageElements, keeping the
        rest of the tree the same.
-        
+
        :param args: One or more PageElements.
        :return: `self`, no longer part of the tree.
        """
@ -410,7 +409,7 @@ class PageElement(object):
        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
-           in `self.children` by the new PageElement. 
+           in `self.children` by the new PageElement.
        :param new_child: A PageElement.
        """
        if new_child is None:
@ -546,7 +545,7 @@ class PageElement(object):
                "Element has no parent, so 'after' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element after itself.")
-        
+
        offset = 0
        for successor in args:
            # Extract first so that the index won't be screwed up if they
@ -912,7 +911,7 @@ class PageElement(object):
        :rtype: bool
        """
        return getattr(self, '_decomposed', False) or False
-            
+   
    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
@ -936,16 +935,11 @@ class NavigableString(str, PageElement):

    When Beautiful Soup parses the markup <b>penguin</b>, it will
    create a NavigableString for the string "penguin".
-    """   
+    """

    PREFIX = ''
    SUFFIX = ''

-    # We can't tell just by looking at a string whether it's contained
-    # in an XML document or an HTML document.
-
-    known_xml = None
-
    def __new__(cls, value):
        """Create a new NavigableString.

@ -961,12 +955,22 @@ class NavigableString(str, PageElement):
        u.setup()
        return u

-    def __copy__(self):
+    def __deepcopy__(self, memo, recursive=False):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
+
+        :param recursive: This parameter is ignored; it's only defined
+           so that NavigableString.__deepcopy__ implements the same
+           signature as Tag.__deepcopy__.
        """
        return type(self)(self)

+    def __copy__(self):
+        """A copy of a NavigableString can only be a deep copy, because
+        only one PageElement can occupy a given place in a parse tree.
+        """
+        return self.__deepcopy__({})
+
    def __getnewargs__(self):
        return (str(self),)

@ -1059,10 +1063,10 @@ class PreformattedString(NavigableString):
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """
-    
+
    PREFIX = ''
    SUFFIX = ''
-    
+
    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any subclass-specific
            prefix or suffix.
@ -1144,7 +1148,7 @@ class Stylesheet(NavigableString):
    """
    pass

-    
+
 class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).
@ -1250,7 +1254,7 @@ class Tag(PageElement):
        if ((not builder or builder.store_line_numbers)
            and (sourceline is not None or sourcepos is not None)):
            self.sourceline = sourceline
-            self.sourcepos = sourcepos        
+            self.sourcepos = sourcepos
        if attrs is None:
            attrs = {}
        elif attrs:
@ -1308,13 +1312,49 @@ class Tag(PageElement):
                self.interesting_string_types = builder.string_containers[self.name]
            else:
                self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
-            
+
    parserClass = _alias("parser_class")  # BS3

-    def __copy__(self):
-        """A copy of a Tag is a new Tag, unconnected to the parse tree.
+    def __deepcopy__(self, memo, recursive=True):
+        """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
        Its contents are a copy of the old Tag's contents.
        """
+        clone = self._clone()
+
+        if recursive:
+            # Clone this tag's descendants recursively, but without
+            # making any recursive function calls.
+            tag_stack = [clone]
+            for event, element in self._event_stream(self.descendants):
+                if event is Tag.END_ELEMENT_EVENT:
+                    # Stop appending incoming Tags to the Tag that was
+                    # just closed.
+                    tag_stack.pop()
+                else:
+                    descendant_clone = element.__deepcopy__(
+                        memo, recursive=False
+                    )
+                    # Add to its parent's .contents
+                    tag_stack[-1].append(descendant_clone)
+
+                    if event is Tag.START_ELEMENT_EVENT:
+                        # Add the Tag itself to the stack so that its
+                        # children will be .appended to it.
+                        tag_stack.append(descendant_clone)
+        return clone
+
+    def __copy__(self):
+        """A copy of a Tag must always be a deep copy, because a Tag's
+        children can only have one parent at a time.
+        """
+        return self.__deepcopy__({})
+
+    def _clone(self):
+        """Create a new Tag just like this one, but with no
+        contents and unattached to any parse tree.
+
+        This is the first step in the deepcopy process.
+        """
        clone = type(self)(
            None, self.builder, self.name, self.namespace,
            self.prefix, self.attrs, is_xml=self._is_xml,
@ -1326,8 +1366,6 @@ class Tag(PageElement):
        )
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
-        for child in self.contents:
-            clone.append(child.__copy__())
        return clone
    
    @property
@ -1433,7 +1471,7 @@ class Tag(PageElement):
            i.contents = []
            i._decomposed = True
            i = n
-           
+
    def clear(self, decompose=False):
        """Wipe out all children of this PageElement by calling extract()
           on them.
@ -1521,7 +1559,7 @@ class Tag(PageElement):
        if not isinstance(value, list):
            value = [value]
        return value
-    
+
    def has_attr(self, key):
        """Does this PageElement have an attribute with the given name?"""
        return key in self.attrs
@ -1608,7 +1646,7 @@ class Tag(PageElement):
    def __repr__(self, encoding="unicode-escape"):
        """Renders this PageElement as a string.

-        :param encoding: The encoding to use (Python 2 only). 
+        :param encoding: The encoding to use (Python 2 only).
            TODO: This is now ignored and a warning should be issued
            if a value is provided.
        :return: A (Unicode) string.
@ -1650,106 +1688,212 @@ class Tag(PageElement):

    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
-        """Render a Unicode representation of this PageElement and its
-        contents.
-
-        :param indent_level: Each line of the rendering will be
-             indented this many spaces. Used internally in
-             recursive calls while pretty-printing.
-        :param eventual_encoding: The tag is destined to be
-            encoded into this encoding. This method is _not_
-            responsible for performing that encoding. This information
-            is passed in so that it can be substituted in if the
-            document contains a <META> tag that mentions the document's
-            encoding.
-        :param formatter: A Formatter object, or a string naming one of
-            the standard formatters.
-        """
-
+               formatter="minimal",
+               iterator=None):
+        pieces = []
        # First off, turn a non-Formatter `formatter` into a Formatter
        # object. This will stop the lookup from happening over and
        # over again.
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)
-        attributes = formatter.attributes(self)
-        attrs = []
-        for key, val in attributes:
-            if val is None:
-                decoded = key
+
+        if indent_level is True:
+            indent_level = 0
+
+        # The currently active tag that put us into string literal
+        # mode. Until this element is closed, children will be treated
+        # as string literals and not pretty-printed. String literal
+        # mode is turned on immediately after this tag begins, and
+        # turned off immediately before it's closed. This means there
+        # will be whitespace before and after the tag itself.
+        string_literal_tag = None
+
+        for event, element in self._event_stream(iterator):
+            if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=True
+                )
+            elif event is Tag.END_ELEMENT_EVENT:
+                piece = element._format_tag(
+                    eventual_encoding, formatter, opening=False
+                )
+                if indent_level is not None:
+                    indent_level -= 1
            else:
-                if isinstance(val, list) or isinstance(val, tuple):
-                    val = ' '.join(val)
-                elif not isinstance(val, str):
-                    val = str(val)
-                elif (
-                        isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None
-                ):
-                    val = val.encode(eventual_encoding)
+                piece = element.output_ready(formatter)

-                text = formatter.attribute_value(val)
-                decoded = (
-                    str(key) + '='
-                    + formatter.quoted_attribute_value(text))
-            attrs.append(decoded)
-        close = ''
-        closeTag = ''
+            # Now we need to apply the 'prettiness' -- extra
+            # whitespace before and/or after this tag. This can get
+            # complicated because certain tags, like <pre> and
+            # <script>, can't be prettified, since adding whitespace would
+            # change the meaning of the content.

+            # The default behavior is to add whitespace before and
+            # after an element when string literal mode is off, and to
+            # leave things as they are when string literal mode is on.
+            if string_literal_tag:
+                indent_before = indent_after = False
+            else:
+                indent_before = indent_after = True
+
+            # The only time the behavior is more complex than that is
+            # when we encounter an opening or closing tag that might
+            # put us into or out of string literal mode.
+            if (event is Tag.START_ELEMENT_EVENT
+                and not string_literal_tag
+                and not element._should_pretty_print()):
+                    # We are about to enter string literal mode. Add
+                    # whitespace before this tag, but not after. We
+                    # will stay in string literal mode until this tag
+                    # is closed.
+                    indent_before = True
+                    indent_after = False
+                    string_literal_tag = element
+            elif (event is Tag.END_ELEMENT_EVENT
+                  and element is string_literal_tag):
+                # We are about to exit string literal mode by closing
+                # the tag that sent us into that mode. Add whitespace
+                # after this tag, but not before.
+                indent_before = False
+                indent_after = True
+                string_literal_tag = None
+
+            # Now we know whether to add whitespace before and/or
+            # after this element.
+            if indent_level is not None:
+                if (indent_before or indent_after):
+                    if isinstance(element, NavigableString):
+                        piece = piece.strip()
+                    if piece:
+                        piece = self._indent_string(
+                            piece, indent_level, formatter,
+                            indent_before, indent_after
+                        )
+                if event == Tag.START_ELEMENT_EVENT:
+                    indent_level += 1
+            pieces.append(piece)
+        return "".join(pieces)
+
+    # Names for the different events yielded by _event_stream
+    START_ELEMENT_EVENT = object()
+    END_ELEMENT_EVENT = object()
+    EMPTY_ELEMENT_EVENT = object()
+    STRING_ELEMENT_EVENT = object()
+
+    def _event_stream(self, iterator=None):
+        """Yield a sequence of events that can be used to reconstruct the DOM
+        for this element.
+
+        This lets us recreate the nested structure of this element
+        (e.g. when formatting it as a string) without using recursive
+        method calls.
+
+        This is similar in concept to the SAX API, but it's a simpler
+        interface designed for internal use. The events are different
+        from SAX and the arguments associated with the events are Tags
+        and other Beautiful Soup objects.
+
+        :param iterator: An alternate iterator to use when traversing
+         the tree.
+        """
+        tag_stack = []
+
+        iterator = iterator or self.self_and_descendants
+
+        for c in iterator:
+            # If the parent of the element we're about to yield is not
+            # the tag currently on the stack, it means that the tag on
+            # the stack closed before this element appeared.
+            while tag_stack and c.parent != tag_stack[-1]:
+                now_closed_tag = tag_stack.pop()
+                yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+            if isinstance(c, Tag):
+                if c.is_empty_element:
+                    yield Tag.EMPTY_ELEMENT_EVENT, c
+                else:
+                    yield Tag.START_ELEMENT_EVENT, c
+                    tag_stack.append(c)
+                    continue
+            else:
+                yield Tag.STRING_ELEMENT_EVENT, c
+
+        while tag_stack:
+            now_closed_tag = tag_stack.pop()
+            yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+    def _indent_string(self, s, indent_level, formatter,
+                       indent_before, indent_after):
+        """Add indentation whitespace before and/or after a string.
+
+        :param s: The string to amend with whitespace.
+        :param indent_level: The indentation level; affects how much
+           whitespace goes before the string.
+        :param indent_before: Whether or not to add whitespace
+           before the string.
+        :param indent_after: Whether or not to add whitespace
+           (a newline) after the string.
+        """
+        space_before = ''
+        if indent_before and indent_level:
+            space_before = (formatter.indent * indent_level)
+
+        space_after = ''
+        if indent_after:
+            space_after = "\n"
+
+        return space_before + s + space_after
+
+    def _format_tag(self, eventual_encoding, formatter, opening):
+        # A tag starts with the < character (see below).
+
+        # Then the / character, if this is a closing tag.
+        closing_slash = ''
+        if not opening:
+            closing_slash = '/'
+
+        # Then an optional namespace prefix.
        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

-        if self.is_empty_element:
-            close = formatter.void_element_close_prefix or ''
-        else:
-            closeTag = '</%s%s>' % (prefix, self.name)
+        # Then a list of attribute values, if this is an opening tag.
+        attribute_string = ''
+        if opening:
+            attributes = formatter.attributes(self)
+            attrs = []
+            for key, val in attributes:
+                if val is None:
+                    decoded = key
+                else:
+                    if isinstance(val, list) or isinstance(val, tuple):
+                        val = ' '.join(val)
+                    elif not isinstance(val, str):
+                        val = str(val)
+                    elif (
+                            isinstance(val, AttributeValueWithCharsetSubstitution)
+                            and eventual_encoding is not None
+                    ):
+                        val = val.encode(eventual_encoding)

-        pretty_print = self._should_pretty_print(indent_level)
-        space = ''
-        indent_space = ''
-        if indent_level is not None:
-            indent_space = (formatter.indent * (indent_level - 1))
-        if pretty_print:
-            space = indent_space
-            indent_contents = indent_level + 1
-        else:
-            indent_contents = None
-        contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter
-        )
-
-        if self.hidden:
-            # This is the 'document root' object.
-            s = contents
-        else:
-            s = []
-            attribute_string = ''
+                    text = formatter.attribute_value(val)
+                    decoded = (
+                        str(key) + '='
+                        + formatter.quoted_attribute_value(text))
+                attrs.append(decoded)
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
-            if indent_level is not None:
-                # Even if this particular tag is not pretty-printed,
-                # we should indent up to the start of the tag.
-                s.append(indent_space)
-            s.append('<%s%s%s%s>' % (
-                    prefix, self.name, attribute_string, close))
-            if pretty_print:
-                s.append("\n")
-            s.append(contents)
-            if pretty_print and contents and contents[-1] != "\n":
-                s.append("\n")
-            if pretty_print and closeTag:
-                s.append(space)
-            s.append(closeTag)
-            if indent_level is not None and closeTag and self.next_sibling:
-                # Even if this particular tag is not pretty-printed,
-                # we're now done with the tag, and we should add a
-                # newline if appropriate.
-                s.append("\n")
-            s = ''.join(s)
-        return s

-    def _should_pretty_print(self, indent_level):
+        # Then an optional closing slash (for a void element in an
+        # XML document).
+        void_element_closing_slash = ''
+        if self.is_empty_element:
+            void_element_closing_slash = formatter.void_element_close_prefix or ''
+
+        # Put it all together.
+        return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
+
+    def _should_pretty_print(self, indent_level=1):
        """Should this tag be pretty-printed?

        Most of them should, but some (such as <pre> in HTML
@ -1770,7 +1914,7 @@ class Tag(PageElement):
            a Unicode string will be returned.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
-        :return: A Unicode string (if encoding==None) or a bytestring 
+        :return: A Unicode string (if encoding==None) or a bytestring
            (otherwise).
        """
        if encoding is None:
@ -1800,33 +1944,9 @@ class Tag(PageElement):
            the standard Formatters.

        """
-        # First off, turn a string formatter into a Formatter object. This
-        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter):
-            formatter = self.formatter_for_name(formatter)
+        return self.decode(indent_level, eventual_encoding, formatter,
+                           iterator=self.descendants)

-        pretty_print = (indent_level is not None)
-        s = []
-        for c in self:
-            text = None
-            if isinstance(c, NavigableString):
-                text = c.output_ready(formatter)
-            elif isinstance(c, Tag):
-                s.append(c.decode(indent_level, eventual_encoding,
-                                  formatter))
-            preserve_whitespace = (
-                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
-            )
-            if text and indent_level and not preserve_whitespace:
-                text = text.strip()
-            if text:
-                if pretty_print and not preserve_whitespace:
-                    s.append(formatter.indent * (indent_level - 1))
-                s.append(text)
-                if pretty_print and not preserve_whitespace:
-                    s.append("\n")
-        return ''.join(s)
-       
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
@ -1922,6 +2042,18 @@ class Tag(PageElement):
        # return iter() to make the purpose of the method clear
        return iter(self.contents)  # XXX This seems to be untested.

+    @property
+    def self_and_descendants(self):
+        """Iterate over this PageElement and its children in a
+        depth-first sequence.
+
+        :yield: A sequence of PageElements.
+        """
+        if not self.hidden:
+            yield self
+        for i in self.descendants:
+            yield i
+
    @property
    def descendants(self):
        """Iterate over all children of this PageElement in a
@ -1948,16 +2080,13 @@ class Tag(PageElement):
           Beautiful Soup will use the prefixes it encountered while
           parsing the document.

-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
+        :param kwargs: Keyword arguments to be passed into Soup Sieve's
           soupsieve.select() method.

        :return: A Tag.
        :rtype: bs4.element.Tag
        """
-        value = self.select(selector, namespaces, 1, **kwargs)
-        if value:
-            return value[0]
-        return None
+        return self.css.select_one(selector, namespaces, **kwargs)

    def select(self, selector, namespaces=None, limit=None, **kwargs):
        """Perform a CSS selection operation on the current element.
@ -1973,27 +2102,18 @@ class Tag(PageElement):

        :param limit: After finding this number of results, stop looking.

-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
+        :param kwargs: Keyword arguments to be passed into Soup Sieve's
           soupsieve.select() method.

        :return: A ResultSet of Tags.
        :rtype: bs4.element.ResultSet
        """
-        if namespaces is None:
-            namespaces = self._namespaces
-        
-        if limit is None:
-            limit = 0
-        if soupsieve is None:
-            raise NotImplementedError(
-                "Cannot execute CSS selectors because the soupsieve package is not installed."
-            )
-            
-        results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
+        return self.css.select(selector, namespaces, limit, **kwargs)

-        # We do this because it's more consistent and because
-        # ResultSet.__getattr__ has a helpful error message.
-        return ResultSet(None, results)
+    @property
+    def css(self):
+        """Return an interface to the CSS selector API."""
+        return CSS(self)

    # Old names for backwards compatibility
    def childGenerator(self):
@ -2038,7 +2158,7 @@ class SoupStrainer(object):
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
-        """        
+        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
@ -2137,7 +2257,7 @@ class SoupStrainer(object):
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                 return False
-            
+
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))
@ -2223,7 +2343,7 @@ class SoupStrainer(object):
            if self._matches(' '.join(markup), match_against):
                return True
            return False
-        
+
        if match_against is True:
            # True matches any non-None value.
            return markup is not None
@ -2267,11 +2387,11 @@ class SoupStrainer(object):
                        return True
            else:
                return False
-        
+
        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False
-        
+
        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against