Update beautifulsoup4-4.10.0

2025-08-14 02:26:58 -07:00 · 2021-10-14 20:46:06 -07:00 · 2021-10-14 20:46:06 -07:00 · ab8fa4d5b3
commit ab8fa4d5b3
parent b581460b51
16 changed files with 4599 additions and 743 deletions
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -1,6 +1,5 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
+"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
+
 http://www.crummy.com/software/BeautifulSoup/

 Beautiful Soup uses a pluggable XML or HTML parser to parse a
@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.

-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
+documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.1"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.10.0"
+__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

+
+from collections import Counter
 import os
 import re
 import sys
 import traceback
 import warnings

+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
 from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
@ -42,28 +46,49 @@ from .element import (
    NavigableString,
    PageElement,
    ProcessingInstruction,
+    PYTHON_SPECIFIC_ENCODINGS,
    ResultSet,
+    Script,
+    Stylesheet,
    SoupStrainer,
    Tag,
+    TemplateString,
    )

-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+    """The warning issued when BeautifulSoup has to guess what parser to
+    use -- probably because no parser was specified in the constructor.
+    """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+    """The warning issued when BeautifulSoup is given 'markup' that
+    actually looks like a resource locator -- a URL or a path to a file
+    on disk.
+    """
+

 class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
+    """A data structure representing a parsed HTML or XML document.

-    These methods will be called by the parser:
-      reset()
-      feed(markup)
+    Most of the methods you'll call on a BeautifulSoup object are inherited from
+    PageElement or Tag.
+
+    Internally, this class defines the basic interface called by the
+    tree builders when converting an HTML/XML document into a data
+    structure. The interface abstracts away the differences between
+    parsers. To write a new tree builder, you'll need to understand
+    these methods as a whole.
+
+    These methods will be called by the BeautifulSoup constructor:
+      * reset()
+      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass) # Ends the current data node
+      * handle_starttag(name, attrs) # See note about return value
+      * handle_endtag(name)
+      * handle_data(data) # Appends to the current data node
+      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
+
+    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+    # a Tag with a .name. This name makes it clear the BeautifulSoup
+    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-   
+
+    # A string containing all ASCII whitespace characters, used in
+    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+    
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
-        markup to be parsed.
+         markup to be parsed.

-        :param features: Desirable features of the parser to be used. This
-        may be the name of a specific parser ("lxml", "lxml-xml",
-        "html.parser", or "html5lib") or it may be the type of markup
-        to be used ("html", "html5", "xml"). It's recommended that you
-        name a specific parser, so that Beautiful Soup gives you the
-        same results across platforms and virtual environments.
+        :param features: Desirable features of the parser to be
+         used. This may be the name of a specific parser ("lxml",
+         "lxml-xml", "html.parser", or "html5lib") or it may be the
+         type of markup to be used ("html", "html5", "xml"). It's
+         recommended that you name a specific parser, so that
+         Beautiful Soup gives you the same results across platforms
+         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
-        instance to use) instead of looking one up based on
-        `features`. You only need to use this if you've implemented a
-        custom TreeBuilder.
+         instance to use) instead of looking one up based on
+         `features`. You only need to use this if you've implemented a
+         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
-        matching the SoupStrainer will be considered. This is useful
-        when parsing part of a document that would otherwise be too
-        large to fit into memory.
+         matching the SoupStrainer will be considered. This is useful
+         when parsing part of a document that would otherwise be too
+         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
-        document to be parsed. Pass this in if Beautiful Soup is
-        guessing wrongly about the document's encoding.
+         document to be parsed. Pass this in if Beautiful Soup is
+         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
-        encodings known to be wrong. Pass this in if you don't know
-        the document's encoding but you know Beautiful Soup's guess is
-        wrong.
+         encodings known to be wrong. Pass this in if you don't know
+         the document's encoding but you know Beautiful Soup's guess is
+         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
-        classes like Tag and NavigableString to other classes you'd
-        like to be instantiated instead as the parse tree is
-        built. This is useful for using subclasses to modify the
-        default behavior of Tag or NavigableString.
+         classes like Tag and NavigableString, to other classes you'd
+         like to be instantiated instead as the parse tree is
+         built. This is useful for subclassing Tag or NavigableString
+         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
-        constructor accepts certain keyword arguments used in
-        Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4; they will result in a warning and then be ignored.
-
-        Apart from this, any keyword arguments passed into the BeautifulSoup
-        constructor are propagated to the TreeBuilder constructor. This
-        makes it possible to configure a TreeBuilder beyond saying
-        which one to use.
-
+         constructor accepts certain keyword arguments used in
+         Beautiful Soup 3. None of these arguments do anything in
+         Beautiful Soup 4; they will result in a warning and then be
+         ignored.
+         
+         Apart from this, any keyword arguments passed into the
+         BeautifulSoup constructor are propagated to the TreeBuilder
+         constructor. This makes it possible to configure a
+         TreeBuilder by passing in arguments, not just by saying which
+         one to use.
        """
-
        if 'convertEntities' in kwargs:
            del kwargs['convertEntities']
            warnings.warn(
@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
-            ):
+            ) and markup:
+                # The user did not tell us which TreeBuilder to use,
+                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
-                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+                    warnings.warn(
+                        self.NO_PARSER_SPECIFIED_WARNING % values,
+                        GuessedAtParserWarning, stacklevel=2
+                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
            else:
                possible_filename = markup
            is_file = False
+            is_directory = False
            try:
                is_file = os.path.exists(possible_filename)
+                if is_file:
+                    is_directory = os.path.isdir(possible_filename)
            except Exception as e:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
-            if is_file:
-                if isinstance(markup, str):
-                    markup = markup.encode("utf8")
+            if is_directory:
+                warnings.warn(
+                    '"%s" looks like a directory name, not markup. You may'
+                    ' want to open a file found in this directory and pass'
+                    ' the filehandle into Beautiful Soup.' % (
+                        self._decode_markup(markup)
+                    ),
+                    MarkupResemblesLocatorWarning
+                )
+            elif is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
-                    ' Beautiful Soup.' % markup)
+                    ' Beautiful Soup.' % self._decode_markup(markup),
+                    MarkupResemblesLocatorWarning
+                )
            self._check_markup_is_url(markup)

        rejections = []
@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
        self.builder.soup = None

    def __copy__(self):
+        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )
@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
            d['builder'] = None
        return d

-    @staticmethod
-    def _check_markup_is_url(markup):
-        """ 
-        Check if markup looks like it's actually a url and raise a warning 
-        if so. Markup can be unicode or str (py2) / bytes (py3).
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
+        """Error-handling method to raise a warning if incoming markup looks
+        like a URL.
+
+        :param markup: A string.
        """
        if isinstance(markup, bytes):
            space = b' '
@ -364,18 +428,20 @@ class BeautifulSoup(Tag):

        if any(markup.startswith(prefix) for prefix in cant_start_with):
            if not space in markup:
-                if isinstance(markup, bytes):
-                    decoded_markup = markup.decode('utf-8', 'replace')
-                else:
-                    decoded_markup = markup
                warnings.warn(
                    '"%s" looks like a URL. Beautiful Soup is not an'
                    ' HTTP client. You should probably use an HTTP client like'
                    ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % decoded_markup
+                    ' that document to Beautiful Soup.' % cls._decode_markup(
+                        markup
+                    ),
+                    MarkupResemblesLocatorWarning
                )

    def _feed(self):
+        """Internal method that parses previously set markup, creating a large
+        number of Tag and NavigableString objects.
+        """
        # Convert the document to Unicode.
        self.builder.reset()

@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
            self.popTag()

    def reset(self):
+        """Reset this object to a state as though it had never parsed any
+        markup.
+        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
+        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
-        """Create a new tag associated with this soup."""
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param prefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+
+        """
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

-    def new_string(self, s, subclass=None):
-        """Create a new NavigableString associated with this soup."""
-        subclass = subclass or self.element_classes.get(
-            NavigableString, NavigableString
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+        
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
        )
-        return subclass(s)

-    def insert_before(self, successor):
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack and container is NavigableString:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+        
+    def new_string(self, s, subclass=None):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

-    def insert_after(self, successor):
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
+        """Internal method called by _popToTag when a tag is closed."""
        tag = self.tagStack.pop()
+        if tag.name in self.open_tag_counter:
+            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
-        #print "Pop", tag.name
+        if self.string_container_stack and tag == self.string_container_stack[-1]:
+            self.string_container_stack.pop()
+        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
-        #print "Push", tag.name
+        """Internal method called by handle_starttag when a tag is opened."""
+        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
+        if tag.name != self.ROOT_TAG_NAME:
+            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
+        if tag.name in self.builder.string_containers:
+            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
-
-        # Default container is NavigableString.
-        containerClass = containerClass or NavigableString
-
-        # The user may want us to instantiate some alias for the
-        # container class.
-        containerClass = self.element_classes.get(
-            containerClass, containerClass
-        )
-        
+        """Method called by the TreeBuilder when the end of a data segment
+        occurs.
+        """       
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
                    not self.parse_only.search(current_data)):
                return

+            containerClass = self.string_container(containerClass)
            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
-        """Add an object to the parse tree."""
+        """Method called by the TreeBuilder to integrate an object into the parse tree."""
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
@ -545,10 +656,19 @@ class BeautifulSoup(Tag):

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: It this is false, pops the tag stack up
+          to but *not* including the most recent instqance of the
+          given tag.
+
+        """
+        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return
@ -557,6 +677,8 @@ class BeautifulSoup(Tag):

        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
@ -568,15 +690,22 @@ class BeautifulSoup(Tag):

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
-        """Push a start tag on to the stack.
+        """Called by the tree builder when a new tag is encountered.

-        If this method returns None, the tag was rejected by the
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
-
-        # print "Start tag %s: %s" % (name, attrs)
+        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
        return tag

    def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        """
+        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
+        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)
-
+       
    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
+        """Returns a string or Unicode representation of the parse tree
+            as an HTML or XML document.

+        :param pretty_print: If this is True, indentation will be used to
+            make the document more readable.
+        :param eventual_encoding: The encoding of the final document.
+            If this is None, the document will be a Unicode string.
+        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
+            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+                # This is a special Python encoding; it can't actually
+                # go into an XML document because it means nothing
+                # outside of Python.
+                eventual_encoding = None
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
 _s = BeautifulSoup
 _soup = BeautifulSoup

@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):


 class StopParsing(Exception):
+    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
    pass

 class FeatureNotFound(ValueError):
+    """Exception raised by the BeautifulSoup constructor if no parser with the
+    requested features is found.
+    """
    pass


-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
 if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))