Update beautifulsoup4-4.10.0

2025-08-19 21:03:21 -07:00 · 2021-10-14 20:46:06 -07:00 · 2021-10-14 20:46:06 -07:00 · ab8fa4d5b3
commit ab8fa4d5b3
parent b581460b51
16 changed files with 4599 additions and 743 deletions
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@ -7,8 +7,11 @@ import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
    nonwhitespace_re
-    )
+)

 __all__ = [
    'HTMLTreeBuilder',
@ -27,18 +30,33 @@ HTML_5 = 'html5'


 class TreeBuilderRegistry(object):
-
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+    
    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of TreeBuilder. Its .features
+           attribute should list its features.
+        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
        if len(self.builders) == 0:
            # There are no builders at all.
            return None
@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()

 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
@ -96,7 +114,12 @@ class TreeBuilder(object):
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

+    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
    
    USE_DEFAULT = object()

@ -105,30 +128,39 @@ class TreeBuilder(object):
    
    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
-                 store_line_numbers=USE_DEFAULT):
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
-        TreeBuilder will not turn any values for attributes like
-        'class' into lists. Setting this do a dictionary will
-        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
-        for an example.
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this to a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.

-        Internally, these are called "CDATA list attributes", but that
-        probably doesn't make sense to an end-user, so the argument name
-        is `multi_valued_attributes`.
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
-        the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+         the way <pre> tags are treated in HTML. Tags in this list
+         are immune from pretty-printing; their contents will always be
+         output as-is.
+
+        :param string_containers: A dictionary mapping tag names to
+         the classes that should be instantiated to contain the textual
+         contents of those tags. The default is to use NavigableString
+         for every tag, no matter what the name. You can override the
+         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
-        line numbers and positions of the original markup, that
-        information will, by default, be stored in each corresponding
-        `Tag` object. You can turn this off by passing
-        store_line_numbers=False. If the parser you're using doesn't 
-        keep track of this information, then setting store_line_numbers=True
-        will do nothing.
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't 
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
        """
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
@ -139,15 +171,25 @@ class TreeBuilder(object):
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers == self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers
+        self.store_line_numbers = store_line_numbers 
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
        
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
        """
        self.soup = soup
        
    def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
        pass

    def can_be_empty_element(self, tag_name):
@ -159,23 +201,57 @@ class TreeBuilder(object):
        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.
+
+        :param tag_name: The name of a markup tag.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
    
    def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
@ -188,16 +264,36 @@ class TreeBuilder(object):
        results against other HTML fragments.

        This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on 
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.

-        Modifies its input in place.
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
+
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
        """
        if not attrs:
            return attrs
@ -225,7 +321,11 @@ class TreeBuilder(object):
        return attrs

 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """

    def feed(self, markup):
        raise NotImplementedError()
@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):

    def startElement(self, name, attrs):
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
-        #print "Start %s, %r" % (name, attrs)
+        #print("Start %s, %r" % (name, attrs))
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
-        #print "End %s" % name
+        #print("End %s" % name)
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }    
    
    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
    
    def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False
@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):

 def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
-    # I'm fairly sure this is not the best way to do this.
-    this_module = sys.modules['bs4.builder']
+    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

@ -363,6 +488,9 @@ def register_treebuilders_from(module):
            this_module.builder_registry.register(obj)

 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.
@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try: