Bump beautifulsoup4 from 4.10.0 to 4.11.1 (#1717)

* Bump beautifulsoup4 from 4.10.0 to 4.11.1 Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.10.0 to 4.11.1. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * Update beautifulsoup4==4.11.1 * Update soupsieve==2.3.2.post1 * Update requirements.txt Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
2025-08-14 02:26:58 -07:00 · 2022-05-16 20:46:21 -07:00 · 2022-05-16 20:46:21 -07:00 · 467ae352f5
commit 467ae352f5
parent a1fe0b04d7
28 changed files with 4846 additions and 2609 deletions
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -15,14 +15,13 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.10.0"
-__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
+__version__ = "4.11.1"
+__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

-
 from collections import Counter
 import os
 import re
@ -35,7 +34,11 @@ import warnings
 if sys.version_info.major < 3:
    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')

-from .builder import builder_registry, ParserRejectedMarkup
+from .builder import (
+    builder_registry,
+    ParserRejectedMarkup,
+    XMLParsedAsHTMLWarning,
+)
 from .dammit import UnicodeDammit
 from .element import (
    CData,
@ -67,7 +70,7 @@ class MarkupResemblesLocatorWarning(UserWarning):
    on disk.
    """

-
+   
 class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

@ -207,10 +210,10 @@ class BeautifulSoup(Tag):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
-                    'has been renamed to "%s."' % (old_name, new_name))
-                value = kwargs[old_name]
-                del kwargs[old_name]
-                return value
+                    'has been renamed to "%s."' % (old_name, new_name),
+                    DeprecationWarning
+                )
+                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
@ -305,51 +308,18 @@ class BeautifulSoup(Tag):
        self._namespaces = dict()
        self.parse_only = parse_only

-        self.builder.initialize_soup(self)
-
        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and not b'<' in markup)
                or (isinstance(markup, str) and not '<' in markup)
        ):
-            # Print out warnings for a couple beginner problems
+            # Issue warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
-            # just in case that's what the user really wants.
-            if (isinstance(markup, str)
-                and not os.path.supports_unicode_filenames):
-                possible_filename = markup.encode("utf8")
-            else:
-                possible_filename = markup
-            is_file = False
-            is_directory = False
-            try:
-                is_file = os.path.exists(possible_filename)
-                if is_file:
-                    is_directory = os.path.isdir(possible_filename)
-            except Exception as e:
-                # This is almost certainly a problem involving
-                # characters not valid in filenames on this
-                # system. Just let it go.
-                pass
-            if is_directory:
-                warnings.warn(
-                    '"%s" looks like a directory name, not markup. You may'
-                    ' want to open a file found in this directory and pass'
-                    ' the filehandle into Beautiful Soup.' % (
-                        self._decode_markup(markup)
-                    ),
-                    MarkupResemblesLocatorWarning
-                )
-            elif is_file:
-                warnings.warn(
-                    '"%s" looks like a filename, not markup. You should'
-                    ' probably open this file and pass the filehandle into'
-                    ' Beautiful Soup.' % self._decode_markup(markup),
-                    MarkupResemblesLocatorWarning
-                )
-            self._check_markup_is_url(markup)
+            # since that is sometimes the intended behavior.
+            if not self._markup_is_url(markup):
+                self._markup_resembles_filename(markup)                

        rejections = []
        success = False
@ -358,6 +328,7 @@ class BeautifulSoup(Tag):
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
+            self.builder.initialize_soup(self)
            try:
                self._feed()
                success = True
@ -393,10 +364,10 @@ class BeautifulSoup(Tag):
    def __getstate__(self):
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
-        if 'builder' in d and not self.builder.picklable:
+        if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
            d['builder'] = None
        return d
-
+    
    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
@ -411,11 +382,13 @@ class BeautifulSoup(Tag):
        return decoded

    @classmethod
-    def _check_markup_is_url(cls, markup):
+    def _markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A string.
+        :return: Whether or not the markup resembles a URL
+            closely enough to justify a warning.
        """
        if isinstance(markup, bytes):
            space = b' '
@ -424,20 +397,50 @@ class BeautifulSoup(Tag):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
-            return
+            return False

        if any(markup.startswith(prefix) for prefix in cant_start_with):
            if not space in markup:
                warnings.warn(
-                    '"%s" looks like a URL. Beautiful Soup is not an'
-                    ' HTTP client. You should probably use an HTTP client like'
-                    ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % cls._decode_markup(
-                        markup
-                    ),
+                    'The input looks more like a URL than markup. You may want to use'
+                    ' an HTTP client like requests to get the document behind'
+                    ' the URL, and feed that document to Beautiful Soup.',
                    MarkupResemblesLocatorWarning
                )
+                return True
+        return False

+    @classmethod
+    def _markup_resembles_filename(cls, markup):
+        """Error-handling method to raise a warning if incoming markup
+        resembles a filename.
+
+        :param markup: A bytestring or string.
+        :return: Whether or not the markup resembles a filename
+            closely enough to justify a warning.
+        """
+        path_characters = '/\\'
+        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
+        if isinstance(markup, bytes):
+            path_characters = path_characters.encode("utf8")
+            extensions = [x.encode('utf8') for x in extensions]
+        filelike = False
+        if any(x in markup for x in path_characters):
+            filelike = True
+        else:
+            lower = markup.lower()
+            if any(lower.endswith(ext) for ext in extensions):
+                filelike = True
+        if filelike:
+            warnings.warn(
+                'The input looks more like a filename than markup. You may'
+                ' want to open this file and pass the filehandle into'
+                ' Beautiful Soup.',
+                MarkupResemblesLocatorWarning
+            )
+            return True
+        return False
+    
    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
@ -689,7 +692,7 @@ class BeautifulSoup(Tag):
        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
-                        sourcepos=None):
+                        sourcepos=None, namespaces=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
@ -699,6 +702,8 @@ class BeautifulSoup(Tag):
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
+        :param namespaces: A dictionary of all namespace prefix mappings 
+            currently in scope in the document.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
@ -716,7 +721,8 @@ class BeautifulSoup(Tag):
        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
-            sourceline=sourceline, sourcepos=sourcepos
+            sourceline=sourceline, sourcepos=sourcepos,
+            namespaces=namespaces
        )
        if tag is None:
            return tag
@ -735,7 +741,7 @@ class BeautifulSoup(Tag):
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)
-
+        
    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)
@ -782,7 +788,9 @@ class BeautifulStoneSoup(BeautifulSoup):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
-            'it, pass features="xml" into the BeautifulSoup constructor.')
+            'it, pass features="xml" into the BeautifulSoup constructor.',
+            DeprecationWarning
+        )
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)