Update beautifulsoup4-4.10.0

commit ab8fa4d5b3 (parent b581460b51)
JonnyWong16 2021-10-14 20:46:06 -07:00
No known key found for this signature in database; GPG key ID: B1F1F9807184697A

16 changed files with 4599 additions and 743 deletions

bs4/__init__.py

@@ -1,6 +1,5 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
+"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
+
 http://www.crummy.com/software/BeautifulSoup/
 
 Beautiful Soup uses a pluggable XML or HTML parser to parse a
@@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.
 
-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
 and/or html5lib is installed.
 
 For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.1"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.10.0"
+__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 
 __all__ = ['BeautifulSoup']
 
+from collections import Counter
 import os
 import re
 import sys
 import traceback
 import warnings
 
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
 from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
@@ -42,28 +46,49 @@ from .element import (
     NavigableString,
     PageElement,
     ProcessingInstruction,
+    PYTHON_SPECIFIC_ENCODINGS,
     ResultSet,
+    Script,
+    Stylesheet,
     SoupStrainer,
     Tag,
+    TemplateString,
     )
 
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+    """The warning issued when BeautifulSoup has to guess what parser to
+    use -- probably because no parser was specified in the constructor.
+    """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+    """The warning issued when BeautifulSoup is given 'markup' that
+    actually looks like a resource locator -- a URL or a path to a file
+    on disk.
+    """
 
 class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
+    """A data structure representing a parsed HTML or XML document.
 
-    These methods will be called by the parser:
-      reset()
-      feed(markup)
+    Most of the methods you'll call on a BeautifulSoup object are inherited from
+    PageElement or Tag.
+
+    Internally, this class defines the basic interface called by the
+    tree builders when converting an HTML/XML document into a data
+    structure. The interface abstracts away the differences between
+    parsers. To write a new tree builder, you'll need to understand
+    these methods as a whole.
+
+    These methods will be called by the BeautifulSoup constructor:
+      * reset()
+      * feed(markup)
 
     The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass) # Ends the current data node
+      * handle_starttag(name, attrs) # See note about return value
+      * handle_endtag(name)
+      * handle_data(data) # Appends to the current data node
+      * endData(containerClass) # Ends the current data node
 
     No matter how complicated the underlying parser is, you should be
     able to build a tree using 'start tag' events, 'end tag' events,
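Both new warning classes are exported from the top-level bs4 package in 4.10.0, so downstream code can filter them by class instead of matching message text. A minimal sketch against the updated library (the markup literal is made up):

    import warnings
    from bs4 import BeautifulSoup, GuessedAtParserWarning

    # Silence only the guessed-parser warning...
    warnings.filterwarnings('ignore', category=GuessedAtParserWarning)
    soup = BeautifulSoup('<p>hello</p>')                  # would otherwise warn
    # ...or avoid the guess entirely by naming a parser.
    soup = BeautifulSoup('<p>hello</p>', 'html.parser')   # never warns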
@@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
     like HTML's <br> tag), call handle_starttag and then
     handle_endtag.
     """
+
+    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+    # a Tag with a .name. This name makes it clear the BeautifulSoup
+    # object isn't a real markup tag.
     ROOT_TAG_NAME = '[document]'
 
     # If the end-user gives no indication which tree builder they
     # want, look for one with these features.
     DEFAULT_BUILDER_FEATURES = ['html', 'fast']
 
+    # A string containing all ASCII whitespace characters, used in
+    # endData() to detect data chunks that seem 'empty'.
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
 
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  element_classes=None, **kwargs):
         """Constructor.
 
         :param markup: A string or a file-like object representing
          markup to be parsed.
 
-        :param features: Desirable features of the parser to be used. This
-         may be the name of a specific parser ("lxml", "lxml-xml",
-         "html.parser", or "html5lib") or it may be the type of markup
-         to be used ("html", "html5", "xml"). It's recommended that you
-         name a specific parser, so that Beautiful Soup gives you the
-         same results across platforms and virtual environments.
+        :param features: Desirable features of the parser to be
+         used. This may be the name of a specific parser ("lxml",
+         "lxml-xml", "html.parser", or "html5lib") or it may be the
+         type of markup to be used ("html", "html5", "xml"). It's
+         recommended that you name a specific parser, so that
+         Beautiful Soup gives you the same results across platforms
+         and virtual environments.
 
         :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.
 
        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.
 
        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.
 
        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.
 
        :param element_classes: A dictionary mapping BeautifulSoup
-        classes like Tag and NavigableString to other classes you'd
+        classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
-        built. This is useful for using subclasses to modify the
-        default behavior of Tag or NavigableString.
+        built. This is useful for subclassing Tag or NavigableString
+        to modify default behavior.
 
        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4; they will result in a warning and then be ignored.
+        Beautiful Soup 4; they will result in a warning and then be
+        ignored.
 
-        Apart from this, any keyword arguments passed into the BeautifulSoup
-        constructor are propagated to the TreeBuilder constructor. This
-        makes it possible to configure a TreeBuilder beyond saying
-        which one to use.
+        Apart from this, any keyword arguments passed into the
+        BeautifulSoup constructor are propagated to the TreeBuilder
+        constructor. This makes it possible to configure a
+        TreeBuilder by passing in arguments, not just by saying which
+        one to use.
        """
         if 'convertEntities' in kwargs:
             del kwargs['convertEntities']
             warnings.warn(
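The element_classes hook described in this docstring takes effect at parse time. A small sketch, assuming bs4 4.10.0; MyString is a hypothetical subclass, not part of this commit:

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class MyString(NavigableString):
        """Hypothetical stand-in instantiated instead of NavigableString."""

    soup = BeautifulSoup(
        '<p>text</p>', 'html.parser',
        element_classes={NavigableString: MyString},
    )
    assert type(soup.p.string) is MyString  # strings now come back as MyString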
@@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
             if not original_builder and not (
                 original_features == builder.NAME or
                 original_features in builder.ALTERNATE_NAMES
-            ):
+            ) and markup:
+                # The user did not tell us which TreeBuilder to use,
+                # and we had to guess. Issue a warning.
                 if builder.is_xml:
                     markup_type = "XML"
                 else:
@@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
                     parser=builder.NAME,
                     markup_type=markup_type
                 )
-                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+                warnings.warn(
+                    self.NO_PARSER_SPECIFIED_WARNING % values,
+                    GuessedAtParserWarning, stacklevel=2
+                )
         else:
             if kwargs:
                 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
         else:
             possible_filename = markup
         is_file = False
+        is_directory = False
         try:
             is_file = os.path.exists(possible_filename)
+            if is_file:
+                is_directory = os.path.isdir(possible_filename)
         except Exception as e:
             # This is almost certainly a problem involving
             # characters not valid in filenames on this
             # system. Just let it go.
             pass
-        if is_file:
-            if isinstance(markup, str):
-                markup = markup.encode("utf8")
+        if is_directory:
+            warnings.warn(
+                '"%s" looks like a directory name, not markup. You may'
+                ' want to open a file found in this directory and pass'
+                ' the filehandle into Beautiful Soup.' % (
+                    self._decode_markup(markup)
+                ),
+                MarkupResemblesLocatorWarning
+            )
+        elif is_file:
             warnings.warn(
                 '"%s" looks like a filename, not markup. You should'
                 ' probably open this file and pass the filehandle into'
-                ' Beautiful Soup.' % markup)
+                ' Beautiful Soup.' % self._decode_markup(markup),
+                MarkupResemblesLocatorWarning
+            )
         self._check_markup_is_url(markup)
 
         rejections = []
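The warnings above steer callers toward the supported pattern: open the file (or a file inside the directory) yourself and hand the filehandle to the constructor. Sketch, with a hypothetical path:

    from bs4 import BeautifulSoup

    # Passing 'index.html' as markup would trigger MarkupResemblesLocatorWarning.
    with open('index.html') as fp:
        soup = BeautifulSoup(fp, 'html.parser')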
@@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
+        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
         copy = type(self)(
             self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
         )
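As the new __copy__ docstring notes, a copy round-trips through UTF-8 and is re-parsed, so the result is a fully independent tree. Sketch, assuming 4.10.0 behavior:

    import copy
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>hi</p>', 'html.parser')
    clone = copy.copy(soup)      # encode to UTF-8, parse again
    assert clone == soup         # equal by structure...
    assert clone is not soup     # ...but a separate object tree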
@@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
         d['builder'] = None
         return d
 
-    @staticmethod
-    def _check_markup_is_url(markup):
-        """
-        Check if markup looks like it's actually a url and raise a warning
-        if so. Markup can be unicode or str (py2) / bytes (py3).
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
+        """Error-handling method to raise a warning if incoming markup looks
+        like a URL.
+
+        :param markup: A string.
         """
         if isinstance(markup, bytes):
             space = b' '
@@ -364,18 +428,20 @@ class BeautifulSoup(Tag):
         if any(markup.startswith(prefix) for prefix in cant_start_with):
             if not space in markup:
-                if isinstance(markup, bytes):
-                    decoded_markup = markup.decode('utf-8', 'replace')
-                else:
-                    decoded_markup = markup
                 warnings.warn(
                     '"%s" looks like a URL. Beautiful Soup is not an'
                     ' HTTP client. You should probably use an HTTP client like'
                     ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % decoded_markup
+                    ' that document to Beautiful Soup.' % cls._decode_markup(
+                        markup
+                    ),
+                    MarkupResemblesLocatorWarning
                 )
 
     def _feed(self):
+        """Internal method that parses previously set markup, creating a large
+        number of Tag and NavigableString objects.
+        """
         # Convert the document to Unicode.
         self.builder.reset()
@@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
             self.popTag()
 
     def reset(self):
+        """Reset this object to a state as though it had never parsed any
+        markup.
+        """
         Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
         self.hidden = 1
         self.builder.reset()
         self.current_data = []
         self.currentTag = None
         self.tagStack = []
+        self.open_tag_counter = Counter()
         self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
         self.pushTag(self)
 
     def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                 sourceline=None, sourcepos=None, **kwattrs):
-        """Create a new tag associated with this soup."""
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param prefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+        """
         kwattrs.update(attrs)
         return self.element_classes.get(Tag, Tag)(
             None, self.builder, name, namespace, nsprefix, kwattrs,
             sourceline=sourceline, sourcepos=sourcepos
         )
 
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
+        )
+
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack and container is NavigableString:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+
     def new_string(self, s, subclass=None):
-        """Create a new NavigableString associated with this soup."""
-        subclass = subclass or self.element_classes.get(
-            NavigableString, NavigableString
-        )
-        return subclass(s)
-
-    def insert_before(self, successor):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
         raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
 
-    def insert_after(self, successor):
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
         raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
 
     def popTag(self):
+        """Internal method called by _popToTag when a tag is closed."""
         tag = self.tagStack.pop()
+        if tag.name in self.open_tag_counter:
+            self.open_tag_counter[tag.name] -= 1
         if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
             self.preserve_whitespace_tag_stack.pop()
-        #print "Pop", tag.name
+        if self.string_container_stack and tag == self.string_container_stack[-1]:
+            self.string_container_stack.pop()
+        #print("Pop", tag.name)
         if self.tagStack:
             self.currentTag = self.tagStack[-1]
         return self.currentTag
 
     def pushTag(self, tag):
-        #print "Push", tag.name
+        """Internal method called by handle_starttag when a tag is opened."""
+        #print("Push", tag.name)
         if self.currentTag is not None:
             self.currentTag.contents.append(tag)
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]
+        if tag.name != self.ROOT_TAG_NAME:
+            self.open_tag_counter[tag.name] += 1
         if tag.name in self.builder.preserve_whitespace_tags:
             self.preserve_whitespace_tag_stack.append(tag)
+        if tag.name in self.builder.string_containers:
+            self.string_container_stack.append(tag)
 
     def endData(self, containerClass=None):
-
-        # Default container is NavigableString.
-        containerClass = containerClass or NavigableString
-
-        # The user may want us to instantiate some alias for the
-        # container class.
-        containerClass = self.element_classes.get(
-            containerClass, containerClass
-        )
-
+        """Method called by the TreeBuilder when the end of a data segment
+        occurs.
+        """
         if self.current_data:
             current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
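The combined effect of string_container_stack, the builder's string_containers map, and string_container() is that text inside <script>, <style>, and <template> is now instantiated as a dedicated NavigableString subclass. Sketch against 4.10.0:

    from bs4 import BeautifulSoup
    from bs4.element import Script, Stylesheet

    soup = BeautifulSoup('<script>alert(1)</script><style>p {}</style>',
                         'html.parser')
    assert isinstance(soup.script.string, Script)      # not a plain NavigableString
    assert isinstance(soup.style.string, Stylesheet)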
@@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
                     not self.parse_only.search(current_data)):
                 return
 
+        containerClass = self.string_container(containerClass)
         o = containerClass(current_data)
         self.object_was_parsed(o)
 
     def object_was_parsed(self, o, parent=None, most_recent_element=None):
-        """Add an object to the parse tree."""
+        """Method called by the TreeBuilder to integrate an object into the parse tree."""
         if parent is None:
             parent = self.currentTag
         if most_recent_element is not None:
@@ -545,10 +656,19 @@ class BeautifulSoup(Tag):
     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: It this is false, pops the tag stack up
+          to but *not* including the most recent instqance of the
+          given tag.
+        """
+        #print("Popping to %s" % name)
         if name == self.ROOT_TAG_NAME:
             # The BeautifulSoup object itself can never be popped.
             return
@@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
         stack_size = len(self.tagStack)
         for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
             t = self.tagStack[i]
             if (name == t.name and nsprefix == t.prefix):
                 if inclusivePop:
@@ -568,15 +690,22 @@ class BeautifulSoup(Tag):
     def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                         sourcepos=None):
-        """Push a start tag on to the stack.
+        """Called by the tree builder when a new tag is encountered.
 
-        If this method returns None, the tag was rejected by the
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
         SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """
-        # print "Start tag %s: %s" % (name, attrs)
+        # print("Start tag %s: %s" % (name, attrs))
         self.endData()
 
         if (self.parse_only and len(self.tagStack) <= 1
@@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
             return tag
 
     def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        """
+        #print("End tag: " + name)
         self.endData()
         self._popToTag(name, nsprefix)
 
     def handle_data(self, data):
+        """Called by the tree builder when a chunk of textual data is encountered."""
         self.current_data.append(data)
 
     def decode(self, pretty_print=False,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
+        """Returns a string or Unicode representation of the parse tree
+            as an HTML or XML document.
+
+        :param pretty_print: If this is True, indentation will be used to
+            make the document more readable.
+        :param eventual_encoding: The encoding of the final document.
+            If this is None, the document will be a Unicode string.
+        """
         if self.is_xml:
             # Print the XML declaration
             encoding_part = ''
+            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+                # This is a special Python encoding; it can't actually
+                # go into an XML document because it means nothing
+                # outside of Python.
+                eventual_encoding = None
             if eventual_encoding != None:
                 encoding_part = ' encoding="%s"' % eventual_encoding
             prefix = '<?xml version="1.0"%s?>\n' % encoding_part
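The PYTHON_SPECIFIC_ENCODINGS guard only affects the XML declaration. A quick illustration of eventual_encoding, assuming the lxml-backed "xml" feature is available:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<doc/>', 'xml')
    soup.decode()                         # '<?xml version="1.0" encoding="utf-8"?>...'
    soup.decode(eventual_encoding=None)   # '<?xml version="1.0"?>...' (Unicode output)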
@@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
         return prefix + super(BeautifulSoup, self).decode(
             indent_level, eventual_encoding, formatter)
 
-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
 _s = BeautifulSoup
 _soup = BeautifulSoup
@@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):
 
 class StopParsing(Exception):
+    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
     pass
 
 class FeatureNotFound(ValueError):
+    """Exception raised by the BeautifulSoup constructor if no parser with the
+    requested features is found.
+    """
     pass
 
 
-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))

bs4/builder/__init__.py

@@ -7,8 +7,11 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
     nonwhitespace_re
     )
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -27,18 +30,33 @@ HTML_5 = 'html5'
 
 class TreeBuilderRegistry(object):
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
 
     def __init__(self):
         self.builders_for_feature = defaultdict(list)
         self.builders = []
 
     def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of Treebuilder. its .features
+           attribute should list its features.
+        """
         for feature in treebuilder_class.features:
             self.builders_for_feature[feature].insert(0, treebuilder_class)
         self.builders.insert(0, treebuilder_class)
 
     def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+           provided, the most recently registered TreeBuilder subclass
+           will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+           registered subclass with all the requested features.
+        """
         if len(self.builders) == 0:
             # There are no builders at all.
             return None
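Lookup usage, for reference; both calls below go through the public registry object this module exports:

    from bs4.builder import builder_registry

    builder_class = builder_registry.lookup('html', 'fast')  # best HTML builder, or None
    lxml_class = builder_registry.lookup('lxml')             # None unless lxml is installed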
@@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()
 
 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""
 
     NAME = "[Unknown tree builder]"
     ALTERNATE_NAMES = []
@@ -96,7 +114,12 @@ class TreeBuilder(object):
     # comma-separated list of CDATA, rather than a single CDATA.
     DEFAULT_CDATA_LIST_ATTRIBUTES = {}
 
+    # Whitespace should be preserved inside these tags.
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
 
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
+
     USE_DEFAULT = object()
@@ -105,30 +128,39 @@ class TreeBuilder(object):
 
     def __init__(self, multi_valued_attributes=USE_DEFAULT,
                  preserve_whitespace_tags=USE_DEFAULT,
-                 store_line_numbers=USE_DEFAULT):
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+    ):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
          TreeBuilder will not turn any values for attributes like
-         'class' into lists. Setting this do a dictionary will
+         'class' into lists. Setting this to a dictionary will
          customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
          for an example.
 
         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.
 
        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+        are immune from pretty-printing; their contents will always be
+        output as-is.
+
+       :param string_containers: A dictionary mapping tag names to
+        the classes that should be instantiated to contain the textual
+        contents of those tags. The default is to use NavigableString
+        for every tag, no matter what the name. You can override the
+        default by changing DEFAULT_STRING_CONTAINERS.
 
        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
@@ -139,15 +171,25 @@ class TreeBuilder(object):
             self.preserve_whitespace_tags = preserve_whitespace_tags
         if store_line_numbers == self.USE_DEFAULT:
             store_line_numbers = self.TRACKS_LINE_NUMBERS
         self.store_line_numbers = store_line_numbers
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
 
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
         """
         self.soup = soup
 
     def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
         pass
 
     def can_be_empty_element(self, tag_name):
@@ -159,23 +201,57 @@ class TreeBuilder(object):
         For instance: an HTMLBuilder does not consider a <p> tag to be
         an empty-element tag (it's not in
         HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".
 
         The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
         be left alone.
+
+        :param tag_name: The name of a markup tag.
         """
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
 
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
         yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
@@ -188,16 +264,36 @@ class TreeBuilder(object):
         results against other HTML fragments.
 
         This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
         """
         return fragment
 
     def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         return False
 
     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.
 
-        Modifies its input in place.
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
+
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
+        """
         if not attrs:
             return attrs
@@ -225,7 +321,11 @@ class TreeBuilder(object):
         return attrs
 
 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """
 
     def feed(self, markup):
         raise NotImplementedError()
@@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):
     def startElement(self, name, attrs):
         attrs = dict((key[1], value) for key, value in list(attrs.items()))
-        #print "Start %s, %r" % (name, attrs)
+        #print("Start %s, %r" % (name, attrs))
         self.soup.handle_starttag(name, attrs)
 
     def endElement(self, name):
-        #print "End %s" % name
+        #print("End %s" % name)
         self.soup.handle_endtag(name)
 
     def startElementNS(self, nsTuple, nodeName, attrs):
@@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
     # but it may do so eventually, and this information is available if
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
 
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }
+
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
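Because the BeautifulSoup constructor forwards unrecognized keyword arguments to the TreeBuilder, this mapping can also be overridden per-soup via string_containers. Sketch, assuming 4.10.0; ParagraphString is hypothetical:

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class ParagraphString(NavigableString):
        """Hypothetical container for text found inside <p> tags."""

    soup = BeautifulSoup('<p>hello</p>', 'html.parser',
                         string_containers={'p': ParagraphString})
    assert type(soup.p.string) is ParagraphString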
@@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
 
     def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
@@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):
 
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
-    # I'm fairly sure this is not the best way to do this.
-    this_module = sys.modules['bs4.builder']
+    this_module = sys.modules[__name__]
     for name in module.__all__:
         obj = getattr(module, name)
@@ -363,6 +488,9 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
     def __init__(self, message_or_exception):
         """Explain why the parser rejected the given markup, either
         with a textual explanation or another exception.
@@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:

bs4/builder/_html5lib.py

@@ -39,7 +39,18 @@ except ImportError as e:
     new_html5lib = True
 
 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
 
     NAME = "html5lib"
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                 "", "html.parser", store_line_numbers=store_line_numbers,
                 **kwargs
             )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
         # This will be set later to an html5lib.html5parser.HTMLParser
@@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
-
         if attributes is not None and len(attributes) > 0:
-
             converted_attributes = []
             for name, value in list(attributes.items()):
                 if isinstance(name, tuple):
@@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):
     def reparentChildren(self, new_parent):
         """Move all of this tag's children into another tag."""
-        # print "MOVE", self.element.contents
-        # print "FROM", self.element
-        # print "TO", new_parent.element
+        # print("MOVE", self.element.contents)
+        # print("FROM", self.element)
+        # print("TO", new_parent.element)
 
         element = self.element
         new_parent_element = new_parent.element
@@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
         element.contents = []
         element.next_element = final_next_element
 
-        # print "DONE WITH MOVE"
-        # print "FROM", self.element
-        # print "TO", new_parent_element
+        # print("DONE WITH MOVE")
+        # print("FROM", self.element)
+        # print("TO", new_parent_element)
 
     def cloneNode(self):
         tag = self.soup.new_tag(self.element.name, self.namespace)

bs4/builder/_htmlparser.py

@@ -8,7 +8,7 @@ __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from future.moves.html.parser import HTMLParser
+from html.parser import HTMLParser
 
 try:
     from html.parser import HTMLParseError
@@ -53,8 +53,30 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
+    """A subclass of the Python standard library's HTMLParser class, which
+    listens for HTMLParser events and translates them into calls
+    to Beautiful Soup's tree construction API.
+    """
+
+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
+
     def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
         HTMLParser.__init__(self, *args, **kwargs)
 
         # Keep a list of empty-element tags that were encountered
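The on_duplicate_attribute setting travels from the BeautifulSoup constructor through HTMLParserTreeBuilder's kwargs plumbing (see further down this file). Minimal sketch:

    from bs4 import BeautifulSoup

    markup = '<a class="first" class="second"></a>'
    soup = BeautifulSoup(markup, 'html.parser',
                         on_duplicate_attribute='ignore')
    soup.a['class']   # ['first'] -- the earliest value was kept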
@@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.already_closed_empty_element = []
 
     def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although this
-        requirement doesn't appear to be documented.
+        """In Python 3, HTMLParser subclasses must implement error(), although
+        this requirement doesn't appear to be documented.
 
-        In Python 2, HTMLParser implements error() as raising an exception.
+        In Python 2, HTMLParser implements error() by raising an exception,
+        which we don't want to do.
 
-        In any event, this method is called only on very strange markup and our best strategy
-        is to pretend it didn't happen and keep going.
+        In any event, this method is called only on very strange
+        markup and our best strategy is to pretend it didn't happen
+        and keep going.
         """
         warnings.warn(msg)
 
     def handle_startendtag(self, name, attrs):
-        # This is only called when the markup looks like
-        # <tag/>.
+        """Handle an incoming empty-element tag.
+
+        This is only called when the markup looks like <tag/>.
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        """
         # is_startend() tells handle_starttag not to close the tag
         # just because its name matches a known empty-element tag. We
         # know that this is an empty-element tag and we want to call
@@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_endtag(name)
 
     def handle_starttag(self, name, attrs, handle_empty_element=True):
+        """Handle an opening tag, e.g. '<tag>'
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        :param handle_empty_element: True if this tag is known to be
+            an empty-element tag (i.e. there is not expected to be any
+            closing tag).
+        """
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # for consistency with the other tree builders.
             if value is None:
                 value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
             attrvalue = '""'
-        #print "START", name
+        #print("START", name)
         sourceline, sourcepos = self.getpos()
         tag = self.soup.handle_starttag(
             name, None, None, attr_dict, sourceline=sourceline,
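For the callable branch above, the callable receives the attribute dictionary built so far, the duplicated name, and the newest value. A sketch that accumulates every value into a list:

    from bs4 import BeautifulSoup

    def accumulate(attrs, key, value):
        # Turn repeated attributes into a list of all values seen.
        existing = attrs[key]
        if not isinstance(existing, list):
            existing = [existing]
        existing.append(value)
        attrs[key] = existing

    soup = BeautifulSoup('<a href="a" href="b"></a>', 'html.parser',
                         on_duplicate_attribute=accumulate)
    soup.a['href']    # ['a', 'b']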
@@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.append(name)
 
     def handle_endtag(self, name, check_already_closed=True):
-        #print "END", name
+        """Handle a closing tag, e.g. '</tag>'
+
+        :param name: A tag name.
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
+        #print("END", name)
         if check_already_closed and name in self.already_closed_empty_element:
             # This is a redundant end tag for an empty-element tag.
             # We've already called handle_endtag() for it, so just
             # check it off the list.
-            # print "ALREADY CLOSED", name
+            #print("ALREADY CLOSED", name)
             self.already_closed_empty_element.remove(name)
         else:
             self.soup.handle_endtag(name)
 
     def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
 
     def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
         # XXX workaround for a bug in HTMLParser. Remove this once
         # it's fixed in all supported versions.
         # http://bugs.python.org/issue13633
@@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.handle_data(data)
 
     def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character(s) and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
         if character is not None:
             data = character
@@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.handle_data(data)
 
     def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(Comment)
 
     def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
         self.soup.endData()
-        if data.startswith("DOCTYPE "):
-            data = data[len("DOCTYPE "):]
-        elif data == 'DOCTYPE':
-            # i.e. "<!DOCTYPE>"
-            data = ''
+        data = data[len("DOCTYPE "):]
         self.soup.handle_data(data)
         self.soup.endData(Doctype)
 
     def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
         if data.upper().startswith('CDATA['):
             cls = CData
             data = data[len('CDATA['):]
@@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.soup.endData(cls)
 
     def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
 
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
+    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
+    found in the Python standard library.
+    """
+
     is_xml = False
     picklable = True
     NAME = HTMLPARSER
@ -221,36 +303,88 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the # The html.parser knows which line number and position in the
# original file is the source of an element. # original file is the source of an element.
TRACKS_LINE_NUMBERS = True TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
"""Constructor.
:param parser_args: Positional arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param parser_kwargs: Keyword arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param kwargs: Keyword arguments for the superclass constructor.
"""
# Some keyword arguments will be pulled out of kwargs and placed
# into parser_kwargs.
extra_parser_kwargs = dict()
for arg in ('on_duplicate_attribute',):
if arg in kwargs:
value = kwargs.pop(arg)
extra_parser_kwargs[arg] = value
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding """Run any preliminary steps necessary to make incoming markup
declared within markup, whether any characters had to be acceptable to the parser.
replaced with REPLACEMENT CHARACTER).
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
""" """
if isinstance(markup, str):
# Parse Unicode as-is.
yield (markup, None, None, False)
return
# Ask UnicodeDammit to sniff the most likely encoding.
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
dammit = UnicodeDammit(
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=True,
exclude_encodings=exclude_encodings
)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
"""Run some incoming markup through some parsing process,
populating the `BeautifulSoup` object in self.soup.
"""
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
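As an illustration of the encoding-priority change above (not part of the diff itself): the user-specified encoding is now handed to UnicodeDammit as a "known definite" encoding, while the document's own claim becomes a lower-priority "user" encoding. A minimal sketch; the sample bytes and encodings are mine:

    from bs4.dammit import UnicodeDammit

    data = "Sacr\xe9 bleu!".encode("latin-1")
    dammit = UnicodeDammit(
        data,
        known_definite_encodings=["latin-1"],  # tried first, as a definite encoding
        user_encodings=["utf-8"],              # tried later, at lower priority
        is_html=False,
    )
    print(dammit.unicode_markup)     # Sacré bleu!
    print(dammit.original_encoding)  # latin-1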
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
# See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
:param soup: A `BeautifulSoup`.
""" """
super(LXMLTreeBuilderForXML, self).initialize_soup(soup) super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS) self._register_namespaces(self.DEFAULT_NSMAPS)
@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document. while parsing the document.
This might be useful later on when creating CSS selectors. This might be useful later on when creating CSS selectors.
:param mapping: A dictionary mapping namespace prefixes to URIs.
""" """
for key, value in list(mapping.items()): for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces: if key and key not in self.soup._namespaces:
@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value self.soup._namespaces[key] = value
def default_parser(self, encoding): def default_parser(self, encoding):
"""Find the default parser for the given encoding.
:param encoding: A string.
:return: Either a parser object or a class, which
will be instantiated with default arguments.
"""
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
"""Instantiate an appropriate parser for the given encoding.
:param encoding: A string.
:return: A parser object such as an `etree.XMLParser`.
"""
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(
target=self, strip_cdata=False, recover=True, encoding=encoding
)
return parser
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
@@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
""" """Run any preliminary steps necessary to make incoming markup
:yield: A series of 4-tuples. acceptable to the parser.
lxml really wants to get a bytestring and convert it to
Unicode itself. So instead of using UnicodeDammit to convert
the bytestring to Unicode using different encodings, this
implementation uses EncodingDetector to iterate over the
encodings, and tell lxml to try to parse the document as each
one in turn.
:param markup: Some markup -- hopefully a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
""" """
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
@@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
detector = EncodingDetector(
markup, known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings, is_html=is_html,
exclude_encodings=exclude_encodings
)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
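Not part of the commit, but a sketch of how the detector drives this loop: candidate encodings come out in priority order (known definite encodings first, then lower-priority candidates such as the document's declared encoding), and lxml is asked to parse under each one until a parse succeeds. The sample markup here is mine:

    from bs4.dammit import EncodingDetector

    markup = b'<html><head><meta charset="euc-jp"></head></html>'
    detector = EncodingDetector(
        markup,
        known_definite_encodings=["utf-8"],  # e.g. a user-specified encoding
        is_html=True,
    )
    for encoding in detector.encodings:
        print(encoding)  # 'utf-8' first, with the declared 'euc-jp' among the candidates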
@@ -1,4 +0,0 @@
import requests
data = requests.get("https://www.crummy.com/").content
from bs4 import _s
data = [x for x in _s(data).block_text()]
File diff suppressed because it is too large
@@ -20,9 +20,13 @@ import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems.
print("Diagnostic running on Beautiful Soup %s" % __version__)
print("Python version %s" % sys.version) :param data: A string containing markup that needs to be explained.
:return: None; diagnostics are printed to standard output.
"""
print(("Diagnostic running on Beautiful Soup %s" % __version__))
print(("Python version %s" % sys.version))
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@@ -39,65 +43,76 @@ def diagnose(data):
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
except ImportError as e:
print(
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
try:
import html5lib
print(("Found html5lib version %s" % html5lib.__version__))
except ImportError as e:
print(
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print("")
for parser in basic_parsers:
print(("Trying to parse your markup with %s" % parser))
success = False
try:
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print(("Here's what %s did with the markup:" % parser))
print((soup.prettify()))
print(("-" * 80))
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
Soup code is running. You can use this to determine whether
an lxml-specific problem is in Beautiful Soup's lxml tree builders
or in lxml itself.
:param data: Some markup.
:param html: If True, markup will be parsed with lxml's HTML parser.
if False, lxml's XML parser will be used.
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print(("%s, %4s, %s" % (event, element.tag, element.text))) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Subclass of HTMLParser that announces parse events, without doing
anything else.
You can use this to get a picture of how html.parser sees a given
document. The easiest way to do this is to call `htmlparser_trace`.
"""
def _p(self, s):
print(s)
@@ -134,6 +149,8 @@ def htmlparser_trace(data):
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
:param data: Some markup.
""" """
parser = AnnouncingParser() parser = AnnouncingParser()
parser.feed(data) parser.feed(data)
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print("Comparative parser benchmark on Beautiful Soup %s" % __version__) print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
data = rdoc(num_elements) data = rdoc(num_elements)
print("Generated a large invalid HTML document (%d bytes)." % len(data)) print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
@@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
b = time.time()
success = True
except Exception as e:
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
def profile(num_elements=100000, parser="lxml"):
"""Use Python's profiler on a randomly generated document."""
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
@@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
stats.sort_stats("cumulative")
stats.print_stats('_html5lib|bs4', 50)
# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
diagnose(sys.stdin.read())
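For illustration (not part of the diff), typical usage of the diagnostic entry point from application code:

    from bs4.diagnose import diagnose

    # Prints the Beautiful Soup and Python versions, then shows how each
    # installed parser handles the markup.
    diagnose("<html><body><p>Unclosed paragraph")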
File diff suppressed because it is too large
@@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
Formatters are passed in as the `formatter` argument to methods
like `PageElement.encode`. Most people won't need to think about
formatters, and most people who need to think about them can pass
in one of these predefined strings as `formatter` rather than
making a new Formatter object:
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
* 'html5' - HTML entity substitution for HTML5 documents, as
well as some optimizations in the way tags are rendered.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
For XML documents:
* 'html' - Entity substitution for XHTML documents.
* 'minimal' - Only make the substitutions necessary to guarantee
valid XML. (default)
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
""" """
# Registries of XML and HTML formatters. # Registries of XML and HTML formatters.
XML_FORMATTERS = {} XML_FORMATTERS = {}
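An illustrative sketch (not part of the commit) of passing these predefined formatter names to an output method:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>A &amp; B</p>", "html.parser")
    print(soup.p.encode(formatter="minimal"))  # b'<p>A &amp; B</p>' - just enough for valid HTML
    print(soup.p.encode(formatter=None))       # b'<p>A & B</p>' - no substitution at all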
@@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
empty_attributes_are_booleans=False,
):
"""Constructor.
:param language: This should be Formatter.XML if you are formatting
XML markup and Formatter.HTML if you are formatting HTML markup.
:param entity_substitution: A function to call to replace special
characters with XML/HTML entities. For examples, see
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
:param void_element_close_prefix: By default, void elements
are represented as <tag/> (XML rules) rather than <tag>
(HTML rules). To get <tag>, pass in the empty string.
:param cdata_containing_tags: The list of tags that are defined
as containing CDATA in this dialect. For example, in HTML,
<script> and <style> tags are defined as containing CDATA,
and their contents should not be formatted.
:param empty_attributes_are_booleans: Render attributes whose value
is the empty string as HTML-style boolean attributes.
(Attributes whose value is None are always rendered this way.)
""" """
self.language = language self.language = language
self.entity_substitution = entity_substitution self.entity_substitution = entity_substitution
@@ -39,9 +76,17 @@ class Formatter(EntitySubstitution):
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
self.empty_attributes_are_booleans=empty_attributes_are_booleans
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
text.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
if not self.entity_substitution:
return ns
from .element import NavigableString
@@ -54,21 +99,41 @@ class Formatter(EntitySubstitution):
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want.
By default, attributes are sorted alphabetically. This makes
behavior consistent between Python 2 and Python 3, and preserves
backwards compatibility with older versions of Beautiful Soup.
If `empty_attributes_are_booleans` is True, then attributes whose
values are set to the empty string will be treated as boolean
attributes.
"""
if tag.attrs is None:
return []
return sorted(
(k, (None if self.empty_attributes_are_booleans and v == '' else v))
for k, v in list(tag.attrs.items())
)
class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
@@ -80,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix=None,
empty_attributes_are_booleans=True,
) )
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml entity_substitution=EntitySubstitution.substitute_xml
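A sketch (not from the commit) of the new empty_attributes_are_booleans option as wired into the 'html5' formatter:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<option selected=""></option>', 'html.parser')
    # The 'html5' formatter renders empty-string attributes as booleans...
    print(soup.option.decode(formatter="html5"))  # <option selected></option>
    # ...while the default 'html' formatter keeps the empty string.
    print(soup.option.decode(formatter="html"))   # <option selected=""></option>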
@@ -8,6 +8,7 @@ import pickle
import copy
import functools
import unittest
import warnings
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
@@ -15,7 +16,10 @@ from bs4.element import (
Comment,
ContentMetaAttributeValue,
Doctype,
PYTHON_SPECIFIC_ENCODINGS,
SoupStrainer,
Script,
Stylesheet,
Tag
)
@@ -83,8 +87,22 @@ class SoupTest(unittest.TestCase):
if compare_parsed_to is None:
compare_parsed_to = to_parse
# Verify that the documents come out the same.
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
# Also run some checks on the BeautifulSoup object itself:
# Verify that every tag that was opened was eventually closed.
# There are no tags in the open tag counter.
assert all(v==0 for v in list(obj.open_tag_counter.values()))
# The only tag in the tag stack is the one for the root
# document.
self.assertEqual(
[obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
)
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
@@ -211,7 +229,41 @@ class SoupTest(unittest.TestCase):
return child
class TreeBuilderSmokeTest(object):
# Tests that are common to HTML and XML tree builders.
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\n<![\xff\xfe\xfe\xcd\x00',
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
# https://bugs.python.org/issue34480
#
#b'<![n\x00'
]
for markup in bad_markup:
with warnings.catch_warnings(record=False):
soup = self.soup(markup)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
"""A basic test of a treebuilder's competence. """A basic test of a treebuilder's competence.
@ -233,6 +285,22 @@ class HTMLTreeBuilderSmokeTest(object):
new_tag = soup.new_tag(name) new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element) self.assertEqual(True, new_tag.is_empty_element)
def test_special_string_containers(self):
soup = self.soup(
"<style>Some CSS</style><script>Some Javascript</script>"
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
soup = self.soup(
"<style><!--Some CSS--></style>"
)
assert isinstance(soup.style.string, Stylesheet)
# The contents of the style tag resemble an HTML comment, but
# it's not treated as a comment.
self.assertEqual("<!--Some CSS-->", soup.style.string)
assert isinstance(soup.style.string, Stylesheet)
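Outside the test suite, the special string containers look like this (illustrative, not part of the diff):

    from bs4 import BeautifulSoup
    from bs4.element import Script, Stylesheet

    soup = BeautifulSoup(
        "<style>p {color: red}</style><script>alert('hi')</script>",
        "html.parser",
    )
    assert isinstance(soup.style.string, Stylesheet)  # still a NavigableString subclass
    assert isinstance(soup.script.string, Script)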
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
@@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
doctype_str
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_mixed_case_doctype(self):
# A lowercase or mixed-case doctype becomes a Doctype.
for doctype_fragment in ("doctype", "DocType"):
doctype_str, soup = self._document_with_doctype(
"html", doctype_fragment
)
# Make sure a Doctype object was created and that the DOCTYPE
# is uppercase.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, "html")
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
b"<!DOCTYPE html>"
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
@@ -532,7 +624,7 @@ Hello, world!
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -594,7 +686,7 @@ Hello, world!
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
@@ -779,11 +871,44 @@ Hello, world!
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_python_specific_encodings_not_used_in_charset(self):
# You can encode an HTML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document. Instead, the document will appear to
# have no encoding.
for markup in [
b'<meta charset="utf8"></head>'
b'<meta id="encoding" charset="utf-8" />'
]:
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'meta charset=""' in encoded
assert encoding.encode("ascii") not in encoded
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_closing_tag_with_no_opening_tag(self):
# Without BeautifulSoup.open_tag_counter, the </span> tag will
# cause _popToTag to be called over and over again as we look
# for a <span> tag that wasn't there. The result is that 'text2'
# will show up outside the body of the document.
soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
self.assertEqual(
"<body><div><p>text1</p>text2</div></body>", soup.body.decode()
)
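An illustrative sketch of the behavior this test locks in (not part of the commit), using html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<body><div><p>text1</p></span>text2</div></body>", "html.parser")
    # The stray </span> no longer pushes 'text2' outside the enclosing <div>.
    print(soup.body.decode())  # <body><div><p>text1</p>text2</div></body>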
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
@@ -791,7 +916,7 @@ Hello, world!
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
@@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_python_specific_encodings_not_used_in_xml_declaration(self):
# You can encode an XML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document.
markup = b"""<?xml version="1.0"?>\n<foo/>"""
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'<?xml version="1.0"?>' in encoded
assert encoding.encode("ascii") not in encoded
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
@@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
@@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_special_string_containers(self):
# The html5lib tree builder doesn't support this standard feature,
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
def test_html5_attributes(self):
# The html5lib TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
#
# This is a copy of the same test from
# HTMLParserTreeBuilderSmokeTest. It's not in the superclass
# because the lxml HTML TreeBuilder _doesn't_ work this way.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)
@ -3,6 +3,7 @@ trees."""
from pdb import set_trace from pdb import set_trace
import pickle import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_on_duplicate_attribute(self):
# The html.parser tree builder has a variety of ways of
# handling a tag that contains the same attribute multiple times.
markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
# If you don't provide any particular value for
# on_duplicate_attribute, later values replace earlier values.
soup = self.soup(markup)
self.assertEqual("url3", soup.a['href'])
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
# You can also get this behavior explicitly.
def assert_attribute(on_duplicate_attribute, expected):
soup = self.soup(
markup, on_duplicate_attribute=on_duplicate_attribute
)
self.assertEqual(expected, soup.a['href'])
# Verify that non-duplicate attributes are treated normally.
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
assert_attribute(None, "url3")
assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
# You can ignore subsequent values in favor of the first.
assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
# And you can pass in a callable that does whatever you want.
def accumulate(attrs, key, value):
if not isinstance(attrs[key], list):
attrs[key] = [attrs[key]]
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
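For illustration (not part of the diff), the new constructor argument in ordinary use:

    from bs4 import BeautifulSoup

    markup = '<a href="url1" href="url2">'
    # Default behavior: the last value wins.
    print(BeautifulSoup(markup, "html.parser").a["href"])  # url2
    # 'ignore': keep the first value seen.
    soup = BeautifulSoup(markup, "html.parser", on_duplicate_attribute="ignore")
    print(soup.a["href"])  # url1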
def test_html5_attributes(self):
# The html.parser TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
""" """
parser = BeautifulSoupHTMLParser() parser = BeautifulSoupHTMLParser()
parser.error("don't crash") with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
@@ -3,6 +3,7 @@
from pdb import set_trace
import logging
import os
import unittest
import sys
import tempfile
@@ -10,6 +11,8 @@ import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
GuessedAtParserWarning,
MarkupResemblesLocatorWarning,
)
from bs4.builder import (
TreeBuilder,
@@ -29,7 +32,6 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
)
from bs4.testing import (
default_builder,
@@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
def test_alternate_string_containers(self):
# Test the ability to customize the string containers for
# different types of tags.
class PString(NavigableString):
pass
class BString(NavigableString):
pass
soup = self.soup(
"<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
string_containers = {
'b': BString,
'p': PString,
}
)
# The string before the <p> tag is a regular NavigableString.
assert isinstance(soup.div.contents[0], NavigableString)
# The string inside the <p> tag, but not inside the <i> tag,
# is a PString.
assert isinstance(soup.p.contents[0], PString)
# Every string inside the <b> tag is a BString, even the one that
# was also inside an <i> tag.
for s in soup.b.strings:
assert isinstance(s, BString)
# Now that parsing was complete, the string_container_stack
# (where this information was kept) has been cleared out.
self.assertEqual([], soup.string_container_stack)
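A standalone sketch of the same feature (illustrative only; the EmString class is hypothetical):

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class EmString(NavigableString):
        """Hypothetical container for strings found inside <em> tags."""

    soup = BeautifulSoup(
        "<p>plain <em>emphasized</em></p>", "html.parser",
        string_containers={"em": EmString},
    )
    assert type(soup.em.string) is EmString
    # Strings outside <em> remain plain NavigableStrings.
    assert type(soup.p.contents[0]) is NavigableString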
class TestWarnings(SoupTest):
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
return w
raise Exception("%s warning not found in %r" % cls, warnings)
def _assert_no_parser_specified(self, w):
warning = self._assert_warning(w, GuessedAtParserWarning)
message = str(warning.message)
self.assertTrue(
message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>")
self._assert_no_parser_specified(w)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html")
self._assert_no_parser_specified(w)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
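Illustrative sketch (not part of the commit) of catching the new warning class in application code:

    import warnings
    from bs4 import BeautifulSoup, GuessedAtParserWarning

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        BeautifulSoup("<a><b></b></a>")  # no parser specified
    assert any(isinstance(w.message, GuessedAtParserWarning) for w in caught)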
def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a filename" in str(warning.message))
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual([], w)
def test_directory_warning(self):
try:
filename = tempfile.mkdtemp()
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a directory" in str(warning.message))
finally:
os.rmdir(filename)
# The directory no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual([], w)
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup("http://www.crummyunicode.com/")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_bytes_and_space(self):
# Here the markup contains something besides a URL, so no warning
# is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
@@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them
# to named entities.
('&models;', '\u22a7'),
('&Nfr;', '\U0001d511'),
('&ngeqq;', '\u2267\u0338'),
('&not;', '\xac'),
('&Not;', '\u2aec'),
# We _could_ convert | to &verbarr;, but we don't, because
# | is an ASCII character.
('|', '|'),
# Similarly for the fj ligature, which we could convert to
# &fjlig;, but we don't.
("fj", "fj"),
# We do convert _these_ ASCII characters to HTML entities,
# because that's required to generate valid HTML.
('&gt;', '>'),
('&lt;', '<'),
('&amp;', '&'),
):
template = '3 %s 4'
raw = template % u
with_entities = template % entity
self.assertEqual(self.sub.substitute_html(raw), with_entities)
def test_html5_entity_with_variation_selector(self):
# Some HTML5 entities correspond either to a single-character
# Unicode sequence _or_ to the same character plus U+FE00,
# VARIATION SELECTOR 1. We can handle this.
data = "fjords \u2294 penguins"
markup = "fjords &sqcup; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
data = "fjords \u2294\ufe00 penguins"
markup = "fjords &sqcups; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
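The behavior under test, sketched as ordinary API use (not part of the diff):

    from bs4.dammit import EntitySubstitution

    # Multi-character sequences map to a single named entity where one exists.
    print(EntitySubstitution.substitute_html("fjords \u2294 penguins"))
    # fjords &sqcup; penguins
    print(EntitySubstitution.substitute_html("fjords \u2294\ufe00 penguins"))
    # fjords &sqcups; penguins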
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
@@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
        # Unicode, Dammit would decide the whole document is Windows-1252
        # and decode it into mojibake ("â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ").
        # But if we run it through UnicodeDammit.detwingle, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
def test_find_declared_encoding(self):
# Test our ability to find a declared encoding inside an
# XML or HTML document.
#
# Even if the document comes in as Unicode, it may be
# interesting to know what encoding was claimed
# originally.
html_unicode = '<html><head><meta charset="utf-8"></head></html>'
html_bytes = html_unicode.encode("ascii")
        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
xml_bytes = xml_unicode.encode("ascii")
m = EncodingDetector.find_declared_encoding
self.assertEqual(None, m(html_unicode, is_html=False))
self.assertEqual("utf-8", m(html_unicode, is_html=True))
self.assertEqual("utf-8", m(html_bytes, is_html=True))
self.assertEqual("iso-8859-1", m(xml_unicode))
self.assertEqual("iso-8859-1", m(xml_bytes))
# Normally, only the first few kilobytes of a document are checked for
# an encoding.
spacer = b' ' * 5000
self.assertEqual(None, m(spacer + html_bytes))
self.assertEqual(None, m(spacer + xml_bytes))
# But you can tell find_declared_encoding to search an entire
# HTML document.
self.assertEqual(
"utf-8",
m(spacer + html_bytes, is_html=True, search_entire_document=True)
)
# The XML encoding declaration has to be the very first thing
# in the document. We'll allow whitespace before the document
# starts, but nothing else.
self.assertEqual(
"iso-8859-1",
m(xml_bytes, search_entire_document=True)
)
self.assertEqual(
None, m(b'a' + xml_bytes, search_entire_document=True)
)
class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns", "")
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        self.assertEqual(a, "tag")

        a = NamespacedAttribute("", "tag")
        self.assertEqual(a, "tag")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)


@@ -27,13 +27,17 @@ from bs4.element import (
    Doctype,
    Formatter,
    NavigableString,
    Script,
    SoupStrainer,
    Stylesheet,
    Tag,
    TemplateString,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)
from soupsieve import SelectorSyntaxError

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
        soup.a.extend(l)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

def test_extend_with_another_tags_contents(self):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
d2.extend(d1)
self.assertEqual('<div id="d1"></div>', d1.decode())
self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)

@@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

def test_replace_with_errors(self):
# Can't replace a tag that's not part of a tree.
a_tag = Tag(name="a")
self.assertRaises(ValueError, a_tag.replace_with, "won't work")
# Can't replace a tag with its parent.
a_tag = self.soup("<a><b></b></a>").a
self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
# Or with a list that includes its parent.
self.assertRaises(ValueError, a_tag.b.replace_with,
"string1", a_tag, "string2")
def test_replace_with_multiple(self):
data = "<a><b></b><c></c></a>"
soup = self.soup(data)
d_tag = soup.new_tag("d")
d_tag.string = "Text In D Tag"
e_tag = soup.new_tag("e")
f_tag = soup.new_tag("f")
a_string = "Random Text"
soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
self.assertEqual(
"<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
soup.decode()
)
        assert soup.b.next_element == d_tag
        assert d_tag.string.next_element == e_tag
        assert e_tag.next_element.string == a_string
        assert e_tag.next_element.next_element == f_tag
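As a standalone sketch of the multi-argument form tested above (`replace_with()` accepts several replacements as of this release):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<a><b></b><c></c></a>", "html.parser")
d_tag = soup.new_tag("d")
soup.c.replace_with(d_tag, "trailing text")   # <c> leaves the tree entirely
print(soup)   # <a><b></b><d></d>trailing text</a>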
    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)

@@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

def test_decompose(self):
# Test PageElement.decompose() and PageElement.decomposed
soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
p1, p2 = soup.find_all('p')
a = p1.a
text = p1.em.string
for i in [p1, p2, a, text]:
self.assertEqual(False, i.decomposed)
# This sets p1 and everything beneath it to decomposed.
p1.decompose()
for i in [p1, a, text]:
self.assertEqual(True, i.decomposed)
# p2 is unaffected.
self.assertEqual(False, p2.decomposed)
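A condensed sketch of the `decomposed` property checked above:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p><a>gone</a></p><p>kept</p>", "html.parser")
p1, p2 = soup.find_all("p")
p1.decompose()         # destroys p1 and everything beneath it
print(p1.decomposed)   # True
print(p2.decomposed)   # False -- siblings are unaffected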
    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")

@@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

@@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(soup.get_text(), "foobar")

    def test_all_strings_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
def test_string_methods_inside_special_string_container_tags(self):
# Strings inside tags like <script> are generally ignored by
# methods like get_text, because they're not what humans
# consider 'text'. But if you call get_text on the <script>
# tag itself, those strings _are_ considered to be 'text',
# because there's nothing else you might be looking for.
style = self.soup("<div>a<style>Some CSS</style></div>")
template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
self.assertEqual(style.div.get_text(), "a")
self.assertEqual(list(style.div.strings), ["a"])
self.assertEqual(style.div.style.get_text(), "Some CSS")
self.assertEqual(list(style.div.style.strings),
['Some CSS'])
# The comment is not picked up here. That's because it was
# parsed into a Comment object, which is not considered
# interesting by template.strings.
self.assertEqual(template.div.get_text(), "a")
self.assertEqual(list(template.div.strings), ["a"])
self.assertEqual(template.div.template.get_text(), "Templated text.")
self.assertEqual(list(template.div.template.strings),
["Templated ", "text", "."])
# The comment is included here, because it didn't get parsed
# into a Comment object--it's part of the Script string.
self.assertEqual(script.div.get_text(), "a")
self.assertEqual(list(script.div.strings), ["a"])
self.assertEqual(script.div.script.get_text(),
"<!--a comment-->Some text")
self.assertEqual(list(script.div.script.strings),
['<!--a comment-->Some text'])
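The rule the comments above describe, as a two-line sketch: container text is invisible to an ancestor's `get_text()`, but visible when you ask the container itself.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>a<style>b {color: red}</style></div>", "html.parser")
print(soup.div.get_text())         # 'a'
print(soup.div.style.get_text())   # 'b {color: red}'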
class TestCDAtaListAttributes(SoupTest):
    """Testing cdata-list attributes like 'class'.

@@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
        else:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))

class TestFormatter(SoupTest):
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
self.assertEqual(formatter.called_with, soup.p)
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
# None of the current builders turn CDATA sections into CData
# objects, but you can create them manually.
soup = self.soup("")
cdata = CData("foo")
soup.insert(1, cdata)
self.assertEqual(str(soup), "<![CDATA[foo]]>")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
def test_cdata_is_never_formatted(self):
"""Text inside a CData object is passed into the formatter.
But the return value is ignored.
"""
self.count = 0
def increment(*args):
self.count += 1
return "BITTER FAILURE"
soup = self.soup("")
cdata = CData("<><><>")
soup.insert(1, cdata)
self.assertEqual(
b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
self.assertEqual(1, self.count)
def test_doctype_ends_in_newline(self):
# Unlike other NavigableString subclasses, a DOCTYPE always ends
# in a newline.
doctype = Doctype("foo")
soup = self.soup("")
soup.insert(1, doctype)
self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
def test_declaration(self):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
class TestSoupSelector(TreeTest):

    HTML = """

@@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
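These assertions track an upstream change: soupsieve now raises its own `SelectorSyntaxError` (a `SyntaxError` subclass, so older except clauses still match). A sketch of catching it:

from bs4 import BeautifulSoup
from soupsieve import SelectorSyntaxError

soup = BeautifulSoup("<p></p>", "html.parser")
try:
    soup.select("h1 >")   # dangling combinator
except SelectorSyntaxError as exc:
    print("invalid selector:", exc)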
@@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
            SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph

@@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

@@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
        self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])