Update beautifulsoup4-4.10.0

2025-08-21 05:43:22 -07:00 · 2021-10-14 20:46:06 -07:00 · 2021-10-14 20:46:06 -07:00 · ab8fa4d5b3
commit ab8fa4d5b3
parent b581460b51
16 changed files with 4599 additions and 743 deletions
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -1,6 +1,5 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
+"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
+
 http://www.crummy.com/software/BeautifulSoup/

 Beautiful Soup uses a pluggable XML or HTML parser to parse a
@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.

-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
+documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.1"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.10.0"
+__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

+
+from collections import Counter
 import os
 import re
 import sys
 import traceback
 import warnings

+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
 from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
@ -42,28 +46,49 @@ from .element import (
    NavigableString,
    PageElement,
    ProcessingInstruction,
+    PYTHON_SPECIFIC_ENCODINGS,
    ResultSet,
+    Script,
+    Stylesheet,
    SoupStrainer,
    Tag,
+    TemplateString,
    )

-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+    """The warning issued when BeautifulSoup has to guess what parser to
+    use -- probably because no parser was specified in the constructor.
+    """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+    """The warning issued when BeautifulSoup is given 'markup' that
+    actually looks like a resource locator -- a URL or a path to a file
+    on disk.
+    """
+

 class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
+    """A data structure representing a parsed HTML or XML document.

-    These methods will be called by the parser:
-      reset()
-      feed(markup)
+    Most of the methods you'll call on a BeautifulSoup object are inherited from
+    PageElement or Tag.
+
+    Internally, this class defines the basic interface called by the
+    tree builders when converting an HTML/XML document into a data
+    structure. The interface abstracts away the differences between
+    parsers. To write a new tree builder, you'll need to understand
+    these methods as a whole.
+
+    These methods will be called by the BeautifulSoup constructor:
+      * reset()
+      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass) # Ends the current data node
+      * handle_starttag(name, attrs) # See note about return value
+      * handle_endtag(name)
+      * handle_data(data) # Appends to the current data node
+      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
+
+    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+    # a Tag with a .name. This name makes it clear the BeautifulSoup
+    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-   
+
+    # A string containing all ASCII whitespace characters, used in
+    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+    
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
-        markup to be parsed.
+         markup to be parsed.

-        :param features: Desirable features of the parser to be used. This
-        may be the name of a specific parser ("lxml", "lxml-xml",
-        "html.parser", or "html5lib") or it may be the type of markup
-        to be used ("html", "html5", "xml"). It's recommended that you
-        name a specific parser, so that Beautiful Soup gives you the
-        same results across platforms and virtual environments.
+        :param features: Desirable features of the parser to be
+         used. This may be the name of a specific parser ("lxml",
+         "lxml-xml", "html.parser", or "html5lib") or it may be the
+         type of markup to be used ("html", "html5", "xml"). It's
+         recommended that you name a specific parser, so that
+         Beautiful Soup gives you the same results across platforms
+         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
-        instance to use) instead of looking one up based on
-        `features`. You only need to use this if you've implemented a
-        custom TreeBuilder.
+         instance to use) instead of looking one up based on
+         `features`. You only need to use this if you've implemented a
+         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
-        matching the SoupStrainer will be considered. This is useful
-        when parsing part of a document that would otherwise be too
-        large to fit into memory.
+         matching the SoupStrainer will be considered. This is useful
+         when parsing part of a document that would otherwise be too
+         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
-        document to be parsed. Pass this in if Beautiful Soup is
-        guessing wrongly about the document's encoding.
+         document to be parsed. Pass this in if Beautiful Soup is
+         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
-        encodings known to be wrong. Pass this in if you don't know
-        the document's encoding but you know Beautiful Soup's guess is
-        wrong.
+         encodings known to be wrong. Pass this in if you don't know
+         the document's encoding but you know Beautiful Soup's guess is
+         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
-        classes like Tag and NavigableString to other classes you'd
-        like to be instantiated instead as the parse tree is
-        built. This is useful for using subclasses to modify the
-        default behavior of Tag or NavigableString.
+         classes like Tag and NavigableString, to other classes you'd
+         like to be instantiated instead as the parse tree is
+         built. This is useful for subclassing Tag or NavigableString
+         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
-        constructor accepts certain keyword arguments used in
-        Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4; they will result in a warning and then be ignored.
-
-        Apart from this, any keyword arguments passed into the BeautifulSoup
-        constructor are propagated to the TreeBuilder constructor. This
-        makes it possible to configure a TreeBuilder beyond saying
-        which one to use.
-
+         constructor accepts certain keyword arguments used in
+         Beautiful Soup 3. None of these arguments do anything in
+         Beautiful Soup 4; they will result in a warning and then be
+         ignored.
+         
+         Apart from this, any keyword arguments passed into the
+         BeautifulSoup constructor are propagated to the TreeBuilder
+         constructor. This makes it possible to configure a
+         TreeBuilder by passing in arguments, not just by saying which
+         one to use.
        """
-
        if 'convertEntities' in kwargs:
            del kwargs['convertEntities']
            warnings.warn(
@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
-            ):
+            ) and markup:
+                # The user did not tell us which TreeBuilder to use,
+                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
-                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+                    warnings.warn(
+                        self.NO_PARSER_SPECIFIED_WARNING % values,
+                        GuessedAtParserWarning, stacklevel=2
+                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
            else:
                possible_filename = markup
            is_file = False
+            is_directory = False
            try:
                is_file = os.path.exists(possible_filename)
+                if is_file:
+                    is_directory = os.path.isdir(possible_filename)
            except Exception as e:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
-            if is_file:
-                if isinstance(markup, str):
-                    markup = markup.encode("utf8")
+            if is_directory:
+                warnings.warn(
+                    '"%s" looks like a directory name, not markup. You may'
+                    ' want to open a file found in this directory and pass'
+                    ' the filehandle into Beautiful Soup.' % (
+                        self._decode_markup(markup)
+                    ),
+                    MarkupResemblesLocatorWarning
+                )
+            elif is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
-                    ' Beautiful Soup.' % markup)
+                    ' Beautiful Soup.' % self._decode_markup(markup),
+                    MarkupResemblesLocatorWarning
+                )
            self._check_markup_is_url(markup)

        rejections = []
@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
        self.builder.soup = None

    def __copy__(self):
+        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )
@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
            d['builder'] = None
        return d

-    @staticmethod
-    def _check_markup_is_url(markup):
-        """ 
-        Check if markup looks like it's actually a url and raise a warning 
-        if so. Markup can be unicode or str (py2) / bytes (py3).
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
+        """Error-handling method to raise a warning if incoming markup looks
+        like a URL.
+
+        :param markup: A string.
        """
        if isinstance(markup, bytes):
            space = b' '
@ -364,18 +428,20 @@ class BeautifulSoup(Tag):

        if any(markup.startswith(prefix) for prefix in cant_start_with):
            if not space in markup:
-                if isinstance(markup, bytes):
-                    decoded_markup = markup.decode('utf-8', 'replace')
-                else:
-                    decoded_markup = markup
                warnings.warn(
                    '"%s" looks like a URL. Beautiful Soup is not an'
                    ' HTTP client. You should probably use an HTTP client like'
                    ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % decoded_markup
+                    ' that document to Beautiful Soup.' % cls._decode_markup(
+                        markup
+                    ),
+                    MarkupResemblesLocatorWarning
                )

    def _feed(self):
+        """Internal method that parses previously set markup, creating a large
+        number of Tag and NavigableString objects.
+        """
        # Convert the document to Unicode.
        self.builder.reset()

@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
            self.popTag()

    def reset(self):
+        """Reset this object to a state as though it had never parsed any
+        markup.
+        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
+        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
-        """Create a new tag associated with this soup."""
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param prefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+
+        """
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

-    def new_string(self, s, subclass=None):
-        """Create a new NavigableString associated with this soup."""
-        subclass = subclass or self.element_classes.get(
-            NavigableString, NavigableString
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+        
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
        )
-        return subclass(s)

-    def insert_before(self, successor):
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack and container is NavigableString:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+        
+    def new_string(self, s, subclass=None):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

-    def insert_after(self, successor):
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
+        """Internal method called by _popToTag when a tag is closed."""
        tag = self.tagStack.pop()
+        if tag.name in self.open_tag_counter:
+            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
-        #print "Pop", tag.name
+        if self.string_container_stack and tag == self.string_container_stack[-1]:
+            self.string_container_stack.pop()
+        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
-        #print "Push", tag.name
+        """Internal method called by handle_starttag when a tag is opened."""
+        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
+        if tag.name != self.ROOT_TAG_NAME:
+            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
+        if tag.name in self.builder.string_containers:
+            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
-
-        # Default container is NavigableString.
-        containerClass = containerClass or NavigableString
-
-        # The user may want us to instantiate some alias for the
-        # container class.
-        containerClass = self.element_classes.get(
-            containerClass, containerClass
-        )
-        
+        """Method called by the TreeBuilder when the end of a data segment
+        occurs.
+        """       
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
                    not self.parse_only.search(current_data)):
                return

+            containerClass = self.string_container(containerClass)
            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
-        """Add an object to the parse tree."""
+        """Method called by the TreeBuilder to integrate an object into the parse tree."""
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
@ -545,10 +656,19 @@ class BeautifulSoup(Tag):

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: It this is false, pops the tag stack up
+          to but *not* including the most recent instqance of the
+          given tag.
+
+        """
+        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return
@ -557,6 +677,8 @@ class BeautifulSoup(Tag):

        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
@ -568,15 +690,22 @@ class BeautifulSoup(Tag):

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
-        """Push a start tag on to the stack.
+        """Called by the tree builder when a new tag is encountered.

-        If this method returns None, the tag was rejected by the
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
-
-        # print "Start tag %s: %s" % (name, attrs)
+        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
        return tag

    def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        """
+        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
+        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)
-
+       
    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
+        """Returns a string or Unicode representation of the parse tree
+            as an HTML or XML document.

+        :param pretty_print: If this is True, indentation will be used to
+            make the document more readable.
+        :param eventual_encoding: The encoding of the final document.
+            If this is None, the document will be a Unicode string.
+        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
+            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+                # This is a special Python encoding; it can't actually
+                # go into an XML document because it means nothing
+                # outside of Python.
+                eventual_encoding = None
            if eventual_encoding != None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
 _s = BeautifulSoup
 _soup = BeautifulSoup

@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):


 class StopParsing(Exception):
+    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
    pass

 class FeatureNotFound(ValueError):
+    """Exception raised by the BeautifulSoup constructor if no parser with the
+    requested features is found.
+    """
    pass


-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
 if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -7,8 +7,11 @@ import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
    nonwhitespace_re
-    )
+)

 __all__ = [
    'HTMLTreeBuilder',
@ -27,18 +30,33 @@ HTML_5 = 'html5'


 class TreeBuilderRegistry(object):
-
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+    
    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of Treebuilder. its .features
+           attribute should list its features.
+        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
        if len(self.builders) == 0:
            # There are no builders at all.
            return None
@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()

 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
@ -96,7 +114,12 @@ class TreeBuilder(object):
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

+    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
    
    USE_DEFAULT = object()

@ -105,30 +128,39 @@ class TreeBuilder(object):
    
    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
-                 store_line_numbers=USE_DEFAULT):
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
-        TreeBuilder will not turn any values for attributes like
-        'class' into lists. Setting this do a dictionary will
-        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
-        for an example.
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this to a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.

-        Internally, these are called "CDATA list attributes", but that
-        probably doesn't make sense to an end-user, so the argument name
-        is `multi_valued_attributes`.
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
-        the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+         the way <pre> tags are treated in HTML. Tags in this list
+         are immune from pretty-printing; their contents will always be
+         output as-is.
+
+        :param string_containers: A dictionary mapping tag names to
+        the classes that should be instantiated to contain the textual
+        contents of those tags. The default is to use NavigableString
+        for every tag, no matter what the name. You can override the
+        default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
-        line numbers and positions of the original markup, that
-        information will, by default, be stored in each corresponding
-        `Tag` object. You can turn this off by passing
-        store_line_numbers=False. If the parser you're using doesn't 
-        keep track of this information, then setting store_line_numbers=True
-        will do nothing.
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't 
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
        """
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
@ -139,15 +171,25 @@ class TreeBuilder(object):
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers == self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers
+        self.store_line_numbers = store_line_numbers 
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
        
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
        """
        self.soup = soup
        
    def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
        pass

    def can_be_empty_element(self, tag_name):
@ -159,23 +201,57 @@ class TreeBuilder(object):
        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.
+
+        :param tag_name: The name of a markup tag.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
    
    def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
@ -188,16 +264,36 @@ class TreeBuilder(object):
        results against other HTML fragments.

        This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on 
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.

-        Modifies its input in place.
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
+
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
        """
        if not attrs:
            return attrs
@ -225,7 +321,11 @@ class TreeBuilder(object):
        return attrs

 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """

    def feed(self, markup):
        raise NotImplementedError()
@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):

    def startElement(self, name, attrs):
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
-        #print "Start %s, %r" % (name, attrs)
+        #print("Start %s, %r" % (name, attrs))
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
-        #print "End %s" % name
+        #print("End %s" % name)
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }    
    
    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
    
    def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False
@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):

 def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
-    # I'm fairly sure this is not the best way to do this.
-    this_module = sys.modules['bs4.builder']
+    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

@ -363,6 +488,9 @@ def register_treebuilders_from(module):
            this_module.builder_registry.register(obj)

 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.
@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -39,7 +39,18 @@ except ImportError as e:
    new_html5lib = True

 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """

    NAME = "html5lib"

@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
        return AttrList(self.element)

    def setAttributes(self, attributes):
-
        if attributes is not None and len(attributes) > 0:
-
            converted_attributes = []
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
-        # print "MOVE", self.element.contents
-        # print "FROM", self.element
-        # print "TO", new_parent.element
+        # print("MOVE", self.element.contents)
+        # print("FROM", self.element)
+        # print("TO", new_parent.element)

        element = self.element
        new_parent_element = new_parent.element
@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
        element.contents = []
        element.next_element = final_next_element

-        # print "DONE WITH MOVE"
-        # print "FROM", self.element
-        # print "TO", new_parent_element
+        # print("DONE WITH MOVE")
+        # print("FROM", self.element)
+        # print("TO", new_parent_element)

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -8,7 +8,7 @@ __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from future.moves.html.parser import HTMLParser
+from html.parser import HTMLParser

 try:
    from html.parser import HTMLParseError
@ -53,8 +53,30 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'

 class BeautifulSoupHTMLParser(HTMLParser):
+    """A subclass of the Python standard library's HTMLParser class, which
+    listens for HTMLParser events and translates them into calls
+    to Beautiful Soup's tree construction API.
+    """

+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
+    
    def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.           
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.already_closed_empty_element = []

    def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although this
-        requirement doesn't appear to be documented.
+        """In Python 3, HTMLParser subclasses must implement error(), although
+        this requirement doesn't appear to be documented.

-        In Python 2, HTMLParser implements error() as raising an exception.
+        In Python 2, HTMLParser implements error() by raising an exception,
+        which we don't want to do.

-        In any event, this method is called only on very strange markup and our best strategy
-        is to pretend it didn't happen and keep going.
+        In any event, this method is called only on very strange
+        markup and our best strategy is to pretend it didn't happen
+        and keep going.
        """
        warnings.warn(msg)
        
    def handle_startendtag(self, name, attrs):
-        # This is only called when the markup looks like
-        # <tag/>.
+        """Handle an incoming empty-element tag.

+        This is only called when the markup looks like <tag/>.
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        """
        # is_startend() tells handle_starttag not to close the tag
        # just because its name matches a known empty-element tag. We
        # know that this is an empty-element tag and we want to call
@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.handle_endtag(name)
        
    def handle_starttag(self, name, attrs, handle_empty_element=True):
+        """Handle an opening tag, e.g. '<tag>'
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        :param handle_empty_element: True if this tag is known to be
+            an empty-element tag (i.e. there is not expected to be any
+            closing tag).
+        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
            # for consistency with the other tree builders.
            if value is None:
                value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
            attrvalue = '""'
-        #print "START", name
+        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
            self.already_closed_empty_element.append(name)
            
    def handle_endtag(self, name, check_already_closed=True):
-        #print "END", name
+        """Handle a closing tag, e.g. '</tag>'
+        
+        :param name: A tag name.
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
+        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
-            # print "ALREADY CLOSED", name
+            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.handle_data(data)

    def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character(s) and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.handle_data(data)

    def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
        self.soup.endData()
-        if data.startswith("DOCTYPE "):
-            data = data[len("DOCTYPE "):]
-        elif data == 'DOCTYPE':
-            # i.e. "<!DOCTYPE>"
-            data = ''
+        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.soup.endData(cls)

    def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


 class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
+    found in the Python standard library.
+    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
@ -221,36 +303,88 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True
-    
+
    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        """Constructor.
+
+        :param parser_args: Positional arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param parser_kwargs: Keyword arguments to pass into 
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param kwargs: Keyword arguments for the superclass constructor.
+        """
+        # Some keyword arguments will be pulled out of kwargs and placed
+        # into parser_kwargs.
+        extra_parser_kwargs = dict()
+        for arg in ('on_duplicate_attribute',):
+            if arg in kwargs:
+                value = kwargs.pop(arg)
+                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
+        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)
-
+        
    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
-        """
-        :return: A 4-tuple (markup, original encoding, encoding
-        declared within markup, whether any characters had to be
-        replaced with REPLACEMENT CHARACTER).
+
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
        """
        if isinstance(markup, str):
+            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

+        # Ask UnicodeDammit to sniff the most likely encoding.
+
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec.  (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
+
        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
-                               exclude_encodings=exclude_encodings)
+        dammit = UnicodeDammit(
+            markup,
+            known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings,
+            is_html=True,
+            exclude_encodings=exclude_encodings
+        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
+    # See: https://bugs.launchpad.net/lxml/+bug/1846906
    
    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.
+
+        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)
@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        while parsing the document.

        This might be useful later on when creating CSS selectors.
+
+        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            if key and key not in self.soup._namespaces:
@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
-        # This can either return a parser object or a class, which
-        # will be instantiated with default arguments.
+        """Find the default parser for the given encoding.
+
+        :param encoding: A string.
+        :return: Either a parser object or a class, which
+          will be instantiated with default arguments.
+        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
+        """Instantiate an appropriate parser for the given encoding.
+
+        :param encoding: A string.
+        :return: A parser object such as an `etree.XMLParser`.
+        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
-            parser = parser(target=self, strip_cdata=False, encoding=encoding)
+            parser = parser(
+                target=self, strip_cdata=False, recover=True, encoding=encoding
+            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
-        """
-        :yield: A series of 4-tuples.
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        lxml really wants to get a bytestring and convert it to
+        Unicode itself. So instead of using UnicodeDammit to convert
+        the bytestring to Unicode using different encodings, this
+        implementation uses EncodingDetector to iterate over the
+        encodings, and tell lxml to try to parse the document as each
+        one in turn.
+
+        :param markup: Some markup -- hopefully a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

-        Each 4-tuple represents a strategy for parsing the document.
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried 
+         in turn.
        """
-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

-        try_encodings = [user_specified_encoding, document_declared_encoding]
+        # This was provided by the end-user; treat it as a known
+        # definite encoding per the algorithm laid out in the HTML5
+        # spec.  (See the EncodingDetector class for details.)
+        known_definite_encodings = [user_specified_encoding]
+
+        # This was found in the document; treat it as a slightly lower-priority
+        # user encoding.
+        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
-            markup, try_encodings, is_html, exclude_encodings)
+            markup, known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings, is_html=is_html,
+            exclude_encodings=exclude_encodings
+        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

--- a/lib/bs4/check_block.py
+++ b/lib/bs4/check_block.py
@ -1,4 +0,0 @@
-import requests
-data = requests.get("https://www.crummy.com/").content
-from bs4 import _s
-data = [x for x in _s(data).block_text()]
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@ -20,9 +20,13 @@ import sys
 import cProfile

 def diagnose(data):
-    """Diagnostic suite for isolating common problems."""
-    print("Diagnostic running on Beautiful Soup %s" % __version__)
-    print("Python version %s" % sys.version)
+    """Diagnostic suite for isolating common problems.
+
+    :param data: A string containing markup that needs to be explained.
+    :return: None; diagnostics are printed to standard output.
+    """
+    print(("Diagnostic running on Beautiful Soup %s" % __version__))
+    print(("Python version %s" % sys.version))

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
@ -39,65 +43,76 @@ def diagnose(data):
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
-            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+            print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
        except ImportError as e:
-            print (
+            print(
                "lxml is not installed or couldn't be imported.")


    if 'html5lib' in basic_parsers:
        try:
            import html5lib
-            print("Found html5lib version %s" % html5lib.__version__)
+            print(("Found html5lib version %s" % html5lib.__version__))
        except ImportError as e:
-            print (
+            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif data.startswith("http:") or data.startswith("https:"):
-        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    else:
        try:
            if os.path.exists(data):
-                print('"%s" looks like a filename. Reading data from the file.' % data)
+                print(('"%s" looks like a filename. Reading data from the file.' % data))
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
-        print()
+        print("")

    for parser in basic_parsers:
-        print("Trying to parse your markup with %s" % parser)
+        print(("Trying to parse your markup with %s" % parser))
        success = False
        try:
            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
-            print("Here's what %s did with the markup:" % parser)
-            print(soup.prettify())
+            print(("Here's what %s did with the markup:" % parser))
+            print((soup.prettify()))

-        print("-" * 80)
+        print(("-" * 80))

 def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
-    Soup code is running.
+    Soup code is running. You can use this to determine whether
+    an lxml-specific problem is in Beautiful Soup's lxml tree builders
+    or in lxml itself.
+
+    :param data: Some markup.
+    :param html: If True, markup will be parsed with lxml's HTML parser.
+       if False, lxml's XML parser will be used.
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))

 class AnnouncingParser(HTMLParser):
-    """Announces HTMLParser parse events, without doing anything else."""
+    """Subclass of HTMLParser that announces parse events, without doing
+    anything else.
+
+    You can use this to get a picture of how html.parser sees a given
+    document. The easiest way to do this is to call `htmlparser_trace`.
+    """

    def _p(self, s):
        print(s)
@ -134,6 +149,8 @@ def htmlparser_trace(data):

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
+
+    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data)
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):

 def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
-    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
    data = rdoc(num_elements)
-    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
            b = time.time()
            success = True
        except Exception as e:
-            print("%s could not parse the markup." % parser)
+            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        if success:
-            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+            print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
-    print("Raw lxml parsed the markup in %.2fs." % (b-a))
+    print(("Raw lxml parsed the markup in %.2fs." % (b-a)))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
-    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+    print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))

 def profile(num_elements=100000, parser="lxml"):
-
+    """Use Python's profiler on a randomly generated document."""
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name

@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)

+# If this file is run as a script, standard input is diagnosed.
 if __name__ == '__main__':
    diagnose(sys.stdin.read())
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.
+
+    Formatters are passed in as the `formatter` argument to methods
+    like `PageElement.encode`. Most people won't need to think about
+    formatters, and most people who need to think about them can pass
+    in one of these predefined strings as `formatter` rather than
+    making a new Formatter object:
+
+    For HTML documents:
+     * 'html' - HTML entity substitution for generic HTML documents. (default)
+     * 'html5' - HTML entity substitution for HTML5 documents, as
+                 well as some optimizations in the way tags are rendered.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid HTML.
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
+
+    For XML documents:
+     * 'html' - Entity substitution for XHTML documents.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid XML. (default)
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
+            empty_attributes_are_booleans=False,
    ):
-        """
+        """Constructor.

-        :param void_element_close_prefix: By default, represent void
-        elements as <tag/> rather than <tag>
+        :param language: This should be Formatter.XML if you are formatting
+           XML markup and Formatter.HTML if you are formatting HTML markup.
+
+        :param entity_substitution: A function to call to replace special
+           characters with XML/HTML entities. For examples, see 
+           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
+        :param void_element_close_prefix: By default, void elements
+           are represented as <tag/> (XML rules) rather than <tag>
+           (HTML rules). To get <tag>, pass in the empty string.
+        :param cdata_containing_tags: The list of tags that are defined
+           as containing CDATA in this dialect. For example, in HTML,
+           <script> and <style> tags are defined as containing CDATA,
+           and their contents should not be formatted.
+        :param blank_attributes_are_booleans: Render attributes whose value
+            is the empty string as HTML-style boolean attributes.
+            (Attributes whose value is None are always rendered this way.)
        """
        self.language = language
        self.entity_substitution = entity_substitution
@ -39,9 +76,17 @@ class Formatter(EntitySubstitution):
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
-            
+        self.empty_attributes_are_booleans=empty_attributes_are_booleans
+        
    def substitute(self, ns):
-        """Process a string that needs to undergo entity substitution."""
+        """Process a string that needs to undergo entity substitution.
+        This may be a string encountered in an attribute value or as
+        text.
+
+        :param ns: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
        if not self.entity_substitution:
            return ns
        from .element import NavigableString
@ -54,21 +99,41 @@ class Formatter(EntitySubstitution):
        return self.entity_substitution(ns)

    def attribute_value(self, value):
-        """Process the value of an attribute."""
+        """Process the value of an attribute.
+
+        :param ns: A string.
+        :return: A string with certain characters replaced by named
+           or numeric entities.
+        """
        return self.substitute(value)
    
    def attributes(self, tag):
-        """Reorder a tag's attributes however you want."""
-        return sorted(tag.attrs.items())
+        """Reorder a tag's attributes however you want.
+        
+        By default, attributes are sorted alphabetically. This makes
+        behavior consistent between Python 2 and Python 3, and preserves
+        backwards compatibility with older versions of Beautiful Soup.

+        If `empty_boolean_attributes` is True, then attributes whose
+        values are set to the empty string will be treated as boolean
+        attributes.
+        """
+        if tag.attrs is None:
+            return []
+        return sorted(
+            (k, (None if self.empty_attributes_are_booleans and v == '' else v))
+            for k, v in list(tag.attrs.items())
+        )
   
 class HTMLFormatter(Formatter):
+    """A generic Formatter for HTML."""
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)

    
 class XMLFormatter(Formatter):
+    """A generic Formatter for XML."""
    REGISTRY = {}
    def __init__(self, *args, **kwargs):
        return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
@ -80,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
 )
 HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
-    void_element_close_prefix = None
+    void_element_close_prefix=None,
+    empty_attributes_are_booleans=True,
 )
 HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
--- a/lib/bs4/testing.py
+++ b/lib/bs4/testing.py
@ -8,6 +8,7 @@ import pickle
 import copy
 import functools
 import unittest
+import warnings
 from unittest import TestCase
 from bs4 import BeautifulSoup
 from bs4.element import (
@ -15,7 +16,10 @@ from bs4.element import (
    Comment,
    ContentMetaAttributeValue,
    Doctype,
+    PYTHON_SPECIFIC_ENCODINGS,
    SoupStrainer,
+    Script,
+    Stylesheet,
    Tag
 )

@ -83,8 +87,22 @@ class SoupTest(unittest.TestCase):
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

+        # Verify that the documents come out the same.
        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

+        # Also run some checks on the BeautifulSoup object itself:
+
+        # Verify that every tag that was opened was eventually closed.
+
+        # There are no tags in the open tag counter.
+        assert all(v==0 for v in list(obj.open_tag_counter.values()))
+
+        # The only tag in the tag stack is the one for the root
+        # document.
+        self.assertEqual(
+            [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
+        )
+        
    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
@ -211,7 +229,41 @@ class SoupTest(unittest.TestCase):
            return child


-class HTMLTreeBuilderSmokeTest(object):
+class TreeBuilderSmokeTest(object):
+    # Tests that are common to HTML and XML tree builders.
+
+    def test_fuzzed_input(self):
+        # This test centralizes in one place the various fuzz tests
+        # for Beautiful Soup created by the oss-fuzz project.
+        
+        # These strings superficially resemble markup, but they
+        # generally can't be parsed into anything. The best we can
+        # hope for is that parsing these strings won't crash the
+        # parser.
+        #
+        # n.b. This markup is commented out because these fuzz tests
+        # _do_ crash the parser. However the crashes are due to bugs
+        # in html.parser, not Beautiful Soup -- otherwise I'd fix the
+        # bugs!
+        
+        bad_markup = [
+            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+            # https://bugs.python.org/issue37747
+            #
+            #b'\n<![\xff\xfe\xfe\xcd\x00',
+
+            #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+            # https://bugs.python.org/issue34480
+            #
+            #b'<![n\x00'
+        ]
+        for markup in bad_markup:
+            with warnings.catch_warnings(record=False):
+                soup = self.soup(markup)
+        
+
+class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):

    """A basic test of a treebuilder's competence.

@ -233,6 +285,22 @@ class HTMLTreeBuilderSmokeTest(object):
            new_tag = soup.new_tag(name)
            self.assertEqual(True, new_tag.is_empty_element)

+    def test_special_string_containers(self):
+        soup = self.soup(
+            "<style>Some CSS</style><script>Some Javascript</script>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        assert isinstance(soup.script.string, Script)
+
+        soup = self.soup(
+            "<style><!--Some CSS--></style>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        # The contents of the style tag resemble an HTML comment, but
+        # it's not treated as a comment.
+        self.assertEqual("<!--Some CSS-->", soup.style.string)
+        assert isinstance(soup.style.string, Stylesheet)
+        
    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
-        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
+        self.assertEqual(
+            soup.encode("utf8")[:len(doctype_str)],
+            doctype_str
+        )

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

-    def _document_with_doctype(self, doctype_fragment):
+    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
        """Generate and parse a document with the given doctype."""
-        doctype = '<!DOCTYPE %s>' % doctype_fragment
+        doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
-        return doctype, soup
+        return doctype.encode("utf8"), soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

+    def test_mixed_case_doctype(self):
+        # A lowercase or mixed-case doctype becomes a Doctype.
+        for doctype_fragment in ("doctype", "DocType"):
+            doctype_str, soup = self._document_with_doctype(
+                "html", doctype_fragment
+            )
+
+            # Make sure a Doctype object was created and that the DOCTYPE
+            # is uppercase.
+            doctype = soup.contents[0]
+            self.assertEqual(doctype.__class__, Doctype)
+            self.assertEqual(doctype, "html")
+            self.assertEqual(
+                soup.encode("utf8")[:len(doctype_str)],
+                b"<!DOCTYPE html>"
+            )
+
+            # Make sure that the doctype was correctly associated with the
+            # parse tree and that the rest of the document parsed.
+            self.assertEqual(soup.p.contents[0], 'foo')
+        
    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)
@ -532,7 +624,7 @@ Hello, world!
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)
-        
+       
    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@ -594,7 +686,7 @@ Hello, world!
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])
-
+        
    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
@ -779,11 +871,44 @@ Hello, world!
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></head>'
+            b'<meta id="encoding" charset="utf-8" />'
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    'idna', 'mbcs', 'oem', 'undefined',
+                    'string_escape', 'string-escape'
+                ):
+                    # For one reason or another, these will raise an
+                    # exception if we actually try to use them, so don't
+                    # bother.
+                    continue
+                encoded = soup.encode(encoding)
+                assert b'meta charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+        
    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())

+    def test_closing_tag_with_no_opening_tag(self):
+        # Without BeautifulSoup.open_tag_counter, the </span> tag will
+        # cause _popToTag to be called over and over again as we look
+        # for a <span> tag that wasn't there. The result is that 'text2'
+        # will show up outside the body of the document.
+        soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
+        self.assertEqual(
+            "<body><div><p>text1</p>text2</div></body>", soup.body.decode()
+        )
+        
    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

@ -791,7 +916,7 @@ Hello, world!
        self.linkage_validator(soup)


-class XMLTreeBuilderSmokeTest(object):
+class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

+    def test_python_specific_encodings_not_used_in_xml_declaration(self):
+        # You can encode an XML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document.
+        markup = b"""<?xml version="1.0"?>\n<foo/>"""
+        soup = self.soup(markup)
+        for encoding in PYTHON_SPECIFIC_ENCODINGS:
+            if encoding in (
+                'idna', 'mbcs', 'oem', 'undefined',
+                'string_escape', 'string-escape'
+            ):
+                # For one reason or another, these will raise an
+                # exception if we actually try to use them, so don't
+                # bother.
+                continue
+            encoded = soup.encode(encoding)
+            assert b'<?xml version="1.0"?>' in encoded
+            assert encoding.encode("ascii") not in encoded
+
    def test_processing_instruction(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
        soup = self.soup(markup)
@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)
-
+       
    def test_nested_namespaces(self):
        doc = b"""<?xml version="1.0" encoding="utf-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
        soup = self.soup(markup, store_line_numbers=False)
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+    def test_special_string_containers(self):
+        # The html5lib tree builder doesn't support this standard feature,
+        # because there's no way of knowing, when a string is created,
+        # where in the tree it will eventually end up.
+        pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest.  It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+                ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
+                ('&models;', '\u22a7', b'&models;'),
+                ('&Nfr;', '\U0001d511', b'&Nfr;'),
+                ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
+                ('&not;', '\xac', b'&not;'),
+                ('&Not;', '\u2aec', b'&Not;'),
+                ('&quot;', '"', b'"'),
+                ('&there4;', '\u2234', b'&there4;'),
+                ('&Therefore;', '\u2234', b'&there4;'),
+                ('&therefore;', '\u2234', b'&there4;'),
+                ("&fjlig;", 'fj', b'fj'),                
+                ("&sqcup;", '\u2294', b'&sqcup;'),
+                ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
+                ("&apos;", "'", b"'"),
+                ("&verbar;", "|", b"|"),
+        ):
+            markup = '<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@ -3,6 +3,7 @@ trees."""

 from pdb import set_trace
 import pickle
+import warnings
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        self.assertEqual("url3", soup.a['href'])
+        self.assertEqual(["cls"], soup.a['class'])
+        self.assertEqual("id", soup.a['id'])
        
+        # You can also get this behavior explicitly.
+        def assert_attribute(on_duplicate_attribute, expected):
+            soup = self.soup(
+                markup, on_duplicate_attribute=on_duplicate_attribute
+            )
+            self.assertEqual(expected, soup.a['href'])
+
+            # Verify that non-duplicate attributes are treated normally.
+            self.assertEqual(["cls"], soup.a['class'])
+            self.assertEqual("id", soup.a['id'])
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+        assert_attribute(accumulate, ["url1", "url2", "url3"])            
+
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+                ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
+                ('&models;', '\u22a7', b'&models;'),
+                ('&Nfr;', '\U0001d511', b'&Nfr;'),
+                ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
+                ('&not;', '\xac', b'&not;'),
+                ('&Not;', '\u2aec', b'&Not;'),
+                ('&quot;', '"', b'"'),
+                ('&there4;', '\u2234', b'&there4;'),
+                ('&Therefore;', '\u2234', b'&there4;'),
+                ('&therefore;', '\u2234', b'&there4;'),
+                ("&fjlig;", 'fj', b'fj'),                
+                ("&sqcup;", '\u2294', b'&sqcup;'),
+                ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
+                ("&apos;", "'", b"'"),
+                ("&verbar;", "|", b"|"),
+        ):
+            markup = '<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
+
 class TestHTMLParserSubclass(SoupTest):
    def test_error(self):
        """Verify that our HTMLParser subclass implements error() in a way
        that doesn't cause a crash.
        """
        parser = BeautifulSoupHTMLParser()
-        parser.error("don't crash")
+        with warnings.catch_warnings(record=True) as warns:
+            parser.error("don't crash")
+        [warning] = warns
+        assert "don't crash" == str(warning.message)
+
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
-
+        
    def test_entities_in_foreign_document_encoding(self):
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@ -3,6 +3,7 @@

 from pdb import set_trace
 import logging
+import os
 import unittest
 import sys
 import tempfile
@ -10,6 +11,8 @@ import tempfile
 from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
+    GuessedAtParserWarning,
+    MarkupResemblesLocatorWarning,
 )
 from bs4.builder import (
    TreeBuilder,
@ -29,7 +32,6 @@ import bs4.dammit
 from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
-    EncodingDetector,
 )
 from bs4.testing import (
    default_builder,
@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
+                self.string_containers = {}
            def initialize_soup(self, soup):
                pass
            def feed(self, markup):
@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )
+
+    def test_alternate_string_containers(self):
+        # Test the ability to customize the string containers for
+        # different types of tags.
+        class PString(NavigableString):
+            pass
+
+        class BString(NavigableString):
+            pass
+
+        soup = self.soup(
+            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+            string_containers = {
+                'b': BString,
+                'p': PString,
+            }
+        )
+
+        # The string before the <p> tag is a regular NavigableString.
+        assert isinstance(soup.div.contents[0], NavigableString)
        
+        # The string inside the <p> tag, but not inside the <i> tag,
+        # is a PString.
+        assert isinstance(soup.p.contents[0], PString)
+
+        # Every string inside the <b> tag is a BString, even the one that
+        # was also inside an <i> tag.
+        for s in soup.b.strings:
+            assert isinstance(s, BString)
+
+        # Now that parsing was complete, the string_container_stack
+        # (where this information was kept) has been cleared out.
+        self.assertEqual([], soup.string_container_stack)
+
+
 class TestWarnings(SoupTest):

-    def _no_parser_specified(self, s, is_there=True):
-        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
-        self.assertTrue(v)
+    def _assert_warning(self, warnings, cls):
+        for w in warnings:
+            if isinstance(w.message, cls):
+                return w
+        raise Exception("%s warning not found in %r" % cls, warnings)
+    
+    def _assert_no_parser_specified(self, w):
+        warning = self._assert_warning(w, GuessedAtParserWarning)
+        message = str(warning.message)
+        self.assertTrue(
+            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
+        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>")
+        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>", "html")
+        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html.parser")
+            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

-class TestWarnings(SoupTest):
-
    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
-            msg = str(w[0].message)
-            self.assertTrue("looks like a filename" in msg)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a filename" in str(warning.message))
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
-        self.assertEqual(0, len(w))
+        self.assertEqual([], w)

+    def test_directory_warning(self):
+        try:
+            filename = tempfile.mkdtemp()
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a directory" in str(warning.message))
+        finally:
+            os.rmdir(filename)
+
+        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(filename)
+        self.assertEqual([], w)
+        
    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
-        # Be aware this isn't the only warning that can be raised during
-        # execution..
-        self.assertTrue(any("looks like a URL" in str(w.message) 
-            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
-        self.assertTrue(any("looks like a URL" in str(w.message) 
-            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
+        # Here the markup contains something besides a URL, so no warning
+        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message) 
@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")

+    def test_html5_entity(self):
+        # Some HTML5 entities correspond to single- or multi-character
+        # Unicode sequences.
+
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ('&models;', '\u22a7'),
+            ('&Nfr;', '\U0001d511'),
+            ('&ngeqq;', '\u2267\u0338'),
+            ('&not;', '\xac'),
+            ('&Not;', '\u2aec'),
+                
+            # We _could_ convert | to &verbarr;, but we don't, because
+            # | is an ASCII character.
+            ('|' '|'),
+
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ('&gt;', '>'),
+            ('&lt;', '<'),
+            ('&amp;', '&'),
+        ):
+            template = '3 %s 4'
+            raw = template % u
+            with_entities = template % entity
+            self.assertEqual(self.sub.substitute_html(raw), with_entities)
+            
+    def test_html5_entity_with_variation_selector(self):
+        # Some HTML5 entities correspond either to a single-character
+        # Unicode sequence _or_ to the same character plus U+FE00,
+        # VARIATION SELECTOR 1. We can handle this.
+        data = "fjords \u2294 penguins"
+        markup = "fjords &sqcup; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+
+        data = "fjords \u2294\ufe00 penguins"
+        markup = "fjords &sqcups; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+        
    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)
@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

-class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of UnicodeDammit."""

-    def test_unicode_input(self):
-        markup = "I'm already Unicode! \N{SNOWMAN}"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(dammit.unicode_markup, markup)
-
-    def test_smart_quotes_to_unicode(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        self.assertEqual(
-            dammit.unicode_markup, """<foo>''""</foo>""")
-
-    def test_detect_utf8(self):
-        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
-        dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
-
-
-    def test_convert_hebrew(self):
-        hebrew = b"\xed\xe5\xec\xf9"
-        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
-        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
-
-    def test_dont_see_smart_quotes_where_there_are_none(self):
-        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
-        dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
-
-    def test_ignore_inappropriate_codecs(self):
-        utf8_data = "Räksmörgås".encode("utf-8")
-        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_ignore_invalid_codecs(self):
-        utf8_data = "Räksmörgås".encode("utf-8")
-        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
-            dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_exclude_encodings(self):
-        # This is UTF-8.
-        utf8_data = "Räksmörgås".encode("utf-8")
-
-        # But if we exclude UTF-8 from consideration, the guess is
-        # Windows-1252.
-        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
-
-        # And if we exclude that, there is no valid guess at all.
-        dammit = UnicodeDammit(
-            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
-        self.assertEqual(dammit.original_encoding, None)
-
-    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
-        detected = EncodingDetector(
-            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
-        encodings = list(detected.encodings)
-        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
-
-    def test_detect_html5_style_meta_tag(self):
-
-        for data in (
-            b'<html><meta charset="euc-jp" /></html>',
-            b"<html><meta charset='euc-jp' /></html>",
-            b"<html><meta charset=euc-jp /></html>",
-            b"<html><meta charset=euc-jp/></html>"):
-            dammit = UnicodeDammit(data, is_html=True)
-            self.assertEqual(
-                "euc-jp", dammit.original_encoding)
-
-    def test_last_ditch_entity_replacement(self):
-        # This is a UTF-8 document that contains bytestrings
-        # completely incompatible with UTF-8 (ie. encoded with some other
-        # encoding).
-        #
-        # Since there is no consistent encoding for the document,
-        # Unicode, Dammit will eventually encode the document as UTF-8
-        # and encode the incompatible characters as REPLACEMENT
-        # CHARACTER.
-        #
-        # If chardet is installed, it will detect that the document
-        # can be converted into ISO-8859-1 without errors. This happens
-        # to be the wrong encoding, but it is a consistent encoding, so the
-        # code we're testing here won't run.
-        #
-        # So we temporarily disable chardet if it's present.
-        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
-<html><b>\330\250\330\252\330\261</b>
-<i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet_dammit
-        logging.disable(logging.WARNING)
-        try:
-            def noop(str):
-                return None
-            bs4.dammit.chardet_dammit = noop
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue("\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-        finally:
-            logging.disable(logging.NOTSET)
-            bs4.dammit.chardet_dammit = chardet
-
-    def test_byte_order_mark_removed(self):
-        # A document written in UTF-16LE will have its byte order marker stripped.
-        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
-        dammit = UnicodeDammit(data)
-        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
-        self.assertEqual("utf-16le", dammit.original_encoding)
-
-    def test_detwingle(self):
-        # Here's a UTF8 document.
-        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
-
-        # Here's a Windows-1252 document.
-        windows_1252 = (
-            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
-
-        # Through some unholy alchemy, they've been stuck together.
-        doc = utf8 + windows_1252 + utf8
-
-        # The document can't be turned into UTF-8:
-        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
-
-        # Unicode, Dammit thinks the whole document is Windows-1252,
-        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"
-
-        # But if we run it through fix_embedded_windows_1252, it's fixed:
-
-        fixed = UnicodeDammit.detwingle(doc)
-        self.assertEqual(
-            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
-
-    def test_detwingle_ignores_multibyte_characters(self):
-        # Each of these characters has a UTF-8 representation ending
-        # in \x93. \x93 is a smart quote if interpreted as
-        # Windows-1252. But our code knows to skip over multibyte
-        # UTF-8 characters, so they'll survive the process unscathed.
-        for tricky_unicode_char in (
-            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
-            ):
-            input = tricky_unicode_char.encode("utf8")
-            self.assertTrue(input.endswith(b'\x93'))
-            output = UnicodeDammit.detwingle(input)
-            self.assertEqual(output, input)
-
-    def test_find_declared_encoding(self):
-        # Test our ability to find a declared encoding inside an
-        # XML or HTML document.
-        #
-        # Even if the document comes in as Unicode, it may be
-        # interesting to know what encoding was claimed
-        # originally.
-
-        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
-        html_bytes = html_unicode.encode("ascii")
-
-        xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>'
-        xml_bytes = xml_unicode.encode("ascii")
-
-        m = EncodingDetector.find_declared_encoding
-        self.assertEqual(None, m(html_unicode, is_html=False))
-        self.assertEqual("utf-8", m(html_unicode, is_html=True))
-        self.assertEqual("utf-8", m(html_bytes, is_html=True))
-
-        self.assertEqual("iso-8859-1", m(xml_unicode))
-        self.assertEqual("iso-8859-1", m(xml_bytes))
-
-        # Normally, only the first few kilobytes of a document are checked for
-        # an encoding.
-        spacer = b' ' * 5000
-        self.assertEqual(None, m(spacer + html_bytes))
-        self.assertEqual(None, m(spacer + xml_bytes))
-
-        # But you can tell find_declared_encoding to search an entire
-        # HTML document.
-        self.assertEqual(
-            "utf-8",
-            m(spacer + html_bytes, is_html=True, search_entire_document=True)
-        )
-
-        # The XML encoding declaration has to be the very first thing
-        # in the document. We'll allow whitespace before the document
-        # starts, but nothing else.
-        self.assertEqual(
-            "iso-8859-1",
-            m(xml_bytes, search_entire_document=True)
-        )
-        self.assertEqual(
-            None, m(b'a' + xml_bytes, search_entire_document=True)
-        )
-            
 class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

+        a = NamespacedAttribute("xmlns", "")
+        self.assertEqual(a, "xmlns")
+
        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")
        
+    def test_namespace_may_be_none_or_missing(self):
+        a = NamespacedAttribute(None, "tag")
+        self.assertEqual(a, "tag")
+        
+        a = NamespacedAttribute("", "tag")
+        self.assertEqual(a, "tag")
+        
    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@ -27,13 +27,17 @@ from bs4.element import (
    Doctype,
    Formatter,
    NavigableString,
+    Script,
    SoupStrainer,
+    Stylesheet,
    Tag,
+    TemplateString,
 )
 from bs4.testing import (
    SoupTest,
    skipIf,
 )
+from soupsieve import SelectorSyntaxError

 XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
 LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
        soup.a.extend(l)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

+    def test_extend_with_another_tags_contents(self):
+        data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
+        soup = self.soup(data)
+        d1 = soup.find('div', id='d1')
+        d2 = soup.find('div', id='d2')
+        d2.extend(d1)
+        self.assertEqual('<div id="d1"></div>', d1.decode())
+        self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
+        
    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

+    def test_replace_with_errors(self):
+        # Can't replace a tag that's not part of a tree.
+        a_tag = Tag(name="a")
+        self.assertRaises(ValueError, a_tag.replace_with, "won't work")
+
+        # Can't replace a tag with its parent.
+        a_tag = self.soup("<a><b></b></a>").a
+        self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
+
+        # Or with a list that includes its parent.
+        self.assertRaises(ValueError, a_tag.b.replace_with,
+                          "string1", a_tag, "string2")
+        
+    def test_replace_with_multiple(self):
+        data = "<a><b></b><c></c></a>"
+        soup = self.soup(data)
+        d_tag = soup.new_tag("d")
+        d_tag.string = "Text In D Tag"
+        e_tag = soup.new_tag("e")
+        f_tag = soup.new_tag("f")
+        a_string = "Random Text"
+        soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
+        self.assertEqual(
+            "<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
+            soup.decode()
+        )
+        assert soup.b.next_element == d_tag
+        assert d_tag.string.next_element==e_tag
+        assert e_tag.next_element.string == a_string
+        assert e_tag.next_element.next_element == f_tag
+        
    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

+       
+    def test_decompose(self):
+        # Test PageElement.decompose() and PageElement.decomposed
+        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
+        p1, p2 = soup.find_all('p')
+        a = p1.a
+        text = p1.em.string
+        for i in [p1, p2, a, text]:
+            self.assertEqual(False, i.decomposed)
+
+        # This sets p1 and everything beneath it to decomposed.
+        p1.decompose()
+        for i in [p1, a, text]:
+            self.assertEqual(True, i.decomposed)
+        # p2 is unaffected.
+        self.assertEqual(False, p2.decomposed)
+            
    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

-    def test_get_text_ignores_comments(self):
+    def test_get_text_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

-    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(soup.get_text(), "foobar")
+        
+    def test_all_strings_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))

+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+    def test_string_methods_inside_special_string_container_tags(self):
+        # Strings inside tags like <script> are generally ignored by
+        # methods like get_text, because they're not what humans
+        # consider 'text'. But if you call get_text on the <script>
+        # tag itself, those strings _are_ considered to be 'text',
+        # because there's nothing else you might be looking for.
+        
+        style = self.soup("<div>a<style>Some CSS</style></div>")
+        template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
+        script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
+        
+        self.assertEqual(style.div.get_text(), "a")
+        self.assertEqual(list(style.div.strings), ["a"])
+        self.assertEqual(style.div.style.get_text(), "Some CSS")
+        self.assertEqual(list(style.div.style.strings),
+                         ['Some CSS'])
+        
+        # The comment is not picked up here. That's because it was
+        # parsed into a Comment object, which is not considered
+        # interesting by template.strings.
+        self.assertEqual(template.div.get_text(), "a")
+        self.assertEqual(list(template.div.strings), ["a"])
+        self.assertEqual(template.div.template.get_text(), "Templated text.")
+        self.assertEqual(list(template.div.template.strings),
+                         ["Templated ", "text", "."])
+
+        # The comment is included here, because it didn't get parsed
+        # into a Comment object--it's part of the Script string.
+        self.assertEqual(script.div.get_text(), "a")
+        self.assertEqual(list(script.div.strings), ["a"])
+        self.assertEqual(script.div.script.get_text(),
+                         "<!--a comment-->Some text")
+        self.assertEqual(list(script.div.script.strings),
+                         ['<!--a comment-->Some text'])
+
 class TestCDAtaListAttributes(SoupTest):

    """Testing cdata-list attributes like 'class'.
@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
        else:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))

-class TestFormatter(SoupTest):
-
-    def test_sort_attributes(self):
-        # Test the ability to override Formatter.attributes() to,
-        # e.g., disable the normal sorting of attributes.
-        class UnsortedFormatter(Formatter):
-            def attributes(self, tag):
-                self.called_with = tag
-                for k, v in sorted(tag.attrs.items()):
-                    if k == 'ignore':
-                        continue
-                    yield k,v
-
-        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
-        formatter = UnsortedFormatter()
-        decoded = soup.decode(formatter=formatter)
-
-        # attributes() was called on the <p> tag. It filtered out one
-        # attribute and sorted the other two.
-        self.assertEqual(formatter.called_with, soup.p)
-        self.assertEqual('<p aval="2" cval="1"></p>', decoded)
-
-
-class TestNavigableStringSubclasses(SoupTest):
-
-    def test_cdata(self):
-        # None of the current builders turn CDATA sections into CData
-        # objects, but you can create them manually.
-        soup = self.soup("")
-        cdata = CData("foo")
-        soup.insert(1, cdata)
-        self.assertEqual(str(soup), "<![CDATA[foo]]>")
-        self.assertEqual(soup.find(text="foo"), "foo")
-        self.assertEqual(soup.contents[0], "foo")
-
-    def test_cdata_is_never_formatted(self):
-        """Text inside a CData object is passed into the formatter.
-
-        But the return value is ignored.
-        """
-
-        self.count = 0
-        def increment(*args):
-            self.count += 1
-            return "BITTER FAILURE"
-
-        soup = self.soup("")
-        cdata = CData("<><><>")
-        soup.insert(1, cdata)
-        self.assertEqual(
-            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
-        self.assertEqual(1, self.count)
-
-    def test_doctype_ends_in_newline(self):
-        # Unlike other NavigableString subclasses, a DOCTYPE always ends
-        # in a newline.
-        doctype = Doctype("foo")
-        soup = self.soup("")
-        soup.insert(1, doctype)
-        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
-
-    def test_declaration(self):
-        d = Declaration("foo")
-        self.assertEqual("<?foo?>", d.output_ready())
-
+        
 class TestSoupSelector(TreeTest):

    HTML = """
@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
-        self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
-            SyntaxError, self.soup.select, "a:nth-of-type(a)")
+            SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
-        self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
        self.assertSelects('div x,y,  z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
-        self.assertRaises(SyntaxError, self.soup.select, ',x, y')
-        self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])