mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 21:21:15 -07:00
Update bs4 to 4.8.1 (with 2to3)
This commit is contained in:
parent
23c4e5b09d
commit
f28e741ad7
19 changed files with 5487 additions and 792 deletions
|
@ -5,26 +5,30 @@ http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||||
provides provides methods and Pythonic idioms that make it easy to
|
provides methods and Pythonic idioms that make it easy to navigate,
|
||||||
navigate, search, and modify the parse tree.
|
search, and modify the parse tree.
|
||||||
|
|
||||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||||
and/or html5lib is installed.
|
and/or html5lib is installed.
|
||||||
|
|
||||||
For more than you ever wanted to know about Beautiful Soup, see the
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
documentation:
|
documentation:
|
||||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.3.2"
|
__version__ = "4.8.1"
|
||||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .builder import builder_registry, ParserRejectedMarkup
|
from .builder import builder_registry, ParserRejectedMarkup
|
||||||
|
@ -45,7 +49,7 @@ from .element import (
|
||||||
|
|
||||||
# The very first thing we do is give a useful error if someone is
|
# The very first thing we do is give a useful error if someone is
|
||||||
# running this code under Python 3 without converting it.
|
# running this code under Python 3 without converting it.
|
||||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
class BeautifulSoup(Tag):
|
class BeautifulSoup(Tag):
|
||||||
"""
|
"""
|
||||||
|
@ -59,7 +63,7 @@ class BeautifulSoup(Tag):
|
||||||
handle_starttag(name, attrs) # See note about return value
|
handle_starttag(name, attrs) # See note about return value
|
||||||
handle_endtag(name)
|
handle_endtag(name)
|
||||||
handle_data(data) # Appends to the current data node
|
handle_data(data) # Appends to the current data node
|
||||||
endData(containerClass=NavigableString) # Ends the current data node
|
endData(containerClass) # Ends the current data node
|
||||||
|
|
||||||
No matter how complicated the underlying parser is, you should be
|
No matter how complicated the underlying parser is, you should be
|
||||||
able to build a tree using 'start tag' events, 'end tag' events,
|
able to build a tree using 'start tag' events, 'end tag' events,
|
||||||
|
@ -69,7 +73,7 @@ class BeautifulSoup(Tag):
|
||||||
like HTML's <br> tag), call handle_starttag and then
|
like HTML's <br> tag), call handle_starttag and then
|
||||||
handle_endtag.
|
handle_endtag.
|
||||||
"""
|
"""
|
||||||
ROOT_TAG_NAME = u'[document]'
|
ROOT_TAG_NAME = '[document]'
|
||||||
|
|
||||||
# If the end-user gives no indication which tree builder they
|
# If the end-user gives no indication which tree builder they
|
||||||
# want, look for one with these features.
|
# want, look for one with these features.
|
||||||
|
@ -77,13 +81,62 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, **kwargs):
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
element_classes=None, **kwargs):
|
||||||
provided markup (which can be a string or a file-like object)
|
"""Constructor.
|
||||||
is fed into the underlying parser."""
|
|
||||||
|
:param markup: A string or a file-like object representing
|
||||||
|
markup to be parsed.
|
||||||
|
|
||||||
|
:param features: Desirable features of the parser to be used. This
|
||||||
|
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||||
|
"html.parser", or "html5lib") or it may be the type of markup
|
||||||
|
to be used ("html", "html5", "xml"). It's recommended that you
|
||||||
|
name a specific parser, so that Beautiful Soup gives you the
|
||||||
|
same results across platforms and virtual environments.
|
||||||
|
|
||||||
|
:param builder: A TreeBuilder subclass to instantiate (or
|
||||||
|
instance to use) instead of looking one up based on
|
||||||
|
`features`. You only need to use this if you've implemented a
|
||||||
|
custom TreeBuilder.
|
||||||
|
|
||||||
|
:param parse_only: A SoupStrainer. Only parts of the document
|
||||||
|
matching the SoupStrainer will be considered. This is useful
|
||||||
|
when parsing part of a document that would otherwise be too
|
||||||
|
large to fit into memory.
|
||||||
|
|
||||||
|
:param from_encoding: A string indicating the encoding of the
|
||||||
|
document to be parsed. Pass this in if Beautiful Soup is
|
||||||
|
guessing wrongly about the document's encoding.
|
||||||
|
|
||||||
|
:param exclude_encodings: A list of strings indicating
|
||||||
|
encodings known to be wrong. Pass this in if you don't know
|
||||||
|
the document's encoding but you know Beautiful Soup's guess is
|
||||||
|
wrong.
|
||||||
|
|
||||||
|
:param element_classes: A dictionary mapping BeautifulSoup
|
||||||
|
classes like Tag and NavigableString to other classes you'd
|
||||||
|
like to be instantiated instead as the parse tree is
|
||||||
|
built. This is useful for using subclasses to modify the
|
||||||
|
default behavior of Tag or NavigableString.
|
||||||
|
|
||||||
|
:param kwargs: For backwards compatibility purposes, the
|
||||||
|
constructor accepts certain keyword arguments used in
|
||||||
|
Beautiful Soup 3. None of these arguments do anything in
|
||||||
|
Beautiful Soup 4; they will result in a warning and then be ignored.
|
||||||
|
|
||||||
|
Apart from this, any keyword arguments passed into the BeautifulSoup
|
||||||
|
constructor are propagated to the TreeBuilder constructor. This
|
||||||
|
makes it possible to configure a TreeBuilder beyond saying
|
||||||
|
which one to use.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if 'convertEntities' in kwargs:
|
if 'convertEntities' in kwargs:
|
||||||
|
del kwargs['convertEntities']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the convertEntities argument to the "
|
"BS4 does not respect the convertEntities argument to the "
|
||||||
"BeautifulSoup constructor. Entities are always converted "
|
"BeautifulSoup constructor. Entities are always converted "
|
||||||
|
@ -114,9 +167,9 @@ class BeautifulSoup(Tag):
|
||||||
del kwargs['isHTML']
|
del kwargs['isHTML']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the isHTML argument to the "
|
"BS4 does not respect the isHTML argument to the "
|
||||||
"BeautifulSoup constructor. You can pass in features='html' "
|
"BeautifulSoup constructor. Suggest you use "
|
||||||
"or features='xml' to get a builder capable of handling "
|
"features='lxml' for HTML and features='lxml-xml' for "
|
||||||
"one or the other.")
|
"XML.")
|
||||||
|
|
||||||
def deprecated_argument(old_name, new_name):
|
def deprecated_argument(old_name, new_name):
|
||||||
if old_name in kwargs:
|
if old_name in kwargs:
|
||||||
|
@ -134,13 +187,24 @@ class BeautifulSoup(Tag):
|
||||||
from_encoding = from_encoding or deprecated_argument(
|
from_encoding = from_encoding or deprecated_argument(
|
||||||
"fromEncoding", "from_encoding")
|
"fromEncoding", "from_encoding")
|
||||||
|
|
||||||
if len(kwargs) > 0:
|
if from_encoding and isinstance(markup, str):
|
||||||
arg = kwargs.keys().pop()
|
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||||
raise TypeError(
|
from_encoding = None
|
||||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
|
||||||
|
|
||||||
if builder is None:
|
self.element_classes = element_classes or dict()
|
||||||
if isinstance(features, basestring):
|
|
||||||
|
# We need this information to track whether or not the builder
|
||||||
|
# was specified well enough that we can omit the 'you need to
|
||||||
|
# specify a parser' warning.
|
||||||
|
original_builder = builder
|
||||||
|
original_features = features
|
||||||
|
|
||||||
|
if isinstance(builder, type):
|
||||||
|
# A builder class was passed in; it needs to be instantiated.
|
||||||
|
builder_class = builder
|
||||||
|
builder = None
|
||||||
|
elif builder is None:
|
||||||
|
if isinstance(features, str):
|
||||||
features = [features]
|
features = [features]
|
||||||
if features is None or len(features) == 0:
|
if features is None or len(features) == 0:
|
||||||
features = self.DEFAULT_BUILDER_FEATURES
|
features = self.DEFAULT_BUILDER_FEATURES
|
||||||
|
@ -150,21 +214,73 @@ class BeautifulSoup(Tag):
|
||||||
"Couldn't find a tree builder with the features you "
|
"Couldn't find a tree builder with the features you "
|
||||||
"requested: %s. Do you need to install a parser library?"
|
"requested: %s. Do you need to install a parser library?"
|
||||||
% ",".join(features))
|
% ",".join(features))
|
||||||
builder = builder_class()
|
|
||||||
|
# At this point either we have a TreeBuilder instance in
|
||||||
|
# builder, or we have a builder_class that we can instantiate
|
||||||
|
# with the remaining **kwargs.
|
||||||
|
if builder is None:
|
||||||
|
builder = builder_class(**kwargs)
|
||||||
|
if not original_builder and not (
|
||||||
|
original_features == builder.NAME or
|
||||||
|
original_features in builder.ALTERNATE_NAMES
|
||||||
|
):
|
||||||
|
if builder.is_xml:
|
||||||
|
markup_type = "XML"
|
||||||
|
else:
|
||||||
|
markup_type = "HTML"
|
||||||
|
|
||||||
|
# This code adapted from warnings.py so that we get the same line
|
||||||
|
# of code as our warnings.warn() call gets, even if the answer is wrong
|
||||||
|
# (as it may be in a multithreading situation).
|
||||||
|
caller = None
|
||||||
|
try:
|
||||||
|
caller = sys._getframe(1)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
if caller:
|
||||||
|
globals = caller.f_globals
|
||||||
|
line_number = caller.f_lineno
|
||||||
|
else:
|
||||||
|
globals = sys.__dict__
|
||||||
|
line_number= 1
|
||||||
|
filename = globals.get('__file__')
|
||||||
|
if filename:
|
||||||
|
fnl = filename.lower()
|
||||||
|
if fnl.endswith((".pyc", ".pyo")):
|
||||||
|
filename = filename[:-1]
|
||||||
|
if filename:
|
||||||
|
# If there is no filename at all, the user is most likely in a REPL,
|
||||||
|
# and the warning is not necessary.
|
||||||
|
values = dict(
|
||||||
|
filename=filename,
|
||||||
|
line_number=line_number,
|
||||||
|
parser=builder.NAME,
|
||||||
|
markup_type=markup_type
|
||||||
|
)
|
||||||
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||||
|
else:
|
||||||
|
if kwargs:
|
||||||
|
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
self.builder.soup = self
|
self.known_xml = self.is_xml
|
||||||
|
self._namespaces = dict()
|
||||||
self.parse_only = parse_only
|
self.parse_only = parse_only
|
||||||
|
|
||||||
|
self.builder.initialize_soup(self)
|
||||||
|
|
||||||
if hasattr(markup, 'read'): # It's a file-type object.
|
if hasattr(markup, 'read'): # It's a file-type object.
|
||||||
markup = markup.read()
|
markup = markup.read()
|
||||||
elif len(markup) <= 256:
|
elif len(markup) <= 256 and (
|
||||||
|
(isinstance(markup, bytes) and not b'<' in markup)
|
||||||
|
or (isinstance(markup, str) and not '<' in markup)
|
||||||
|
):
|
||||||
# Print out warnings for a couple beginner problems
|
# Print out warnings for a couple beginner problems
|
||||||
# involving passing non-markup to Beautiful Soup.
|
# involving passing non-markup to Beautiful Soup.
|
||||||
# Beautiful Soup will still parse the input as markup,
|
# Beautiful Soup will still parse the input as markup,
|
||||||
# just in case that's what the user really wants.
|
# just in case that's what the user really wants.
|
||||||
if (isinstance(markup, unicode)
|
if (isinstance(markup, str)
|
||||||
and not os.path.supports_unicode_filenames):
|
and not os.path.supports_unicode_filenames):
|
||||||
possible_filename = markup.encode("utf8")
|
possible_filename = markup.encode("utf8")
|
||||||
else:
|
else:
|
||||||
|
@ -172,37 +288,93 @@ class BeautifulSoup(Tag):
|
||||||
is_file = False
|
is_file = False
|
||||||
try:
|
try:
|
||||||
is_file = os.path.exists(possible_filename)
|
is_file = os.path.exists(possible_filename)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# This is almost certainly a problem involving
|
# This is almost certainly a problem involving
|
||||||
# characters not valid in filenames on this
|
# characters not valid in filenames on this
|
||||||
# system. Just let it go.
|
# system. Just let it go.
|
||||||
pass
|
pass
|
||||||
if is_file:
|
if is_file:
|
||||||
|
if isinstance(markup, str):
|
||||||
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
'"%s" looks like a filename, not markup. You should'
|
||||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
' probably open this file and pass the filehandle into'
|
||||||
# TODO: This is ugly but I couldn't get it to work in
|
' Beautiful Soup.' % markup)
|
||||||
# Python 3 otherwise.
|
self._check_markup_is_url(markup)
|
||||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
|
||||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
|
||||||
warnings.warn(
|
|
||||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
|
||||||
|
|
||||||
|
rejections = []
|
||||||
|
success = False
|
||||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
self.contains_replacement_characters) in (
|
self.contains_replacement_characters) in (
|
||||||
self.builder.prepare_markup(markup, from_encoding)):
|
self.builder.prepare_markup(
|
||||||
|
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||||
self.reset()
|
self.reset()
|
||||||
try:
|
try:
|
||||||
self._feed()
|
self._feed()
|
||||||
|
success = True
|
||||||
break
|
break
|
||||||
except ParserRejectedMarkup:
|
except ParserRejectedMarkup as e:
|
||||||
|
rejections.append(e)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
other_exceptions = [str(e) for e in rejections]
|
||||||
|
raise ParserRejectedMarkup(
|
||||||
|
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
|
||||||
|
)
|
||||||
|
|
||||||
# Clear out the markup and remove the builder's circular
|
# Clear out the markup and remove the builder's circular
|
||||||
# reference to this object.
|
# reference to this object.
|
||||||
self.markup = None
|
self.markup = None
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
copy = type(self)(
|
||||||
|
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Although we encoded the tree to UTF-8, that may not have
|
||||||
|
# been the encoding of the original markup. Set the copy's
|
||||||
|
# .original_encoding to reflect the original object's
|
||||||
|
# .original_encoding.
|
||||||
|
copy.original_encoding = self.original_encoding
|
||||||
|
return copy
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# Frequently a tree builder can't be pickled.
|
||||||
|
d = dict(self.__dict__)
|
||||||
|
if 'builder' in d and not self.builder.picklable:
|
||||||
|
d['builder'] = None
|
||||||
|
return d
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _check_markup_is_url(markup):
|
||||||
|
"""
|
||||||
|
Check if markup looks like it's actually a url and raise a warning
|
||||||
|
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||||
|
"""
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
space = b' '
|
||||||
|
cant_start_with = (b"http:", b"https:")
|
||||||
|
elif isinstance(markup, str):
|
||||||
|
space = ' '
|
||||||
|
cant_start_with = ("http:", "https:")
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||||
|
if not space in markup:
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
decoded_markup = markup.decode('utf-8', 'replace')
|
||||||
|
else:
|
||||||
|
decoded_markup = markup
|
||||||
|
warnings.warn(
|
||||||
|
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||||
|
' HTTP client. You should probably use an HTTP client like'
|
||||||
|
' requests to get the document behind the URL, and feed'
|
||||||
|
' that document to Beautiful Soup.' % decoded_markup
|
||||||
|
)
|
||||||
|
|
||||||
def _feed(self):
|
def _feed(self):
|
||||||
# Convert the document to Unicode.
|
# Convert the document to Unicode.
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
|
@ -223,15 +395,21 @@ class BeautifulSoup(Tag):
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||||
|
sourceline=None, sourcepos=None, **kwattrs):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new tag associated with this soup."""
|
||||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
kwattrs.update(attrs)
|
||||||
|
return self.element_classes.get(Tag, Tag)(
|
||||||
|
None, self.builder, name, namespace, nsprefix, kwattrs,
|
||||||
|
sourceline=sourceline, sourcepos=sourcepos
|
||||||
|
)
|
||||||
|
|
||||||
def new_string(self, s, subclass=NavigableString):
|
def new_string(self, s, subclass=None):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
navigable = subclass(s)
|
subclass = subclass or self.element_classes.get(
|
||||||
navigable.setup()
|
NavigableString, NavigableString
|
||||||
return navigable
|
)
|
||||||
|
return subclass(s)
|
||||||
|
|
||||||
def insert_before(self, successor):
|
def insert_before(self, successor):
|
||||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||||
|
@ -250,16 +428,26 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def pushTag(self, tag):
|
def pushTag(self, tag):
|
||||||
#print "Push", tag.name
|
#print "Push", tag.name
|
||||||
if self.currentTag:
|
if self.currentTag is not None:
|
||||||
self.currentTag.contents.append(tag)
|
self.currentTag.contents.append(tag)
|
||||||
self.tagStack.append(tag)
|
self.tagStack.append(tag)
|
||||||
self.currentTag = self.tagStack[-1]
|
self.currentTag = self.tagStack[-1]
|
||||||
if tag.name in self.builder.preserve_whitespace_tags:
|
if tag.name in self.builder.preserve_whitespace_tags:
|
||||||
self.preserve_whitespace_tag_stack.append(tag)
|
self.preserve_whitespace_tag_stack.append(tag)
|
||||||
|
|
||||||
def endData(self, containerClass=NavigableString):
|
def endData(self, containerClass=None):
|
||||||
|
|
||||||
|
# Default container is NavigableString.
|
||||||
|
containerClass = containerClass or NavigableString
|
||||||
|
|
||||||
|
# The user may want us to instantiate some alias for the
|
||||||
|
# container class.
|
||||||
|
containerClass = self.element_classes.get(
|
||||||
|
containerClass, containerClass
|
||||||
|
)
|
||||||
|
|
||||||
if self.current_data:
|
if self.current_data:
|
||||||
current_data = u''.join(self.current_data)
|
current_data = ''.join(self.current_data)
|
||||||
# If whitespace is not preserved, and this string contains
|
# If whitespace is not preserved, and this string contains
|
||||||
# nothing but ASCII spaces, replace it with a single space
|
# nothing but ASCII spaces, replace it with a single space
|
||||||
# or newline.
|
# or newline.
|
||||||
|
@ -289,15 +477,72 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Add an object to the parse tree."""
|
||||||
parent = parent or self.currentTag
|
if parent is None:
|
||||||
most_recent_element = most_recent_element or self._most_recent_element
|
parent = self.currentTag
|
||||||
o.setup(parent, most_recent_element)
|
|
||||||
|
|
||||||
if most_recent_element is not None:
|
if most_recent_element is not None:
|
||||||
most_recent_element.next_element = o
|
previous_element = most_recent_element
|
||||||
|
else:
|
||||||
|
previous_element = self._most_recent_element
|
||||||
|
|
||||||
|
next_element = previous_sibling = next_sibling = None
|
||||||
|
if isinstance(o, Tag):
|
||||||
|
next_element = o.next_element
|
||||||
|
next_sibling = o.next_sibling
|
||||||
|
previous_sibling = o.previous_sibling
|
||||||
|
if previous_element is None:
|
||||||
|
previous_element = o.previous_element
|
||||||
|
|
||||||
|
fix = parent.next_element is not None
|
||||||
|
|
||||||
|
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||||
|
|
||||||
self._most_recent_element = o
|
self._most_recent_element = o
|
||||||
parent.contents.append(o)
|
parent.contents.append(o)
|
||||||
|
|
||||||
|
# Check if we are inserting into an already parsed node.
|
||||||
|
if fix:
|
||||||
|
self._linkage_fixer(parent)
|
||||||
|
|
||||||
|
def _linkage_fixer(self, el):
|
||||||
|
"""Make sure linkage of this fragment is sound."""
|
||||||
|
|
||||||
|
first = el.contents[0]
|
||||||
|
child = el.contents[-1]
|
||||||
|
descendant = child
|
||||||
|
|
||||||
|
if child is first and el.parent is not None:
|
||||||
|
# Parent should be linked to first child
|
||||||
|
el.next_element = child
|
||||||
|
# We are no longer linked to whatever this element is
|
||||||
|
prev_el = child.previous_element
|
||||||
|
if prev_el is not None and prev_el is not el:
|
||||||
|
prev_el.next_element = None
|
||||||
|
# First child should be linked to the parent, and no previous siblings.
|
||||||
|
child.previous_element = el
|
||||||
|
child.previous_sibling = None
|
||||||
|
|
||||||
|
# We have no sibling as we've been appended as the last.
|
||||||
|
child.next_sibling = None
|
||||||
|
|
||||||
|
# This index is a tag, dig deeper for a "last descendant"
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = child._last_descendant(False)
|
||||||
|
|
||||||
|
# As the final step, link last descendant. It should be linked
|
||||||
|
# to the parent's next sibling (if found), else walk up the chain
|
||||||
|
# and find a parent with a sibling. It should have no next sibling.
|
||||||
|
descendant.next_element = None
|
||||||
|
descendant.next_sibling = None
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
descendant.next_element = target.next_sibling
|
||||||
|
target.next_sibling.previous_element = child
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
instance of the given tag. If inclusivePop is false, pops the tag
|
instance of the given tag. If inclusivePop is false, pops the tag
|
||||||
|
@ -321,11 +566,12 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
return most_recently_popped
|
return most_recently_popped
|
||||||
|
|
||||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
||||||
|
sourcepos=None):
|
||||||
"""Push a start tag on to the stack.
|
"""Push a start tag on to the stack.
|
||||||
|
|
||||||
If this method returns None, the tag was rejected by the
|
If this method returns None, the tag was rejected by the
|
||||||
SoupStrainer. You should proceed as if the tag had not occured
|
SoupStrainer. You should proceed as if the tag had not occurred
|
||||||
in the document. For instance, if this was a self-closing tag,
|
in the document. For instance, if this was a self-closing tag,
|
||||||
don't call handle_endtag.
|
don't call handle_endtag.
|
||||||
"""
|
"""
|
||||||
|
@ -338,11 +584,14 @@ class BeautifulSoup(Tag):
|
||||||
or not self.parse_only.search_tag(name, attrs))):
|
or not self.parse_only.search_tag(name, attrs))):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
tag = self.element_classes.get(Tag, Tag)(
|
||||||
self.currentTag, self._most_recent_element)
|
self, self.builder, name, namespace, nsprefix, attrs,
|
||||||
|
self.currentTag, self._most_recent_element,
|
||||||
|
sourceline=sourceline, sourcepos=sourcepos
|
||||||
|
)
|
||||||
if tag is None:
|
if tag is None:
|
||||||
return tag
|
return tag
|
||||||
if self._most_recent_element:
|
if self._most_recent_element is not None:
|
||||||
self._most_recent_element.next_element = tag
|
self._most_recent_element.next_element = tag
|
||||||
self._most_recent_element = tag
|
self._most_recent_element = tag
|
||||||
self.pushTag(tag)
|
self.pushTag(tag)
|
||||||
|
@ -367,9 +616,9 @@ class BeautifulSoup(Tag):
|
||||||
encoding_part = ''
|
encoding_part = ''
|
||||||
if eventual_encoding != None:
|
if eventual_encoding != None:
|
||||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
prefix = ''
|
||||||
if not pretty_print:
|
if not pretty_print:
|
||||||
indent_level = None
|
indent_level = None
|
||||||
else:
|
else:
|
||||||
|
@ -403,4 +652,4 @@ class FeatureNotFound(ValueError):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
soup = BeautifulSoup(sys.stdin)
|
soup = BeautifulSoup(sys.stdin)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
|
@ -1,10 +1,13 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
import sys
|
import sys
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
CharsetMetaAttributeValue,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
whitespace_re
|
nonwhitespace_re
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
@ -80,20 +83,69 @@ builder_registry = TreeBuilderRegistry()
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Turn a document into a Beautiful Soup object tree."""
|
"""Turn a document into a Beautiful Soup object tree."""
|
||||||
|
|
||||||
|
NAME = "[Unknown tree builder]"
|
||||||
|
ALTERNATE_NAMES = []
|
||||||
features = []
|
features = []
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
preserve_whitespace_tags = set()
|
picklable = False
|
||||||
empty_element_tags = None # A tag will be considered an empty-element
|
empty_element_tags = None # A tag will be considered an empty-element
|
||||||
# tag when and only when it has no contents.
|
# tag when and only when it has no contents.
|
||||||
|
|
||||||
# A value for these tag/attribute combinations is a space- or
|
# A value for these tag/attribute combinations is a space- or
|
||||||
# comma-separated list of CDATA, rather than a single CDATA.
|
# comma-separated list of CDATA, rather than a single CDATA.
|
||||||
cdata_list_attributes = {}
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||||
|
|
||||||
def __init__(self):
|
USE_DEFAULT = object()
|
||||||
|
|
||||||
|
# Most parsers don't keep track of line numbers.
|
||||||
|
TRACKS_LINE_NUMBERS = False
|
||||||
|
|
||||||
|
def __init__(self, multi_valued_attributes=USE_DEFAULT,
|
||||||
|
preserve_whitespace_tags=USE_DEFAULT,
|
||||||
|
store_line_numbers=USE_DEFAULT):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
:param multi_valued_attributes: If this is set to None, the
|
||||||
|
TreeBuilder will not turn any values for attributes like
|
||||||
|
'class' into lists. Setting this do a dictionary will
|
||||||
|
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
for an example.
|
||||||
|
|
||||||
|
Internally, these are called "CDATA list attributes", but that
|
||||||
|
probably doesn't make sense to an end-user, so the argument name
|
||||||
|
is `multi_valued_attributes`.
|
||||||
|
|
||||||
|
:param preserve_whitespace_tags: A list of tags to treat
|
||||||
|
the way <pre> tags are treated in HTML. Tags in this list
|
||||||
|
will have
|
||||||
|
|
||||||
|
:param store_line_numbers: If the parser keeps track of the
|
||||||
|
line numbers and positions of the original markup, that
|
||||||
|
information will, by default, be stored in each corresponding
|
||||||
|
`Tag` object. You can turn this off by passing
|
||||||
|
store_line_numbers=False. If the parser you're using doesn't
|
||||||
|
keep track of this information, then setting store_line_numbers=True
|
||||||
|
will do nothing.
|
||||||
|
"""
|
||||||
self.soup = None
|
self.soup = None
|
||||||
|
if multi_valued_attributes is self.USE_DEFAULT:
|
||||||
|
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
self.cdata_list_attributes = multi_valued_attributes
|
||||||
|
if preserve_whitespace_tags is self.USE_DEFAULT:
|
||||||
|
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
|
||||||
|
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||||
|
if store_line_numbers == self.USE_DEFAULT:
|
||||||
|
store_line_numbers = self.TRACKS_LINE_NUMBERS
|
||||||
|
self.store_line_numbers = store_line_numbers
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""The BeautifulSoup object has been initialized and is now
|
||||||
|
being associated with the TreeBuilder.
|
||||||
|
"""
|
||||||
|
self.soup = soup
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
pass
|
pass
|
||||||
|
@ -123,8 +175,8 @@ class TreeBuilder(object):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
return markup, None, None, False
|
yield markup, None, None, False
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""Wrap an HTML fragment to make it look like a document.
|
"""Wrap an HTML fragment to make it look like a document.
|
||||||
|
@ -153,14 +205,14 @@ class TreeBuilder(object):
|
||||||
universal = self.cdata_list_attributes.get('*', [])
|
universal = self.cdata_list_attributes.get('*', [])
|
||||||
tag_specific = self.cdata_list_attributes.get(
|
tag_specific = self.cdata_list_attributes.get(
|
||||||
tag_name.lower(), None)
|
tag_name.lower(), None)
|
||||||
for attr in attrs.keys():
|
for attr in list(attrs.keys()):
|
||||||
if attr in universal or (tag_specific and attr in tag_specific):
|
if attr in universal or (tag_specific and attr in tag_specific):
|
||||||
# We have a "class"-type attribute whose string
|
# We have a "class"-type attribute whose string
|
||||||
# value is a whitespace-separated list of
|
# value is a whitespace-separated list of
|
||||||
# values. Split it into a list.
|
# values. Split it into a list.
|
||||||
value = attrs[attr]
|
value = attrs[attr]
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
values = whitespace_re.split(value)
|
values = nonwhitespace_re.findall(value)
|
||||||
else:
|
else:
|
||||||
# html5lib sometimes calls setAttributes twice
|
# html5lib sometimes calls setAttributes twice
|
||||||
# for the same tag when rearranging the parse
|
# for the same tag when rearranging the parse
|
||||||
|
@ -224,9 +276,19 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
Such as which tags are empty-element tags.
|
Such as which tags are empty-element tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = set(['pre', 'textarea'])
|
empty_element_tags = set([
|
||||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
# These are from HTML5.
|
||||||
'spacer', 'link', 'frame', 'base'])
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
|
||||||
|
# These are from earlier versions of HTML and are removed in HTML5.
|
||||||
|
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||||
|
])
|
||||||
|
|
||||||
|
# The HTML standard defines these as block-level elements. Beautiful
|
||||||
|
# Soup does not treat these elements differently from other elements,
|
||||||
|
# but it may do so eventually, and this information is available if
|
||||||
|
# you need to use it.
|
||||||
|
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
|
@ -235,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
# encounter one of these attributes, we will parse its value into
|
# encounter one of these attributes, we will parse its value into
|
||||||
# a list of values if possible. Upon output, the list will be
|
# a list of values if possible. Upon output, the list will be
|
||||||
# converted back into a string.
|
# converted back into a string.
|
||||||
cdata_list_attributes = {
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {
|
||||||
"*" : ['class', 'accesskey', 'dropzone'],
|
"*" : ['class', 'accesskey', 'dropzone'],
|
||||||
"a" : ['rel', 'rev'],
|
"a" : ['rel', 'rev'],
|
||||||
"link" : ['rel', 'rev'],
|
"link" : ['rel', 'rev'],
|
||||||
|
@ -252,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
"output" : ["for"],
|
"output" : ["for"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||||
|
|
||||||
def set_up_substitutions(self, tag):
|
def set_up_substitutions(self, tag):
|
||||||
# We are only interested in <meta> tags
|
# We are only interested in <meta> tags
|
||||||
if tag.name != 'meta':
|
if tag.name != 'meta':
|
||||||
|
@ -299,7 +363,14 @@ def register_treebuilders_from(module):
|
||||||
this_module.builder_registry.register(obj)
|
this_module.builder_registry.register(obj)
|
||||||
|
|
||||||
class ParserRejectedMarkup(Exception):
|
class ParserRejectedMarkup(Exception):
|
||||||
pass
|
def __init__(self, message_or_exception):
|
||||||
|
"""Explain why the parser rejected the given markup, either
|
||||||
|
with a textual explanation or another exception.
|
||||||
|
"""
|
||||||
|
if isinstance(message_or_exception, Exception):
|
||||||
|
e = message_or_exception
|
||||||
|
message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
|
||||||
|
super(ParserRejectedMarkup, self).__init__(message_or_exception)
|
||||||
|
|
||||||
# Builders are registered in reverse order of priority, so that custom
|
# Builders are registered in reverse order of priority, so that custom
|
||||||
# builder registrations will take precedence. In general, we want lxml
|
# builder registrations will take precedence. In general, we want lxml
|
||||||
|
|
|
@ -1,17 +1,27 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTML5TreeBuilder',
|
'HTML5TreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
import re
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
PERMISSIVE,
|
PERMISSIVE,
|
||||||
HTML,
|
HTML,
|
||||||
HTML_5,
|
HTML_5,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
)
|
)
|
||||||
from bs4.element import NamespacedAttribute
|
from bs4.element import (
|
||||||
|
NamespacedAttribute,
|
||||||
|
nonwhitespace_re,
|
||||||
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import (
|
||||||
|
namespaces,
|
||||||
|
prefixes,
|
||||||
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -19,14 +29,36 @@ from bs4.element import (
|
||||||
Tag,
|
Tag,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Pre-0.99999999
|
||||||
|
from html5lib.treebuilders import _base as treebuilder_base
|
||||||
|
new_html5lib = False
|
||||||
|
except ImportError as e:
|
||||||
|
# 0.99999999 and up
|
||||||
|
from html5lib.treebuilders import base as treebuilder_base
|
||||||
|
new_html5lib = True
|
||||||
|
|
||||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
"""Use html5lib to build a tree."""
|
"""Use html5lib to build a tree."""
|
||||||
|
|
||||||
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
|
NAME = "html5lib"
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding):
|
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||||
|
|
||||||
|
# html5lib can tell us which line number and position in the
|
||||||
|
# original file is the source of an element.
|
||||||
|
TRACKS_LINE_NUMBERS = True
|
||||||
|
|
||||||
|
def prepare_markup(self, markup, user_specified_encoding,
|
||||||
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
# Store the user-specified encoding for use later on.
|
# Store the user-specified encoding for use later on.
|
||||||
self.user_specified_encoding = user_specified_encoding
|
self.user_specified_encoding = user_specified_encoding
|
||||||
|
|
||||||
|
# document_declared_encoding and exclude_encodings aren't used
|
||||||
|
# ATM because the html5lib TreeBuilder doesn't use
|
||||||
|
# UnicodeDammit.
|
||||||
|
if exclude_encodings:
|
||||||
|
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
|
|
||||||
# These methods are defined by Beautiful Soup.
|
# These methods are defined by Beautiful Soup.
|
||||||
|
@ -34,32 +66,63 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
if self.soup.parse_only is not None:
|
if self.soup.parse_only is not None:
|
||||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||||
doc = parser.parse(markup, encoding=self.user_specified_encoding)
|
self.underlying_builder.parser = parser
|
||||||
|
extra_kwargs = dict()
|
||||||
|
if not isinstance(markup, str):
|
||||||
|
if new_html5lib:
|
||||||
|
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||||
|
else:
|
||||||
|
extra_kwargs['encoding'] = self.user_specified_encoding
|
||||||
|
doc = parser.parse(markup, **extra_kwargs)
|
||||||
|
|
||||||
# Set the character encoding detected by the tokenizer.
|
# Set the character encoding detected by the tokenizer.
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
# We need to special-case this because html5lib sets
|
# We need to special-case this because html5lib sets
|
||||||
# charEncoding to UTF-8 if it gets Unicode input.
|
# charEncoding to UTF-8 if it gets Unicode input.
|
||||||
doc.original_encoding = None
|
doc.original_encoding = None
|
||||||
else:
|
else:
|
||||||
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
|
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||||
|
if not isinstance(original_encoding, str):
|
||||||
|
# In 0.99999999 and up, the encoding is an html5lib
|
||||||
|
# Encoding object. We want to use a string for compatibility
|
||||||
|
# with other tree builders.
|
||||||
|
original_encoding = original_encoding.name
|
||||||
|
doc.original_encoding = original_encoding
|
||||||
|
self.underlying_builder.parser = None
|
||||||
|
|
||||||
def create_treebuilder(self, namespaceHTMLElements):
|
def create_treebuilder(self, namespaceHTMLElements):
|
||||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||||
self.soup, namespaceHTMLElements)
|
namespaceHTMLElements, self.soup,
|
||||||
|
store_line_numbers=self.store_line_numbers
|
||||||
|
)
|
||||||
return self.underlying_builder
|
return self.underlying_builder
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
return '<html><head></head><body>%s</body></html>' % fragment
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, soup, namespaceHTMLElements):
|
def __init__(self, namespaceHTMLElements, soup=None,
|
||||||
|
store_line_numbers=True, **kwargs):
|
||||||
|
if soup:
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
else:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
# TODO: Why is the parser 'html.parser' here? To avoid an
|
||||||
|
# infinite loop?
|
||||||
|
self.soup = BeautifulSoup(
|
||||||
|
"", "html.parser", store_line_numbers=store_line_numbers,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||||
|
|
||||||
|
# This will be set later to an html5lib.html5parser.HTMLParser
|
||||||
|
# object, which we can use to track the current line number.
|
||||||
|
self.parser = None
|
||||||
|
self.store_line_numbers = store_line_numbers
|
||||||
|
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
self.soup.reset()
|
self.soup.reset()
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
@ -73,14 +136,26 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
self.soup.object_was_parsed(doctype)
|
self.soup.object_was_parsed(doctype)
|
||||||
|
|
||||||
def elementClass(self, name, namespace):
|
def elementClass(self, name, namespace):
|
||||||
tag = self.soup.new_tag(name, namespace)
|
kwargs = {}
|
||||||
|
if self.parser and self.store_line_numbers:
|
||||||
|
# This represents the point immediately after the end of the
|
||||||
|
# tag. We don't know when the tag started, but we do know
|
||||||
|
# where it ended -- the character just before this one.
|
||||||
|
sourceline, sourcepos = self.parser.tokenizer.stream.position()
|
||||||
|
kwargs['sourceline'] = sourceline
|
||||||
|
kwargs['sourcepos'] = sourcepos-1
|
||||||
|
tag = self.soup.new_tag(name, namespace, **kwargs)
|
||||||
|
|
||||||
return Element(tag, self.soup, namespace)
|
return Element(tag, self.soup, namespace)
|
||||||
|
|
||||||
def commentClass(self, data):
|
def commentClass(self, data):
|
||||||
return TextNode(Comment(data), self.soup)
|
return TextNode(Comment(data), self.soup)
|
||||||
|
|
||||||
def fragmentClass(self):
|
def fragmentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
from bs4 import BeautifulSoup
|
||||||
|
# TODO: Why is the parser 'html.parser' here? To avoid an
|
||||||
|
# infinite loop?
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
self.soup.name = "[document_fragment]"
|
self.soup.name = "[document_fragment]"
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
|
@ -92,7 +167,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
return self.soup
|
return self.soup
|
||||||
|
|
||||||
def getFragment(self):
|
def getFragment(self):
|
||||||
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
rv = []
|
||||||
|
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||||
|
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if isinstance(element, BeautifulSoup):
|
||||||
|
pass
|
||||||
|
if isinstance(element, Doctype):
|
||||||
|
m = doctype_re.match(element)
|
||||||
|
if m:
|
||||||
|
name = m.group(1)
|
||||||
|
if m.lastindex > 1:
|
||||||
|
publicId = m.group(2) or ""
|
||||||
|
systemId = m.group(3) or m.group(4) or ""
|
||||||
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||||
|
(' ' * indent, name, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||||
|
elif isinstance(element, Comment):
|
||||||
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||||
|
elif isinstance(element, NavigableString):
|
||||||
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
|
else:
|
||||||
|
if element.namespace:
|
||||||
|
name = "%s %s" % (prefixes[element.namespace],
|
||||||
|
element.name)
|
||||||
|
else:
|
||||||
|
name = element.name
|
||||||
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
|
if element.attrs:
|
||||||
|
attributes = []
|
||||||
|
for name, value in list(element.attrs.items()):
|
||||||
|
if isinstance(name, NamespacedAttribute):
|
||||||
|
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||||
|
if isinstance(value, list):
|
||||||
|
value = " ".join(value)
|
||||||
|
attributes.append((name, value))
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.children:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
class AttrList(object):
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
|
@ -101,7 +226,16 @@ class AttrList(object):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return list(self.attrs.items()).__iter__()
|
return list(self.attrs.items()).__iter__()
|
||||||
def __setitem__(self, name, value):
|
def __setitem__(self, name, value):
|
||||||
"set attr", name, value
|
# If this attribute is a multi-valued attribute for this element,
|
||||||
|
# turn its value into a list.
|
||||||
|
list_attr = self.element.cdata_list_attributes
|
||||||
|
if (name in list_attr['*']
|
||||||
|
or (self.element.name in list_attr
|
||||||
|
and name in list_attr[self.element.name])):
|
||||||
|
# A node that is being cloned may have already undergone
|
||||||
|
# this procedure.
|
||||||
|
if not isinstance(value, list):
|
||||||
|
value = nonwhitespace_re.findall(value)
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
def items(self):
|
def items(self):
|
||||||
return list(self.attrs.items())
|
return list(self.attrs.items())
|
||||||
|
@ -115,16 +249,16 @@ class AttrList(object):
|
||||||
return name in list(self.attrs.keys())
|
return name in list(self.attrs.keys())
|
||||||
|
|
||||||
|
|
||||||
class Element(html5lib.treebuilders._base.Node):
|
class Element(treebuilder_base.Node):
|
||||||
def __init__(self, element, soup, namespace):
|
def __init__(self, element, soup, namespace):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, element.name)
|
treebuilder_base.Node.__init__(self, element.name)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
self.namespace = namespace
|
self.namespace = namespace
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
string_child = child = None
|
string_child = child = None
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Some other piece of code decided to pass in a string
|
# Some other piece of code decided to pass in a string
|
||||||
# instead of creating a TextElement object to contain the
|
# instead of creating a TextElement object to contain the
|
||||||
# string.
|
# string.
|
||||||
|
@ -136,13 +270,15 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
child = node
|
child = node
|
||||||
elif node.element.__class__ == NavigableString:
|
elif node.element.__class__ == NavigableString:
|
||||||
string_child = child = node.element
|
string_child = child = node.element
|
||||||
|
node.parent = self
|
||||||
else:
|
else:
|
||||||
child = node.element
|
child = node.element
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
if not isinstance(child, basestring) and child.parent is not None:
|
if not isinstance(child, str) and child.parent is not None:
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
|
|
||||||
if (string_child and self.element.contents
|
if (string_child is not None and self.element.contents
|
||||||
and self.element.contents[-1].__class__ == NavigableString):
|
and self.element.contents[-1].__class__ == NavigableString):
|
||||||
# We are appending a string onto another string.
|
# We are appending a string onto another string.
|
||||||
# TODO This has O(n^2) performance, for input like
|
# TODO This has O(n^2) performance, for input like
|
||||||
|
@ -152,7 +288,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
old_element.replace_with(new_element)
|
old_element.replace_with(new_element)
|
||||||
self.soup._most_recent_element = new_element
|
self.soup._most_recent_element = new_element
|
||||||
else:
|
else:
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Create a brand new NavigableString from this string.
|
# Create a brand new NavigableString from this string.
|
||||||
child = self.soup.new_string(node)
|
child = self.soup.new_string(node)
|
||||||
|
|
||||||
|
@ -161,6 +297,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
# immediately after the parent, if it has no children.)
|
# immediately after the parent, if it has no children.)
|
||||||
if self.element.contents:
|
if self.element.contents:
|
||||||
most_recent_element = self.element._last_descendant(False)
|
most_recent_element = self.element._last_descendant(False)
|
||||||
|
elif self.element.next_element is not None:
|
||||||
|
# Something from further ahead in the parse tree is
|
||||||
|
# being inserted into this earlier element. This is
|
||||||
|
# very annoying because it means an expensive search
|
||||||
|
# for the last element in the tree.
|
||||||
|
most_recent_element = self.soup._last_descendant()
|
||||||
else:
|
else:
|
||||||
most_recent_element = self.element
|
most_recent_element = self.element
|
||||||
|
|
||||||
|
@ -169,9 +311,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
most_recent_element=most_recent_element)
|
most_recent_element=most_recent_element)
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
|
if isinstance(self.element, Comment):
|
||||||
|
return {}
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
|
||||||
if attributes is not None and len(attributes) > 0:
|
if attributes is not None and len(attributes) > 0:
|
||||||
|
|
||||||
converted_attributes = []
|
converted_attributes = []
|
||||||
|
@ -183,7 +328,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
self.soup.builder._replace_cdata_list_attribute_values(
|
self.soup.builder._replace_cdata_list_attribute_values(
|
||||||
self.name, attributes)
|
self.name, attributes)
|
||||||
for name, value in attributes.items():
|
for name, value in list(attributes.items()):
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
|
|
||||||
# The attributes may contain variables that need substitution.
|
# The attributes may contain variables that need substitution.
|
||||||
|
@ -195,11 +340,11 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
attributes = property(getAttributes, setAttributes)
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
if insertBefore:
|
|
||||||
text = TextNode(self.soup.new_string(data), self.soup)
|
text = TextNode(self.soup.new_string(data), self.soup)
|
||||||
self.insertBefore(data, insertBefore)
|
if insertBefore:
|
||||||
|
self.insertBefore(text, insertBefore)
|
||||||
else:
|
else:
|
||||||
self.appendChild(data)
|
self.appendChild(text)
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self.element.index(refNode.element)
|
index = self.element.index(refNode.element)
|
||||||
|
@ -218,6 +363,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
def reparentChildren(self, new_parent):
|
||||||
"""Move all of this tag's children into another tag."""
|
"""Move all of this tag's children into another tag."""
|
||||||
|
# print "MOVE", self.element.contents
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent.element
|
||||||
|
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
# Determine what this tag's next_element will be once all the children
|
# Determine what this tag's next_element will be once all the children
|
||||||
|
@ -236,18 +385,35 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||||
|
|
||||||
to_append = element.contents
|
to_append = element.contents
|
||||||
append_after = new_parent.element.contents
|
|
||||||
if len(to_append) > 0:
|
if len(to_append) > 0:
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
first_child = to_append[0]
|
first_child = to_append[0]
|
||||||
|
if new_parents_last_descendant is not None:
|
||||||
first_child.previous_element = new_parents_last_descendant
|
first_child.previous_element = new_parents_last_descendant
|
||||||
|
else:
|
||||||
|
first_child.previous_element = new_parent_element
|
||||||
first_child.previous_sibling = new_parents_last_child
|
first_child.previous_sibling = new_parents_last_child
|
||||||
|
if new_parents_last_descendant is not None:
|
||||||
|
new_parents_last_descendant.next_element = first_child
|
||||||
|
else:
|
||||||
|
new_parent_element.next_element = first_child
|
||||||
|
if new_parents_last_child is not None:
|
||||||
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Fix the last child's next_element and next_sibling
|
# Find the very last element being moved. It is now the
|
||||||
last_child = to_append[-1]
|
# parent's last descendant. It has no .next_sibling and
|
||||||
last_child.next_element = new_parents_last_descendant_next_element
|
# its .next_element is whatever the previous last
|
||||||
last_child.next_sibling = None
|
# descendant had.
|
||||||
|
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||||
|
|
||||||
|
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||||
|
if new_parents_last_descendant_next_element is not None:
|
||||||
|
# TODO: This code has no test coverage and I'm not sure
|
||||||
|
# how to get html5lib to go through this path, but it's
|
||||||
|
# just the other side of the previous line.
|
||||||
|
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||||
|
last_childs_last_descendant.next_sibling = None
|
||||||
|
|
||||||
for child in to_append:
|
for child in to_append:
|
||||||
child.parent = new_parent_element
|
child.parent = new_parent_element
|
||||||
|
@ -257,6 +423,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
element.contents = []
|
element.contents = []
|
||||||
element.next_element = final_next_element
|
element.next_element = final_next_element
|
||||||
|
|
||||||
|
# print "DONE WITH MOVE"
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent_element
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||||
node = Element(tag, self.soup, self.namespace)
|
node = Element(tag, self.soup, self.namespace)
|
||||||
|
@ -268,7 +438,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
return self.element.contents
|
return self.element.contents
|
||||||
|
|
||||||
def getNameTuple(self):
|
def getNameTuple(self):
|
||||||
if self.namespace is None:
|
if self.namespace == None:
|
||||||
return namespaces["html"], self.name
|
return namespaces["html"], self.name
|
||||||
else:
|
else:
|
||||||
return self.namespace, self.name
|
return self.namespace, self.name
|
||||||
|
@ -277,7 +447,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
class TextNode(Element):
|
class TextNode(Element):
|
||||||
def __init__(self, element, soup):
|
def __init__(self, element, soup):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, None)
|
treebuilder_base.Node.__init__(self, None)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,23 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from HTMLParser import (
|
from html.parser import HTMLParser
|
||||||
HTMLParser,
|
|
||||||
HTMLParseError,
|
try:
|
||||||
)
|
from html.parser import HTMLParseError
|
||||||
|
except ImportError as e:
|
||||||
|
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||||
|
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||||
|
class HTMLParseError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
@ -19,10 +29,10 @@ import warnings
|
||||||
# At the end of this file, we monkeypatch HTMLParser so that
|
# At the end of this file, we monkeypatch HTMLParser so that
|
||||||
# strict=True works well on Python 3.2.2.
|
# strict=True works well on Python 3.2.2.
|
||||||
major, minor, release = sys.version_info[:3]
|
major, minor, release = sys.version_info[:3]
|
||||||
CONSTRUCTOR_TAKES_STRICT = (
|
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||||
major > 3
|
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||||
or (major == 3 and minor > 2)
|
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||||
or (major == 3 and minor == 2 and release >= 3))
|
|
||||||
|
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CData,
|
CData,
|
||||||
|
@ -43,7 +53,42 @@ from bs4.builder import (
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
def handle_starttag(self, name, attrs):
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
|
# Keep a list of empty-element tags that were encountered
|
||||||
|
# without an explicit closing tag. If we encounter a closing tag
|
||||||
|
# of this type, we'll associate it with one of those entries.
|
||||||
|
#
|
||||||
|
# This isn't a stack because we don't care about the
|
||||||
|
# order. It's a list of closing tags we've already handled and
|
||||||
|
# will ignore, assuming they ever show up.
|
||||||
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
|
def error(self, msg):
|
||||||
|
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
||||||
|
requirement doesn't appear to be documented.
|
||||||
|
|
||||||
|
In Python 2, HTMLParser implements error() as raising an exception.
|
||||||
|
|
||||||
|
In any event, this method is called only on very strange markup and our best strategy
|
||||||
|
is to pretend it didn't happen and keep going.
|
||||||
|
"""
|
||||||
|
warnings.warn(msg)
|
||||||
|
|
||||||
|
def handle_startendtag(self, name, attrs):
|
||||||
|
# This is only called when the markup looks like
|
||||||
|
# <tag/>.
|
||||||
|
|
||||||
|
# is_startend() tells handle_starttag not to close the tag
|
||||||
|
# just because its name matches a known empty-element tag. We
|
||||||
|
# know that this is an empty-element tag and we want to call
|
||||||
|
# handle_endtag ourselves.
|
||||||
|
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||||
|
self.handle_endtag(name)
|
||||||
|
|
||||||
|
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||||
# XXX namespace
|
# XXX namespace
|
||||||
attr_dict = {}
|
attr_dict = {}
|
||||||
for key, value in attrs:
|
for key, value in attrs:
|
||||||
|
@ -53,9 +98,37 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
value = ''
|
value = ''
|
||||||
attr_dict[key] = value
|
attr_dict[key] = value
|
||||||
attrvalue = '""'
|
attrvalue = '""'
|
||||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
#print "START", name
|
||||||
|
sourceline, sourcepos = self.getpos()
|
||||||
|
tag = self.soup.handle_starttag(
|
||||||
|
name, None, None, attr_dict, sourceline=sourceline,
|
||||||
|
sourcepos=sourcepos
|
||||||
|
)
|
||||||
|
if tag and tag.is_empty_element and handle_empty_element:
|
||||||
|
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||||
|
# events for empty-element tags. (It's handled in
|
||||||
|
# handle_startendtag, but only if the original markup looked like
|
||||||
|
# <tag/>.)
|
||||||
|
#
|
||||||
|
# So we need to call handle_endtag() ourselves. Since we
|
||||||
|
# know the start event is identical to the end event, we
|
||||||
|
# don't want handle_endtag() to cross off any previous end
|
||||||
|
# events for tags of this name.
|
||||||
|
self.handle_endtag(name, check_already_closed=False)
|
||||||
|
|
||||||
def handle_endtag(self, name):
|
# But we might encounter an explicit closing tag for this tag
|
||||||
|
# later on. If so, we want to ignore it.
|
||||||
|
self.already_closed_empty_element.append(name)
|
||||||
|
|
||||||
|
def handle_endtag(self, name, check_already_closed=True):
|
||||||
|
#print "END", name
|
||||||
|
if check_already_closed and name in self.already_closed_empty_element:
|
||||||
|
# This is a redundant end tag for an empty-element tag.
|
||||||
|
# We've already called handle_endtag() for it, so just
|
||||||
|
# check it off the list.
|
||||||
|
# print "ALREADY CLOSED", name
|
||||||
|
self.already_closed_empty_element.remove(name)
|
||||||
|
else:
|
||||||
self.soup.handle_endtag(name)
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
@ -63,7 +136,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||||
# it's fixed.
|
# it's fixed in all supported versions.
|
||||||
|
# http://bugs.python.org/issue13633
|
||||||
if name.startswith('x'):
|
if name.startswith('x'):
|
||||||
real_name = int(name.lstrip('x'), 16)
|
real_name = int(name.lstrip('x'), 16)
|
||||||
elif name.startswith('X'):
|
elif name.startswith('X'):
|
||||||
|
@ -71,11 +145,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
else:
|
else:
|
||||||
real_name = int(name)
|
real_name = int(name)
|
||||||
|
|
||||||
|
data = None
|
||||||
|
if real_name < 256:
|
||||||
|
# HTML numeric entities are supposed to reference Unicode
|
||||||
|
# code points, but sometimes they reference code points in
|
||||||
|
# some other encoding (ahem, Windows-1252). E.g. “
|
||||||
|
# instead of É for LEFT DOUBLE QUOTATION MARK. This
|
||||||
|
# code tries to detect this situation and compensate.
|
||||||
|
for encoding in (self.soup.original_encoding, 'windows-1252'):
|
||||||
|
if not encoding:
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
data = unichr(real_name)
|
data = bytearray([real_name]).decode(encoding)
|
||||||
except (ValueError, OverflowError), e:
|
except UnicodeDecodeError as e:
|
||||||
data = u"\N{REPLACEMENT CHARACTER}"
|
pass
|
||||||
|
if not data:
|
||||||
|
try:
|
||||||
|
data = chr(real_name)
|
||||||
|
except (ValueError, OverflowError) as e:
|
||||||
|
pass
|
||||||
|
data = data or "\N{REPLACEMENT CHARACTER}"
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
@ -83,7 +172,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
if character is not None:
|
if character is not None:
|
||||||
data = character
|
data = character
|
||||||
else:
|
else:
|
||||||
data = "&%s;" % name
|
# If this were XML, it would be ambiguous whether "&foo"
|
||||||
|
# was an character entity reference with a missing
|
||||||
|
# semicolon or the literal string "&foo". Since this is
|
||||||
|
# HTML, we have a complete list of all character entity references,
|
||||||
|
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||||
|
data = "&%s" % name
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
|
@ -113,14 +207,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_pi(self, data):
|
def handle_pi(self, data):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
if data.endswith("?") and data.lower().startswith("xml"):
|
|
||||||
# "An XHTML processing instruction using the trailing '?'
|
|
||||||
# will cause the '?' to be included in data." - HTMLParser
|
|
||||||
# docs.
|
|
||||||
#
|
|
||||||
# Strip the question mark so we don't end up with two
|
|
||||||
# question marks.
|
|
||||||
data = data[:-1]
|
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(ProcessingInstruction)
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
|
@ -128,26 +214,38 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
features = [HTML, STRICT, HTMLPARSER]
|
picklable = True
|
||||||
|
NAME = HTMLPARSER
|
||||||
|
features = [NAME, HTML, STRICT]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
# The html.parser knows which line number and position in the
|
||||||
if CONSTRUCTOR_TAKES_STRICT:
|
# original file is the source of an element.
|
||||||
kwargs['strict'] = False
|
TRACKS_LINE_NUMBERS = True
|
||||||
self.parser_args = (args, kwargs)
|
|
||||||
|
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||||
|
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||||
|
parser_args = parser_args or []
|
||||||
|
parser_kwargs = parser_kwargs or {}
|
||||||
|
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||||
|
parser_kwargs['strict'] = False
|
||||||
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
|
parser_kwargs['convert_charrefs'] = False
|
||||||
|
self.parser_args = (parser_args, parser_kwargs)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
"""
|
"""
|
||||||
:return: A 4-tuple (markup, original encoding, encoding
|
:return: A 4-tuple (markup, original encoding, encoding
|
||||||
declared within markup, whether any characters had to be
|
declared within markup, whether any characters had to be
|
||||||
replaced with REPLACEMENT CHARACTER).
|
replaced with REPLACEMENT CHARACTER).
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
return
|
return
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||||
|
exclude_encodings=exclude_encodings)
|
||||||
yield (dammit.markup, dammit.original_encoding,
|
yield (dammit.markup, dammit.original_encoding,
|
||||||
dammit.declared_html_encoding,
|
dammit.declared_html_encoding,
|
||||||
dammit.contains_replacement_characters)
|
dammit.contains_replacement_characters)
|
||||||
|
@ -158,10 +256,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
try:
|
try:
|
||||||
parser.feed(markup)
|
parser.feed(markup)
|
||||||
except HTMLParseError, e:
|
parser.close()
|
||||||
|
except HTMLParseError as e:
|
||||||
warnings.warn(RuntimeWarning(
|
warnings.warn(RuntimeWarning(
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
raise e
|
raise e
|
||||||
|
parser.already_closed_empty_element = []
|
||||||
|
|
||||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||||
|
|
|
@ -1,13 +1,26 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'LXMLTreeBuilderForXML',
|
'LXMLTreeBuilderForXML',
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections.abc import Callable # Python 3.6
|
||||||
|
except ImportError as e:
|
||||||
|
from collections import Callable
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
import collections
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
from bs4.element import (
|
||||||
|
Comment,
|
||||||
|
Doctype,
|
||||||
|
NamespacedAttribute,
|
||||||
|
ProcessingInstruction,
|
||||||
|
XMLProcessingInstruction,
|
||||||
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
FAST,
|
FAST,
|
||||||
HTML,
|
HTML,
|
||||||
|
@ -20,19 +33,55 @@ from bs4.dammit import EncodingDetector
|
||||||
|
|
||||||
LXML = 'lxml'
|
LXML = 'lxml'
|
||||||
|
|
||||||
|
def _invert(d):
|
||||||
|
"Invert a dictionary."
|
||||||
|
return dict((v,k) for k, v in list(d.items()))
|
||||||
|
|
||||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||||
|
|
||||||
is_xml = True
|
is_xml = True
|
||||||
|
processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
|
NAME = "lxml-xml"
|
||||||
|
ALTERNATE_NAMES = ["xml"]
|
||||||
|
|
||||||
# Well, it's permissive by XML parser standards.
|
# Well, it's permissive by XML parser standards.
|
||||||
features = [LXML, XML, FAST, PERMISSIVE]
|
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||||
|
|
||||||
CHUNK_SIZE = 512
|
CHUNK_SIZE = 512
|
||||||
|
|
||||||
# This namespace mapping is specified in the XML Namespace
|
# This namespace mapping is specified in the XML Namespace
|
||||||
# standard.
|
# standard.
|
||||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
|
||||||
|
|
||||||
|
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
# NOTE: If we parsed Element objects and looked at .sourceline,
|
||||||
|
# we'd be able to see the line numbers from the original document.
|
||||||
|
# But instead we build an XMLParser or HTMLParser object to serve
|
||||||
|
# as the target of parse messages, and those messages don't include
|
||||||
|
# line numbers.
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""Let the BeautifulSoup object know about the standard namespace
|
||||||
|
mapping.
|
||||||
|
"""
|
||||||
|
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||||
|
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
def _register_namespaces(self, mapping):
|
||||||
|
"""Let the BeautifulSoup object know about namespaces encountered
|
||||||
|
while parsing the document.
|
||||||
|
|
||||||
|
This might be useful later on when creating CSS selectors.
|
||||||
|
"""
|
||||||
|
for key, value in list(mapping.items()):
|
||||||
|
if key and key not in self.soup._namespaces:
|
||||||
|
# Let the BeautifulSoup object know about a new namespace.
|
||||||
|
# If there are multiple namespaces defined with the same
|
||||||
|
# prefix, the first one in the document takes precedence.
|
||||||
|
self.soup._namespaces[key] = value
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
# This can either return a parser object or a class, which
|
# This can either return a parser object or a class, which
|
||||||
|
@ -46,12 +95,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# Use the default parser.
|
# Use the default parser.
|
||||||
parser = self.default_parser(encoding)
|
parser = self.default_parser(encoding)
|
||||||
|
|
||||||
if isinstance(parser, collections.Callable):
|
if isinstance(parser, Callable):
|
||||||
# Instantiate the parser with default arguments
|
# Instantiate the parser with default arguments
|
||||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def __init__(self, parser=None, empty_element_tags=None):
|
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
||||||
# TODO: Issue a warning if parser is present but not a
|
# TODO: Issue a warning if parser is present but not a
|
||||||
# callable, since that means there's no way to create new
|
# callable, since that means there's no way to create new
|
||||||
# parsers for different encodings.
|
# parsers for different encodings.
|
||||||
|
@ -59,7 +108,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if empty_element_tags is not None:
|
if empty_element_tags is not None:
|
||||||
self.empty_element_tags = set(empty_element_tags)
|
self.empty_element_tags = set(empty_element_tags)
|
||||||
self.soup = None
|
self.soup = None
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||||
|
|
||||||
def _getNsTag(self, tag):
|
def _getNsTag(self, tag):
|
||||||
# Split the namespace URL out of a fully-qualified lxml tag
|
# Split the namespace URL out of a fully-qualified lxml tag
|
||||||
|
@ -70,6 +120,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
return (None, tag)
|
return (None, tag)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
|
exclude_encodings=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None):
|
||||||
"""
|
"""
|
||||||
:yield: A series of 4-tuples.
|
:yield: A series of 4-tuples.
|
||||||
|
@ -78,31 +129,37 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
Each 4-tuple represents a strategy for parsing the document.
|
Each 4-tuple represents a strategy for parsing the document.
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, unicode):
|
|
||||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
|
||||||
# this system?
|
|
||||||
yield markup, None, document_declared_encoding, False
|
|
||||||
|
|
||||||
if isinstance(markup, unicode):
|
|
||||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
|
||||||
# tell lxml to parse it as UTF-8.
|
|
||||||
yield (markup.encode("utf8"), "utf8",
|
|
||||||
document_declared_encoding, False)
|
|
||||||
|
|
||||||
# Instead of using UnicodeDammit to convert the bytestring to
|
# Instead of using UnicodeDammit to convert the bytestring to
|
||||||
# Unicode using different encodings, use EncodingDetector to
|
# Unicode using different encodings, use EncodingDetector to
|
||||||
# iterate over the encodings, and tell lxml to try to parse
|
# iterate over the encodings, and tell lxml to try to parse
|
||||||
# the document as each one in turn.
|
# the document as each one in turn.
|
||||||
is_html = not self.is_xml
|
is_html = not self.is_xml
|
||||||
|
if is_html:
|
||||||
|
self.processing_instruction_class = ProcessingInstruction
|
||||||
|
else:
|
||||||
|
self.processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
|
if isinstance(markup, str):
|
||||||
|
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||||
|
# this system?
|
||||||
|
yield markup, None, document_declared_encoding, False
|
||||||
|
|
||||||
|
if isinstance(markup, str):
|
||||||
|
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||||
|
# tell lxml to parse it as UTF-8.
|
||||||
|
yield (markup.encode("utf8"), "utf8",
|
||||||
|
document_declared_encoding, False)
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
detector = EncodingDetector(markup, try_encodings, is_html)
|
detector = EncodingDetector(
|
||||||
|
markup, try_encodings, is_html, exclude_encodings)
|
||||||
for encoding in detector.encodings:
|
for encoding in detector.encodings:
|
||||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
if isinstance(markup, bytes):
|
if isinstance(markup, bytes):
|
||||||
markup = BytesIO(markup)
|
markup = BytesIO(markup)
|
||||||
elif isinstance(markup, unicode):
|
elif isinstance(markup, str):
|
||||||
markup = StringIO(markup)
|
markup = StringIO(markup)
|
||||||
|
|
||||||
# Call feed() at least once, even if the markup is empty,
|
# Call feed() at least once, even if the markup is empty,
|
||||||
|
@ -117,30 +174,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if len(data) != 0:
|
if len(data) != 0:
|
||||||
self.parser.feed(data)
|
self.parser.feed(data)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(e)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
|
||||||
def start(self, name, attrs, nsmap={}):
|
def start(self, name, attrs, nsmap={}):
|
||||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
nsprefix = None
|
nsprefix = None
|
||||||
# Invert each namespace map as it comes in.
|
# Invert each namespace map as it comes in.
|
||||||
if len(self.nsmaps) > 1:
|
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||||
# There are no new namespaces for this tag, but
|
# There are no new namespaces for this tag, but
|
||||||
# non-default namespaces are in play, so we need a
|
# non-default namespaces are in play, so we need a
|
||||||
# separate tag stack to know when they end.
|
# separate tag stack to know when they end.
|
||||||
self.nsmaps.append(None)
|
self.nsmaps.append(None)
|
||||||
elif len(nsmap) > 0:
|
elif len(nsmap) > 0:
|
||||||
# A new namespace mapping has come into play.
|
# A new namespace mapping has come into play.
|
||||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
|
||||||
self.nsmaps.append(inverted_nsmap)
|
# First, Let the BeautifulSoup object know about it.
|
||||||
|
self._register_namespaces(nsmap)
|
||||||
|
|
||||||
|
# Then, add it to our running list of inverted namespace
|
||||||
|
# mappings.
|
||||||
|
self.nsmaps.append(_invert(nsmap))
|
||||||
|
|
||||||
# Also treat the namespace mapping as a set of attributes on the
|
# Also treat the namespace mapping as a set of attributes on the
|
||||||
# tag, so we can recreate it later.
|
# tag, so we can recreate it later.
|
||||||
attrs = attrs.copy()
|
attrs = attrs.copy()
|
||||||
for prefix, namespace in nsmap.items():
|
for prefix, namespace in list(nsmap.items()):
|
||||||
attribute = NamespacedAttribute(
|
attribute = NamespacedAttribute(
|
||||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||||
attrs[attribute] = namespace
|
attrs[attribute] = namespace
|
||||||
|
@ -149,7 +212,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# from lxml with namespaces attached to their names, and
|
# from lxml with namespaces attached to their names, and
|
||||||
# turn then into NamespacedAttribute objects.
|
# turn then into NamespacedAttribute objects.
|
||||||
new_attrs = {}
|
new_attrs = {}
|
||||||
for attr, value in attrs.items():
|
for attr, value in list(attrs.items()):
|
||||||
namespace, attr = self._getNsTag(attr)
|
namespace, attr = self._getNsTag(attr)
|
||||||
if namespace is None:
|
if namespace is None:
|
||||||
new_attrs[attr] = value
|
new_attrs[attr] = value
|
||||||
|
@ -189,7 +252,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
self.nsmaps.pop()
|
self.nsmaps.pop()
|
||||||
|
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
pass
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(target + ' ' + data)
|
||||||
|
self.soup.endData(self.processing_instruction_class)
|
||||||
|
|
||||||
def data(self, content):
|
def data(self, content):
|
||||||
self.soup.handle_data(content)
|
self.soup.handle_data(content)
|
||||||
|
@ -207,13 +272,17 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||||
|
|
||||||
|
|
||||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
|
||||||
features = [LXML, HTML, FAST, PERMISSIVE]
|
NAME = LXML
|
||||||
|
ALTERNATE_NAMES = ["lxml-html"]
|
||||||
|
|
||||||
|
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
processing_instruction_class = ProcessingInstruction
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
return etree.HTMLParser
|
return etree.HTMLParser
|
||||||
|
@ -224,10 +293,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
self.parser = self.parser_for(encoding)
|
self.parser = self.parser_for(encoding)
|
||||||
self.parser.feed(markup)
|
self.parser.feed(markup)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(e)
|
||||||
|
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><body>%s</body></html>' % fragment
|
return '<html><body>%s</body></html>' % fragment
|
||||||
|
|
4
lib/bs4/check_block.py
Normal file
4
lib/bs4/check_block.py
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
import requests
|
||||||
|
data = requests.get("https://www.crummy.com/").content
|
||||||
|
from bs4 import _s
|
||||||
|
data = [x for x in _s(data).block_text()]
|
|
@ -3,12 +3,14 @@
|
||||||
|
|
||||||
This library converts a bytestream to Unicode through any means
|
This library converts a bytestream to Unicode through any means
|
||||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||||
Feed Parser. It works best on XML and XML, but it does not rewrite the
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||||
"""
|
"""
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
from htmlentitydefs import codepoint2name
|
from html.entities import codepoint2name
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
|
@ -20,6 +22,8 @@ try:
|
||||||
# PyPI package: cchardet
|
# PyPI package: cchardet
|
||||||
import cchardet
|
import cchardet
|
||||||
def chardet_dammit(s):
|
def chardet_dammit(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return None
|
||||||
return cchardet.detect(s)['encoding']
|
return cchardet.detect(s)['encoding']
|
||||||
except ImportError:
|
except ImportError:
|
||||||
try:
|
try:
|
||||||
|
@ -28,6 +32,8 @@ except ImportError:
|
||||||
# PyPI package: chardet
|
# PyPI package: chardet
|
||||||
import chardet
|
import chardet
|
||||||
def chardet_dammit(s):
|
def chardet_dammit(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return None
|
||||||
return chardet.detect(s)['encoding']
|
return chardet.detect(s)['encoding']
|
||||||
#import chardet.constants
|
#import chardet.constants
|
||||||
#chardet.constants._debug = 1
|
#chardet.constants._debug = 1
|
||||||
|
@ -42,10 +48,19 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
xml_encoding_re = re.compile(
|
# Build bytestring and Unicode versions of regular expressions for finding
|
||||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
# a declared encoding inside an XML or HTML document.
|
||||||
html_meta_re = re.compile(
|
xml_encoding = '^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
|
||||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
|
||||||
|
encoding_res = dict()
|
||||||
|
encoding_res[bytes] = {
|
||||||
|
'html' : re.compile(html_meta.encode("ascii"), re.I),
|
||||||
|
'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
|
||||||
|
}
|
||||||
|
encoding_res[str] = {
|
||||||
|
'html' : re.compile(html_meta, re.I),
|
||||||
|
'xml' : re.compile(xml_encoding, re.I)
|
||||||
|
}
|
||||||
|
|
||||||
class EntitySubstitution(object):
|
class EntitySubstitution(object):
|
||||||
|
|
||||||
|
@ -55,15 +70,24 @@ class EntitySubstitution(object):
|
||||||
lookup = {}
|
lookup = {}
|
||||||
reverse_lookup = {}
|
reverse_lookup = {}
|
||||||
characters_for_re = []
|
characters_for_re = []
|
||||||
for codepoint, name in list(codepoint2name.items()):
|
|
||||||
character = unichr(codepoint)
|
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
|
||||||
if codepoint != 34:
|
# entity. We don't want to use it, but we want to recognize it on the way in.
|
||||||
|
#
|
||||||
|
# TODO: Ideally we would be able to recognize all HTML 5 named
|
||||||
|
# entities, but that's a little tricky.
|
||||||
|
extra = [(39, 'apos')]
|
||||||
|
for codepoint, name in list(codepoint2name.items()) + extra:
|
||||||
|
character = chr(codepoint)
|
||||||
|
if codepoint not in (34, 39):
|
||||||
# There's no point in turning the quotation mark into
|
# There's no point in turning the quotation mark into
|
||||||
# ", unless it happens within an attribute value, which
|
# " or the single quote into ', unless it
|
||||||
# is handled elsewhere.
|
# happens within an attribute value, which is handled
|
||||||
|
# elsewhere.
|
||||||
characters_for_re.append(character)
|
characters_for_re.append(character)
|
||||||
lookup[character] = name
|
lookup[character] = name
|
||||||
# But we do want to turn " into the quotation mark.
|
# But we do want to recognize those entities on the way in and
|
||||||
|
# convert them to Unicode characters.
|
||||||
reverse_lookup[name] = character
|
reverse_lookup[name] = character
|
||||||
re_definition = "[%s]" % "".join(characters_for_re)
|
re_definition = "[%s]" % "".join(characters_for_re)
|
||||||
return lookup, reverse_lookup, re.compile(re_definition)
|
return lookup, reverse_lookup, re.compile(re_definition)
|
||||||
|
@ -79,7 +103,7 @@ class EntitySubstitution(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
|
||||||
")")
|
")")
|
||||||
|
|
||||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||||
|
@ -212,8 +236,11 @@ class EncodingDetector:
|
||||||
|
|
||||||
5. Windows-1252.
|
5. Windows-1252.
|
||||||
"""
|
"""
|
||||||
def __init__(self, markup, override_encodings=None, is_html=False):
|
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||||
|
exclude_encodings=None):
|
||||||
self.override_encodings = override_encodings or []
|
self.override_encodings = override_encodings or []
|
||||||
|
exclude_encodings = exclude_encodings or []
|
||||||
|
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||||
self.chardet_encoding = None
|
self.chardet_encoding = None
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
self.declared_encoding = None
|
self.declared_encoding = None
|
||||||
|
@ -224,6 +251,8 @@ class EncodingDetector:
|
||||||
def _usable(self, encoding, tried):
|
def _usable(self, encoding, tried):
|
||||||
if encoding is not None:
|
if encoding is not None:
|
||||||
encoding = encoding.lower()
|
encoding = encoding.lower()
|
||||||
|
if encoding in self.exclude_encodings:
|
||||||
|
return False
|
||||||
if encoding not in tried:
|
if encoding not in tried:
|
||||||
tried.add(encoding)
|
tried.add(encoding)
|
||||||
return True
|
return True
|
||||||
|
@ -266,6 +295,9 @@ class EncodingDetector:
|
||||||
def strip_byte_order_mark(cls, data):
|
def strip_byte_order_mark(cls, data):
|
||||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||||
encoding = None
|
encoding = None
|
||||||
|
if isinstance(data, str):
|
||||||
|
# Unicode data cannot have a byte-order mark.
|
||||||
|
return data, encoding
|
||||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||||
and (data[2:4] != '\x00\x00'):
|
and (data[2:4] != '\x00\x00'):
|
||||||
encoding = 'utf-16be'
|
encoding = 'utf-16be'
|
||||||
|
@ -300,14 +332,22 @@ class EncodingDetector:
|
||||||
xml_endpos = 1024
|
xml_endpos = 1024
|
||||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||||
|
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
res = encoding_res[bytes]
|
||||||
|
else:
|
||||||
|
res = encoding_res[str]
|
||||||
|
|
||||||
|
xml_re = res['xml']
|
||||||
|
html_re = res['html']
|
||||||
declared_encoding = None
|
declared_encoding = None
|
||||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
|
||||||
if not declared_encoding_match and is_html:
|
if not declared_encoding_match and is_html:
|
||||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
declared_encoding_match = html_re.search(markup, endpos=html_endpos)
|
||||||
if declared_encoding_match is not None:
|
if declared_encoding_match is not None:
|
||||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
declared_encoding = declared_encoding_match.groups()[0]
|
||||||
'ascii')
|
|
||||||
if declared_encoding:
|
if declared_encoding:
|
||||||
|
if isinstance(declared_encoding, bytes):
|
||||||
|
declared_encoding = declared_encoding.decode('ascii', 'replace')
|
||||||
return declared_encoding.lower()
|
return declared_encoding.lower()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -331,18 +371,19 @@ class UnicodeDammit:
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, markup, override_encodings=[],
|
def __init__(self, markup, override_encodings=[],
|
||||||
smart_quotes_to=None, is_html=False):
|
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||||
self.smart_quotes_to = smart_quotes_to
|
self.smart_quotes_to = smart_quotes_to
|
||||||
self.tried_encodings = []
|
self.tried_encodings = []
|
||||||
self.contains_replacement_characters = False
|
self.contains_replacement_characters = False
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
|
self.log = logging.getLogger(__name__)
|
||||||
self.detector = EncodingDetector(markup, override_encodings, is_html)
|
self.detector = EncodingDetector(
|
||||||
|
markup, override_encodings, is_html, exclude_encodings)
|
||||||
|
|
||||||
# Short-circuit if the data is in Unicode to begin with.
|
# Short-circuit if the data is in Unicode to begin with.
|
||||||
if isinstance(markup, unicode) or markup == '':
|
if isinstance(markup, str) or markup == '':
|
||||||
self.markup = markup
|
self.markup = markup
|
||||||
self.unicode_markup = unicode(markup)
|
self.unicode_markup = str(markup)
|
||||||
self.original_encoding = None
|
self.original_encoding = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -365,9 +406,10 @@ class UnicodeDammit:
|
||||||
if encoding != "ascii":
|
if encoding != "ascii":
|
||||||
u = self._convert_from(encoding, "replace")
|
u = self._convert_from(encoding, "replace")
|
||||||
if u is not None:
|
if u is not None:
|
||||||
logging.warning(
|
self.log.warning(
|
||||||
"Some characters could not be decoded, and were "
|
"Some characters could not be decoded, and were "
|
||||||
"replaced with REPLACEMENT CHARACTER.")
|
"replaced with REPLACEMENT CHARACTER."
|
||||||
|
)
|
||||||
self.contains_replacement_characters = True
|
self.contains_replacement_characters = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -425,7 +467,7 @@ class UnicodeDammit:
|
||||||
def _to_unicode(self, data, encoding, errors="strict"):
|
def _to_unicode(self, data, encoding, errors="strict"):
|
||||||
'''Given a string and its encoding, decodes the string into Unicode.
|
'''Given a string and its encoding, decodes the string into Unicode.
|
||||||
%encoding is a string recognized by encodings.aliases'''
|
%encoding is a string recognized by encodings.aliases'''
|
||||||
return unicode(data, encoding, errors)
|
return str(data, encoding, errors)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def declared_html_encoding(self):
|
def declared_html_encoding(self):
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
from HTMLParser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
import bs4
|
import bs4
|
||||||
from bs4 import BeautifulSoup, __version__
|
from bs4 import BeautifulSoup, __version__
|
||||||
from bs4.builder import builder_registry
|
from bs4.builder import builder_registry
|
||||||
|
@ -17,8 +21,8 @@ import cProfile
|
||||||
|
|
||||||
def diagnose(data):
|
def diagnose(data):
|
||||||
"""Diagnostic suite for isolating common problems."""
|
"""Diagnostic suite for isolating common problems."""
|
||||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||||
print "Python version %s" % sys.version
|
print("Python version %s" % sys.version)
|
||||||
|
|
||||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||||
for name in basic_parsers:
|
for name in basic_parsers:
|
||||||
|
@ -27,44 +31,60 @@ def diagnose(data):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
basic_parsers.remove(name)
|
basic_parsers.remove(name)
|
||||||
print (
|
print((
|
||||||
"I noticed that %s is not installed. Installing it may help." %
|
"I noticed that %s is not installed. Installing it may help." %
|
||||||
name)
|
name))
|
||||||
|
|
||||||
if 'lxml' in basic_parsers:
|
if 'lxml' in basic_parsers:
|
||||||
basic_parsers.append(["lxml", "xml"])
|
basic_parsers.append("lxml-xml")
|
||||||
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||||
|
except ImportError as e:
|
||||||
|
print (
|
||||||
|
"lxml is not installed or couldn't be imported.")
|
||||||
|
|
||||||
|
|
||||||
if 'html5lib' in basic_parsers:
|
if 'html5lib' in basic_parsers:
|
||||||
|
try:
|
||||||
import html5lib
|
import html5lib
|
||||||
print "Found html5lib version %s" % html5lib.__version__
|
print("Found html5lib version %s" % html5lib.__version__)
|
||||||
|
except ImportError as e:
|
||||||
|
print (
|
||||||
|
"html5lib is not installed or couldn't be imported.")
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif os.path.exists(data):
|
|
||||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
|
||||||
data = open(data).read()
|
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||||
return
|
return
|
||||||
print
|
else:
|
||||||
|
try:
|
||||||
|
if os.path.exists(data):
|
||||||
|
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||||
|
with open(data) as fp:
|
||||||
|
data = fp.read()
|
||||||
|
except ValueError:
|
||||||
|
# This can happen on some platforms when the 'filename' is
|
||||||
|
# too long. Assume it's data and not a filename.
|
||||||
|
pass
|
||||||
|
print()
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print "Trying to parse your markup with %s" % parser
|
print("Trying to parse your markup with %s" % parser)
|
||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, features=parser)
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "Here's what %s did with the markup:" % parser
|
print("Here's what %s did with the markup:" % parser)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
||||||
print "-" * 80
|
print("-" * 80)
|
||||||
|
|
||||||
def lxml_trace(data, html=True, **kwargs):
|
def lxml_trace(data, html=True, **kwargs):
|
||||||
"""Print out the lxml events that occur during parsing.
|
"""Print out the lxml events that occur during parsing.
|
||||||
|
@ -74,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
|
||||||
"""
|
"""
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||||
|
|
||||||
class AnnouncingParser(HTMLParser):
|
class AnnouncingParser(HTMLParser):
|
||||||
"""Announces HTMLParser parse events, without doing anything else."""
|
"""Announces HTMLParser parse events, without doing anything else."""
|
||||||
|
@ -156,9 +176,9 @@ def rdoc(num_elements=1000):
|
||||||
|
|
||||||
def benchmark_parsers(num_elements=100000):
|
def benchmark_parsers(num_elements=100000):
|
||||||
"""Very basic head-to-head performance benchmark."""
|
"""Very basic head-to-head performance benchmark."""
|
||||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||||
data = rdoc(num_elements)
|
data = rdoc(num_elements)
|
||||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||||
|
|
||||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||||
success = False
|
success = False
|
||||||
|
@ -167,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, parser)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
a = time.time()
|
a = time.time()
|
||||||
etree.HTML(data)
|
etree.HTML(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
parser = html5lib.HTMLParser()
|
parser = html5lib.HTMLParser()
|
||||||
a = time.time()
|
a = time.time()
|
||||||
parser.parse(data)
|
parser.parse(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
def profile(num_elements=100000, parser="lxml"):
|
def profile(num_elements=100000, parser="lxml"):
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
99
lib/bs4/formatter.py
Normal file
99
lib/bs4/formatter.py
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
from bs4.dammit import EntitySubstitution
|
||||||
|
|
||||||
|
class Formatter(EntitySubstitution):
|
||||||
|
"""Describes a strategy to use when outputting a parse tree to a string.
|
||||||
|
|
||||||
|
Some parts of this strategy come from the distinction between
|
||||||
|
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||||
|
"""
|
||||||
|
# Registries of XML and HTML formatters.
|
||||||
|
XML_FORMATTERS = {}
|
||||||
|
HTML_FORMATTERS = {}
|
||||||
|
|
||||||
|
HTML = 'html'
|
||||||
|
XML = 'xml'
|
||||||
|
|
||||||
|
HTML_DEFAULTS = dict(
|
||||||
|
cdata_containing_tags=set(["script", "style"]),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _default(self, language, value, kwarg):
|
||||||
|
if value is not None:
|
||||||
|
return value
|
||||||
|
if language == self.XML:
|
||||||
|
return set()
|
||||||
|
return self.HTML_DEFAULTS[kwarg]
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, language=None, entity_substitution=None,
|
||||||
|
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param void_element_close_prefix: By default, represent void
|
||||||
|
elements as <tag/> rather than <tag>
|
||||||
|
"""
|
||||||
|
self.language = language
|
||||||
|
self.entity_substitution = entity_substitution
|
||||||
|
self.void_element_close_prefix = void_element_close_prefix
|
||||||
|
self.cdata_containing_tags = self._default(
|
||||||
|
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||||
|
)
|
||||||
|
|
||||||
|
def substitute(self, ns):
|
||||||
|
"""Process a string that needs to undergo entity substitution."""
|
||||||
|
if not self.entity_substitution:
|
||||||
|
return ns
|
||||||
|
from .element import NavigableString
|
||||||
|
if (isinstance(ns, NavigableString)
|
||||||
|
and ns.parent is not None
|
||||||
|
and ns.parent.name in self.cdata_containing_tags):
|
||||||
|
# Do nothing.
|
||||||
|
return ns
|
||||||
|
# Substitute.
|
||||||
|
return self.entity_substitution(ns)
|
||||||
|
|
||||||
|
def attribute_value(self, value):
|
||||||
|
"""Process the value of an attribute."""
|
||||||
|
return self.substitute(value)
|
||||||
|
|
||||||
|
def attributes(self, tag):
|
||||||
|
"""Reorder a tag's attributes however you want."""
|
||||||
|
return sorted(tag.attrs.items())
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLFormatter(Formatter):
|
||||||
|
REGISTRY = {}
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class XMLFormatter(Formatter):
|
||||||
|
REGISTRY = {}
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# Set up aliases for the default formatters.
|
||||||
|
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html,
|
||||||
|
void_element_close_prefix = None
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_xml
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
|
||||||
|
entity_substitution=None
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY["html"] = XMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_xml
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY[None] = Formatter(
|
||||||
|
Formatter(Formatter.XML, entity_substitution=None)
|
||||||
|
)
|
|
@ -1,5 +1,10 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Helper classes for tests."""
|
"""Helper classes for tests."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
|
import pickle
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -11,29 +16,66 @@ from bs4.element import (
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
Doctype,
|
Doctype,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
Tag
|
||||||
)
|
)
|
||||||
|
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
default_builder = HTMLParserTreeBuilder
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
BAD_DOCUMENT = """A bare string
|
||||||
|
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
|
||||||
|
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
|
||||||
|
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
|
||||||
|
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
|
||||||
|
<div>A <meta> tag</div>
|
||||||
|
<div>A <br> tag that supposedly has contents.</br></div>
|
||||||
|
<div>AT&T</div>
|
||||||
|
<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
|
||||||
|
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
|
||||||
|
<div>This numeric entity is missing the final semicolon: <x t="piñata"></div>
|
||||||
|
<div><a href="http://example.com/</a> that attribute value never got closed</div>
|
||||||
|
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
|
||||||
|
<! This document starts with a bogus declaration ><div>a</div>
|
||||||
|
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
|
||||||
|
<div>This document ends with <!an incomplete declaration
|
||||||
|
<div><a style={height:21px;}>That attribute value was bogus</a></div>
|
||||||
|
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
|
||||||
|
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
|
||||||
|
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
|
||||||
|
<div>This document ends before the entity finishes: >
|
||||||
|
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
|
||||||
|
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
|
||||||
|
<div><table><tr><td>Here's a table</td></tr></table></div>
|
||||||
|
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
|
||||||
|
<div>This tag contains nothing but whitespace: <b> </b></div>
|
||||||
|
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
|
||||||
|
<div><table><div>This table contains bare markup</div></table></div>
|
||||||
|
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
|
||||||
|
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
|
||||||
|
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
|
||||||
|
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
|
||||||
|
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class SoupTest(unittest.TestCase):
|
class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return default_builder()
|
return default_builder
|
||||||
|
|
||||||
def soup(self, markup, **kwargs):
|
def soup(self, markup, **kwargs):
|
||||||
"""Build a Beautiful Soup object from markup."""
|
"""Build a Beautiful Soup object from markup."""
|
||||||
builder = kwargs.pop('builder', self.default_builder)
|
builder = kwargs.pop('builder', self.default_builder)
|
||||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||||
|
|
||||||
def document_for(self, markup):
|
def document_for(self, markup, **kwargs):
|
||||||
"""Turn an HTML fragment into a document.
|
"""Turn an HTML fragment into a document.
|
||||||
|
|
||||||
The details depend on the builder.
|
The details depend on the builder.
|
||||||
"""
|
"""
|
||||||
return self.default_builder.test_fragment_to_document(markup)
|
return self.default_builder(**kwargs).test_fragment_to_document(markup)
|
||||||
|
|
||||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||||
builder = self.default_builder
|
builder = self.default_builder
|
||||||
|
@ -43,6 +85,131 @@ class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||||
|
|
||||||
|
def assertConnectedness(self, element):
|
||||||
|
"""Ensure that next_element and previous_element are properly
|
||||||
|
set for all descendants of the given element.
|
||||||
|
"""
|
||||||
|
earlier = None
|
||||||
|
for e in element.descendants:
|
||||||
|
if earlier:
|
||||||
|
self.assertEqual(e, earlier.next_element)
|
||||||
|
self.assertEqual(earlier, e.previous_element)
|
||||||
|
earlier = e
|
||||||
|
|
||||||
|
def linkage_validator(self, el, _recursive_call=False):
|
||||||
|
"""Ensure proper linkage throughout the document."""
|
||||||
|
descendant = None
|
||||||
|
# Document element should have no previous element or previous sibling.
|
||||||
|
# It also shouldn't have a next sibling.
|
||||||
|
if el.parent is None:
|
||||||
|
assert el.previous_element is None,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_element, None
|
||||||
|
)
|
||||||
|
assert el.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_sibling, None
|
||||||
|
)
|
||||||
|
assert el.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
idx = 0
|
||||||
|
child = None
|
||||||
|
last_child = None
|
||||||
|
last_idx = len(el.contents) - 1
|
||||||
|
for child in el.contents:
|
||||||
|
descendant = None
|
||||||
|
|
||||||
|
# Parent should link next element to their first child
|
||||||
|
# That child should have no previous sibling
|
||||||
|
if idx == 0:
|
||||||
|
if el.parent is not None:
|
||||||
|
assert el.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_element, child
|
||||||
|
)
|
||||||
|
assert child.previous_element is el,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_element, el
|
||||||
|
)
|
||||||
|
assert child.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# If not the first child, previous index should link as sibling to this index
|
||||||
|
# Previous element should match the last index or the last bubbled up descendant
|
||||||
|
else:
|
||||||
|
assert child.previous_sibling is el.contents[idx - 1],\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
|
||||||
|
child, child.previous_sibling, el.contents[idx - 1]
|
||||||
|
)
|
||||||
|
assert el.contents[idx - 1].next_sibling is child,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if last_child is not None:
|
||||||
|
assert child.previous_element is last_child,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
|
||||||
|
child, child.previous_element, last_child, child.parent.contents
|
||||||
|
)
|
||||||
|
assert last_child.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
last_child, last_child.next_element, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = self.linkage_validator(child, True)
|
||||||
|
# A bubbled up descendant should have no next siblings
|
||||||
|
assert descendant.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
descendant, descendant.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mark last child as either the bubbled up descendant or the current child
|
||||||
|
if descendant is not None:
|
||||||
|
last_child = descendant
|
||||||
|
else:
|
||||||
|
last_child = child
|
||||||
|
|
||||||
|
# If last child, there are non next siblings
|
||||||
|
if idx == last_idx:
|
||||||
|
assert child.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_sibling, None
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
|
||||||
|
child = descendant if descendant is not None else child
|
||||||
|
if child is None:
|
||||||
|
child = el
|
||||||
|
|
||||||
|
if not _recursive_call and child is not None:
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
assert child.next_element is None, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, None
|
||||||
|
)
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
assert child.next_element is target.next_sibling, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, target.next_sibling
|
||||||
|
)
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
|
# We are done, so nothing to return
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
# Return the child to the recursive caller
|
||||||
|
return child
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(object):
|
class HTMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
@ -54,6 +221,27 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
markup in these tests, there's not much room for interpretation.
|
markup in these tests, there's not much room for interpretation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
|
||||||
|
are handled correctly.
|
||||||
|
"""
|
||||||
|
for name in [
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
'spacer', 'frame'
|
||||||
|
]:
|
||||||
|
soup = self.soup("")
|
||||||
|
new_tag = soup.new_tag(name)
|
||||||
|
self.assertEqual(True, new_tag.is_empty_element)
|
||||||
|
|
||||||
|
def test_pickle_and_unpickle_identity(self):
|
||||||
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
|
# to the original.
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||||
|
self.assertEqual(loaded.decode(), tree.decode())
|
||||||
|
|
||||||
def assertDoctypeHandled(self, doctype_fragment):
|
def assertDoctypeHandled(self, doctype_fragment):
|
||||||
"""Assert that a given doctype string is handled correctly."""
|
"""Assert that a given doctype string is handled correctly."""
|
||||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||||
|
@ -114,6 +302,27 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup.encode("utf-8").replace(b"\n", b""),
|
soup.encode("utf-8").replace(b"\n", b""),
|
||||||
markup.replace(b"\n", b""))
|
markup.replace(b"\n", b""))
|
||||||
|
|
||||||
|
def test_namespaced_html(self):
|
||||||
|
"""When a namespaced XML document is parsed as HTML it should
|
||||||
|
be treated as HTML with weird tag names.
|
||||||
|
"""
|
||||||
|
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, len(soup.find_all("ns1:foo")))
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
# We test both Unicode and bytestring to verify that
|
||||||
|
# process_markup correctly sets processing_instruction_class
|
||||||
|
# even when the markup is already Unicode and there is no
|
||||||
|
# need to process anything.
|
||||||
|
markup = """<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.decode())
|
||||||
|
|
||||||
|
markup = b"""<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
def test_deepcopy(self):
|
def test_deepcopy(self):
|
||||||
"""Make sure you can copy the tree builder.
|
"""Make sure you can copy the tree builder.
|
||||||
|
|
||||||
|
@ -155,6 +364,23 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
def test_nested_formatting_elements(self):
|
def test_nested_formatting_elements(self):
|
||||||
self.assertSoupEquals("<em><em></em></em>")
|
self.assertSoupEquals("<em><em></em></em>")
|
||||||
|
|
||||||
|
def test_double_head(self):
|
||||||
|
html = '''<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Ordinary HEAD element test</title>
|
||||||
|
</head>
|
||||||
|
<script type="text/javascript">
|
||||||
|
alert("Help!");
|
||||||
|
</script>
|
||||||
|
<body>
|
||||||
|
Hello, world!
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
soup = self.soup(html)
|
||||||
|
self.assertEqual("text/javascript", soup.find('script')['type'])
|
||||||
|
|
||||||
def test_comment(self):
|
def test_comment(self):
|
||||||
# Comments are represented as Comment objects.
|
# Comments are represented as Comment objects.
|
||||||
markup = "<p>foo<!--foobar-->baz</p>"
|
markup = "<p>foo<!--foobar-->baz</p>"
|
||||||
|
@ -171,9 +397,22 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(comment, baz.previous_element)
|
self.assertEqual(comment, baz.previous_element)
|
||||||
|
|
||||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||||
self.assertSoupEquals("<pre> </pre>")
|
even if that would mean not prettifying the markup.
|
||||||
self.assertSoupEquals("<textarea> woo </textarea>")
|
"""
|
||||||
|
pre_markup = "<pre> </pre>"
|
||||||
|
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
||||||
|
self.assertSoupEquals(pre_markup)
|
||||||
|
self.assertSoupEquals(textarea_markup)
|
||||||
|
|
||||||
|
soup = self.soup(pre_markup)
|
||||||
|
self.assertEqual(soup.pre.prettify(), pre_markup)
|
||||||
|
|
||||||
|
soup = self.soup(textarea_markup)
|
||||||
|
self.assertEqual(soup.textarea.prettify(), textarea_markup)
|
||||||
|
|
||||||
|
soup = self.soup("<textarea></textarea>")
|
||||||
|
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
|
||||||
|
|
||||||
def test_nested_inline_elements(self):
|
def test_nested_inline_elements(self):
|
||||||
"""Inline elements can be nested indefinitely."""
|
"""Inline elements can be nested indefinitely."""
|
||||||
|
@ -213,6 +452,18 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_multivalued_attribute_with_whitespace(self):
|
||||||
|
# Whitespace separating the values of a multi-valued attribute
|
||||||
|
# should be ignored.
|
||||||
|
|
||||||
|
markup = '<div class=" foo bar "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(['foo', 'bar'], soup.div['class'])
|
||||||
|
|
||||||
|
# If you search by the literal name of the class it's like the whitespace
|
||||||
|
# wasn't there.
|
||||||
|
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
|
||||||
|
|
||||||
def test_deeply_nested_multivalued_attribute(self):
|
def test_deeply_nested_multivalued_attribute(self):
|
||||||
# html5lib can set the attributes of the same tag many times
|
# html5lib can set the attributes of the same tag many times
|
||||||
# as it rearranges the tree. This has caused problems with
|
# as it rearranges the tree. This has caused problems with
|
||||||
|
@ -221,18 +472,52 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(["css"], soup.div.div['class'])
|
self.assertEqual(["css"], soup.div.div['class'])
|
||||||
|
|
||||||
|
def test_multivalued_attribute_on_html(self):
|
||||||
|
# html5lib uses a different API to set the attributes ot the
|
||||||
|
# <html> tag. This has caused problems with multivalued
|
||||||
|
# attributes.
|
||||||
|
markup = '<html class="a b"></html>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(["a", "b"], soup.html['class'])
|
||||||
|
|
||||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||||
|
|
||||||
|
def test_strings_resembling_character_entity_references(self):
|
||||||
|
# "&T" and "&p" look like incomplete character entities, but they are
|
||||||
|
# not.
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>• AT&T is in the s&p 500</p>",
|
||||||
|
"<p>\u2022 AT&T is in the s&p 500</p>"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_apos_entity(self):
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# “ and ” are invalid numeric entities referencing
|
||||||
|
# Windows-1252 characters. - references a character common
|
||||||
|
# to Windows-1252 and Unicode, and ☃ references a
|
||||||
|
# character only found in Unicode.
|
||||||
|
#
|
||||||
|
# All of these entities should be converted to Unicode
|
||||||
|
# characters.
|
||||||
|
markup = "<p>“Hello” -☃</p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("“Hello” -☃", soup.p.string)
|
||||||
|
|
||||||
def test_entities_in_attributes_converted_to_unicode(self):
|
def test_entities_in_attributes_converted_to_unicode(self):
|
||||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
|
||||||
def test_entities_in_text_converted_to_unicode(self):
|
def test_entities_in_text_converted_to_unicode(self):
|
||||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
@ -243,7 +528,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
'<p>I said "good day!"</p>')
|
'<p>I said "good day!"</p>')
|
||||||
|
|
||||||
def test_out_of_range_entity(self):
|
def test_out_of_range_entity(self):
|
||||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
expect = "\N{REPLACEMENT CHARACTER}"
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
|
@ -253,6 +538,42 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||||
self.assertEqual("p", soup.p.name)
|
self.assertEqual("p", soup.p.name)
|
||||||
|
self.assertConnectedness(soup)
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify consistent handling of empty-element tags,
|
||||||
|
no matter how they come in through the markup.
|
||||||
|
"""
|
||||||
|
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
|
||||||
|
|
||||||
|
def test_head_tag_between_head_and_body(self):
|
||||||
|
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||||
|
content = """<html><head></head>
|
||||||
|
<link></link>
|
||||||
|
<body>foo</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(content)
|
||||||
|
self.assertNotEqual(None, soup.html.body)
|
||||||
|
self.assertConnectedness(soup)
|
||||||
|
|
||||||
|
def test_multiple_copies_of_a_tag(self):
|
||||||
|
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||||
|
content = """<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<article id="a" >
|
||||||
|
<div><a href="1"></div>
|
||||||
|
<footer>
|
||||||
|
<a href="2"></a>
|
||||||
|
</footer>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(content)
|
||||||
|
self.assertConnectedness(soup.article)
|
||||||
|
|
||||||
def test_basic_namespaces(self):
|
def test_basic_namespaces(self):
|
||||||
"""Parsers don't need to *understand* namespaces, but at the
|
"""Parsers don't need to *understand* namespaces, but at the
|
||||||
|
@ -285,9 +606,9 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# A seemingly innocuous document... but it's in Unicode! And
|
# A seemingly innocuous document... but it's in Unicode! And
|
||||||
# it contains characters that can't be represented in the
|
# it contains characters that can't be represented in the
|
||||||
# encoding found in the declaration! The horror!
|
# encoding found in the declaration! The horror!
|
||||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
|
||||||
|
|
||||||
def test_soupstrainer(self):
|
def test_soupstrainer(self):
|
||||||
"""Parsers should be able to work with SoupStrainers."""
|
"""Parsers should be able to work with SoupStrainers."""
|
||||||
|
@ -327,7 +648,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# Both XML and HTML entities are converted to Unicode characters
|
# Both XML and HTML entities are converted to Unicode characters
|
||||||
# during parsing.
|
# during parsing.
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||||
self.assertSoupEquals(text, expected)
|
self.assertSoupEquals(text, expected)
|
||||||
|
|
||||||
def test_smart_quotes_converted_on_the_way_in(self):
|
def test_smart_quotes_converted_on_the_way_in(self):
|
||||||
|
@ -337,15 +658,15 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(quote)
|
soup = self.soup(quote)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.p.string,
|
soup.p.string,
|
||||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||||
|
|
||||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||||
soup = self.soup("<a> </a>")
|
soup = self.soup("<a> </a>")
|
||||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
|
||||||
|
|
||||||
def test_entities_converted_on_the_way_out(self):
|
def test_entities_converted_on_the_way_out(self):
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||||
soup = self.soup(text)
|
soup = self.soup(text)
|
||||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||||
|
|
||||||
|
@ -354,7 +675,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# easy-to-understand document.
|
# easy-to-understand document.
|
||||||
|
|
||||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||||
|
|
||||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||||
# that to test.
|
# that to test.
|
||||||
|
@ -399,7 +720,9 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||||
soup = self.soup(
|
soup = self.soup(
|
||||||
hebrew_document, from_encoding="iso8859-8")
|
hebrew_document, from_encoding="iso8859-8")
|
||||||
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
# Some tree builders call it iso8859-8, others call it iso-8859-9.
|
||||||
|
# That's not a difference we really care about.
|
||||||
|
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode('utf-8'),
|
soup.encode('utf-8'),
|
||||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||||
|
@ -461,13 +784,39 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
data.a['foo'] = 'bar'
|
data.a['foo'] = 'bar'
|
||||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class XMLTreeBuilderSmokeTest(object):
|
class XMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
def test_pickle_and_unpickle_identity(self):
|
||||||
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
|
# to the original.
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||||
|
self.assertEqual(loaded.decode(), tree.decode())
|
||||||
|
|
||||||
def test_docstring_generated(self):
|
def test_docstring_generated(self):
|
||||||
soup = self.soup("<root/>")
|
soup = self.soup("<root/>")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||||
|
|
||||||
|
def test_xml_declaration(self):
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
def test_real_xhtml_document(self):
|
def test_real_xhtml_document(self):
|
||||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
@ -480,12 +829,23 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode("utf-8"), markup)
|
soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
def test_nested_namespaces(self):
|
||||||
|
doc = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||||
|
<parent xmlns="http://ns1/">
|
||||||
|
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
|
||||||
|
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
|
||||||
|
</child>
|
||||||
|
</parent>"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
self.assertEqual(doc, soup.encode())
|
||||||
|
|
||||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||||
doc = """
|
doc = """
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
</script>
|
</script>
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(doc, "xml")
|
soup = BeautifulSoup(doc, "lxml-xml")
|
||||||
# lxml would have stripped this while parsing, but we can add
|
# lxml would have stripped this while parsing, but we can add
|
||||||
# it later.
|
# it later.
|
||||||
soup.script.string = 'console.log("< < hey > > ");'
|
soup.script.string = 'console.log("< < hey > > ");'
|
||||||
|
@ -493,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertTrue(b"< < hey > >" in encoded)
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
def test_can_parse_unicode_document(self):
|
def test_can_parse_unicode_document(self):
|
||||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
|
||||||
|
|
||||||
def test_popping_namespaced_tag(self):
|
def test_popping_namespaced_tag(self):
|
||||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
unicode(soup.rss), markup)
|
str(soup.rss), markup)
|
||||||
|
|
||||||
def test_docstring_includes_correct_encoding(self):
|
def test_docstring_includes_correct_encoding(self):
|
||||||
soup = self.soup("<root/>")
|
soup = self.soup("<root/>")
|
||||||
|
@ -532,17 +892,57 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
def test_closing_namespaced_tag(self):
|
def test_closing_namespaced_tag(self):
|
||||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.p), markup)
|
self.assertEqual(str(soup.p), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes(self):
|
def test_namespaced_attributes(self):
|
||||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes_xml_namespace(self):
|
def test_namespaced_attributes_xml_namespace(self):
|
||||||
markup = '<foo xml:lang="fr">bar</foo>'
|
markup = '<foo xml:lang="fr">bar</foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
|
def test_find_by_prefixed_name(self):
|
||||||
|
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Document xmlns="http://example.com/ns0"
|
||||||
|
xmlns:ns1="http://example.com/ns1"
|
||||||
|
xmlns:ns2="http://example.com/ns2"
|
||||||
|
<ns1:tag>foo</ns1:tag>
|
||||||
|
<ns1:tag>bar</ns1:tag>
|
||||||
|
<ns2:tag key="value">baz</ns2:tag>
|
||||||
|
</Document>
|
||||||
|
"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
|
||||||
|
# There are three <tag> tags.
|
||||||
|
self.assertEqual(3, len(soup.find_all('tag')))
|
||||||
|
|
||||||
|
# But two of them are ns1:tag and one of them is ns2:tag.
|
||||||
|
self.assertEqual(2, len(soup.find_all('ns1:tag')))
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag')))
|
||||||
|
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
|
||||||
|
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
|
||||||
|
|
||||||
|
def test_copy_tag_preserves_namespace(self):
|
||||||
|
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<w:document xmlns:w="http://example.com/ns0"/>"""
|
||||||
|
|
||||||
|
soup = self.soup(xml)
|
||||||
|
tag = soup.document
|
||||||
|
duplicate = copy.copy(tag)
|
||||||
|
|
||||||
|
# The two tags have the same namespace prefix.
|
||||||
|
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
"""Smoke test for a tree builder that supports HTML5."""
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
1
lib/bs4/tests/__init__.py
Normal file
1
lib/bs4/tests/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
"The beautifulsoup tests."
|
147
lib/bs4/tests/test_builder_registry.py
Normal file
147
lib/bs4/tests/test_builder_registry.py
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
"""Tests of the builder registry."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.builder import (
|
||||||
|
builder_registry as registry,
|
||||||
|
HTMLParserTreeBuilder,
|
||||||
|
TreeBuilderRegistry,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import HTML5TreeBuilder
|
||||||
|
HTML5LIB_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
HTML5LIB_PRESENT = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import (
|
||||||
|
LXMLTreeBuilderForXML,
|
||||||
|
LXMLTreeBuilder,
|
||||||
|
)
|
||||||
|
LXML_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
|
||||||
|
|
||||||
|
class BuiltInRegistryTest(unittest.TestCase):
|
||||||
|
"""Test the built-in registry with the default builders registered."""
|
||||||
|
|
||||||
|
def test_combination(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('fast', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('strict', 'html'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
def test_lookup_by_markup_type(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||||
|
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('xml'), None)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_named_library(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
self.assertEqual(registry.lookup('html.parser'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_beautifulsoup_constructor_does_lookup(self):
|
||||||
|
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
# This will create a warning about not explicitly
|
||||||
|
# specifying a parser, but we'll ignore it.
|
||||||
|
|
||||||
|
# You can pass in a string.
|
||||||
|
BeautifulSoup("", features="html")
|
||||||
|
# Or a list of strings.
|
||||||
|
BeautifulSoup("", features=["html", "fast"])
|
||||||
|
|
||||||
|
# You'll get an exception if BS can't find an appropriate
|
||||||
|
# builder.
|
||||||
|
self.assertRaises(ValueError, BeautifulSoup,
|
||||||
|
"", features="no-such-feature")
|
||||||
|
|
||||||
|
class RegistryTest(unittest.TestCase):
|
||||||
|
"""Test the TreeBuilderRegistry class in general."""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.registry = TreeBuilderRegistry()
|
||||||
|
|
||||||
|
def builder_for_features(self, *feature_list):
|
||||||
|
cls = type('Builder_' + '_'.join(feature_list),
|
||||||
|
(object,), {'features' : feature_list})
|
||||||
|
|
||||||
|
self.registry.register(cls)
|
||||||
|
return cls
|
||||||
|
|
||||||
|
def test_register_with_no_features(self):
|
||||||
|
builder = self.builder_for_features()
|
||||||
|
|
||||||
|
# Since the builder advertises no features, you can't find it
|
||||||
|
# by looking up features.
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), None)
|
||||||
|
|
||||||
|
# But you can find it by doing a lookup with no features, if
|
||||||
|
# this happens to be the only registered builder.
|
||||||
|
self.assertEqual(self.registry.lookup(), builder)
|
||||||
|
|
||||||
|
def test_register_with_features_makes_lookup_succeed(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||||
|
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('baz'), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||||
|
builder1 = self.builder_for_features('foo')
|
||||||
|
builder2 = self.builder_for_features('bar')
|
||||||
|
self.assertEqual(self.registry.lookup(), builder2)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||||
|
self.assertEqual(self.registry.lookup(), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||||
|
has_one = self.builder_for_features('foo')
|
||||||
|
has_the_other = self.builder_for_features('bar')
|
||||||
|
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||||
|
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||||
|
lacks_one = self.builder_for_features('bar')
|
||||||
|
has_the_other = self.builder_for_features('foo')
|
||||||
|
|
||||||
|
# There are two builders featuring 'foo' and 'bar', but
|
||||||
|
# the one that also features 'quux' was registered later.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||||
|
has_both_late)
|
||||||
|
|
||||||
|
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||||
|
has_both_early)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||||
|
builder1 = self.builder_for_features('foo', 'bar')
|
||||||
|
builder2 = self.builder_for_features('foo', 'baz')
|
||||||
|
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
36
lib/bs4/tests/test_docs.py
Normal file
36
lib/bs4/tests/test_docs.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
"Test harness for doctests."
|
||||||
|
|
||||||
|
# pylint: disable-msg=E0611,W0142
|
||||||
|
|
||||||
|
__metaclass__ = type
|
||||||
|
__all__ = [
|
||||||
|
'additional_tests',
|
||||||
|
]
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
import doctest
|
||||||
|
import os
|
||||||
|
#from pkg_resources import (
|
||||||
|
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
DOCTEST_FLAGS = (
|
||||||
|
doctest.ELLIPSIS |
|
||||||
|
doctest.NORMALIZE_WHITESPACE |
|
||||||
|
doctest.REPORT_NDIFF)
|
||||||
|
|
||||||
|
|
||||||
|
# def additional_tests():
|
||||||
|
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||||
|
# doctest_files = [
|
||||||
|
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||||
|
# if resource_exists('bs4', 'docs'):
|
||||||
|
# for name in resource_listdir('bs4', 'docs'):
|
||||||
|
# if name.endswith('.txt'):
|
||||||
|
# doctest_files.append(
|
||||||
|
# os.path.abspath(
|
||||||
|
# resource_filename('bs4', 'docs/%s' % name)))
|
||||||
|
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||||
|
# atexit.register(cleanup_resources)
|
||||||
|
# return unittest.TestSuite((
|
||||||
|
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
184
lib/bs4/tests/test_html5lib.py
Normal file
184
lib/bs4/tests/test_html5lib.py
Normal file
|
@ -0,0 +1,184 @@
|
||||||
|
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import HTML5TreeBuilder
|
||||||
|
HTML5LIB_PRESENT = True
|
||||||
|
except ImportError as e:
|
||||||
|
HTML5LIB_PRESENT = False
|
||||||
|
from bs4.element import SoupStrainer
|
||||||
|
from bs4.testing import (
|
||||||
|
HTML5TreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not HTML5LIB_PRESENT,
|
||||||
|
"html5lib seems not to be present, not testing its tree builder.")
|
||||||
|
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
|
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return HTML5TreeBuilder
|
||||||
|
|
||||||
|
def test_soupstrainer(self):
|
||||||
|
# The html5lib tree builder does not support SoupStrainers.
|
||||||
|
strainer = SoupStrainer("b")
|
||||||
|
markup = "<p>A <b>bold</b> statement.</p>"
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(markup, parse_only=strainer)
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for(markup))
|
||||||
|
|
||||||
|
self.assertTrue(
|
||||||
|
"the html5lib tree builder doesn't support parse_only" in
|
||||||
|
str(w[0].message))
|
||||||
|
|
||||||
|
def test_correctly_nested_tables(self):
|
||||||
|
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||||
|
markup = ('<table id="1">'
|
||||||
|
'<tr>'
|
||||||
|
"<td>Here's another table:"
|
||||||
|
'<table id="2">'
|
||||||
|
'<tr><td>foo</td></tr>'
|
||||||
|
'</table></td>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
markup,
|
||||||
|
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||||
|
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||||
|
'</td></tr></tbody></table>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||||
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_xml_declaration_followed_by_doctype(self):
|
||||||
|
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>foo</p>
|
||||||
|
</body>
|
||||||
|
</html>'''
|
||||||
|
soup = self.soup(markup)
|
||||||
|
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||||
|
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||||
|
|
||||||
|
def test_reparented_markup(self):
|
||||||
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||||
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
|
|
||||||
|
def test_reparented_markup_ends_with_whitespace(self):
|
||||||
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||||
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
|
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||||
|
"""Verify that we keep the two whitespace nodes in this
|
||||||
|
document distinct when reparenting the adjacent <tbody> tags.
|
||||||
|
"""
|
||||||
|
markup = '<table> <tbody><tbody><ims></tbody> </table>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
space1, space2 = soup.find_all(string=' ')
|
||||||
|
tbody1, tbody2 = soup.find_all('tbody')
|
||||||
|
assert space1.next_element is tbody1
|
||||||
|
assert tbody2.next_element is space2
|
||||||
|
|
||||||
|
def test_reparented_markup_containing_children(self):
|
||||||
|
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
noscript = soup.noscript
|
||||||
|
self.assertEqual("target", noscript.next_element)
|
||||||
|
target = soup.find(string='target')
|
||||||
|
|
||||||
|
# The 'aftermath' string was duplicated; we want the second one.
|
||||||
|
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||||
|
|
||||||
|
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||||
|
# but the 'target' string within is still connected to the
|
||||||
|
# (second) 'aftermath' string.
|
||||||
|
self.assertEqual(final_aftermath, target.next_element)
|
||||||
|
self.assertEqual(target, final_aftermath.previous_element)
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
"""Processing instructions become comments."""
|
||||||
|
markup = b"""<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
assert str(soup).startswith("<!--?PITarget PIContent?-->")
|
||||||
|
|
||||||
|
def test_cloned_multivalue_node(self):
|
||||||
|
markup = b"""<a class="my_class"><p></a>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
a1, a2 = soup.find_all('a')
|
||||||
|
self.assertEqual(a1, a2)
|
||||||
|
assert a1 is not a2
|
||||||
|
|
||||||
|
def test_foster_parenting(self):
|
||||||
|
markup = b"""<table><td></tbody>A"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||||
|
|
||||||
|
def test_extraction(self):
|
||||||
|
"""
|
||||||
|
Test that extraction does not destroy the tree.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html><head></head>
|
||||||
|
<style>
|
||||||
|
</style><script></script><body><p>hello</p></body></html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
[s.extract() for s in soup('script')]
|
||||||
|
[s.extract() for s in soup('style')]
|
||||||
|
|
||||||
|
self.assertEqual(len(soup.find_all("p")), 1)
|
||||||
|
|
||||||
|
def test_empty_comment(self):
|
||||||
|
"""
|
||||||
|
Test that empty comment does not break structure.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<form>
|
||||||
|
<!----><input type="text">
|
||||||
|
</form>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
inputs = []
|
||||||
|
for form in soup.find_all('form'):
|
||||||
|
inputs.extend(form.find_all('input'))
|
||||||
|
self.assertEqual(len(inputs), 1)
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The html.parser TreeBuilder keeps track of line number and
|
||||||
|
# position of each element.
|
||||||
|
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, soup.p.sourceline)
|
||||||
|
self.assertEqual(5, soup.p.sourcepos)
|
||||||
|
self.assertEqual("sourceline", soup.p.find('sourceline').name)
|
||||||
|
|
||||||
|
# You can deactivate this behavior.
|
||||||
|
soup = self.soup(markup, store_line_numbers=False)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
61
lib/bs4/tests/test_htmlparser.py
Normal file
61
lib/bs4/tests/test_htmlparser.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
"""Tests to ensure that the html.parser tree builder generates good
|
||||||
|
trees."""
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
|
import pickle
|
||||||
|
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||||
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
|
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||||
|
|
||||||
|
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
def test_namespaced_system_doctype(self):
|
||||||
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_namespaced_public_doctype(self):
|
||||||
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_builder_is_pickled(self):
|
||||||
|
"""Unlike most tree builders, HTMLParserTreeBuilder and will
|
||||||
|
be restored after pickling.
|
||||||
|
"""
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||||
|
|
||||||
|
def test_redundant_empty_element_closing_tags(self):
|
||||||
|
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('</br></br></br>', "")
|
||||||
|
|
||||||
|
def test_empty_element(self):
|
||||||
|
# This verifies that any buffered data present when the parser
|
||||||
|
# finishes working is handled.
|
||||||
|
self.assertSoupEquals("foo &# bar", "foo &# bar")
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The html.parser TreeBuilder keeps track of line number and
|
||||||
|
# position of each element.
|
||||||
|
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, soup.p.sourceline)
|
||||||
|
self.assertEqual(3, soup.p.sourcepos)
|
||||||
|
self.assertEqual("sourceline", soup.p.find('sourceline').name)
|
||||||
|
|
||||||
|
# You can deactivate this behavior.
|
||||||
|
soup = self.soup(markup, store_line_numbers=False)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLParserSubclass(SoupTest):
|
||||||
|
def test_error(self):
|
||||||
|
"""Verify that our HTMLParser subclass implements error() in a way
|
||||||
|
that doesn't cause a crash.
|
||||||
|
"""
|
||||||
|
parser = BeautifulSoupHTMLParser()
|
||||||
|
parser.error("don't crash")
|
115
lib/bs4/tests/test_lxml.py
Normal file
115
lib/bs4/tests/test_lxml.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
import lxml.etree
|
||||||
|
LXML_PRESENT = True
|
||||||
|
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||||
|
except ImportError as e:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
LXML_VERSION = (0,)
|
||||||
|
|
||||||
|
if LXML_PRESENT:
|
||||||
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
BeautifulStoneSoup,
|
||||||
|
)
|
||||||
|
from bs4.element import Comment, Doctype, SoupStrainer
|
||||||
|
from bs4.testing import skipIf
|
||||||
|
from bs4.tests import test_htmlparser
|
||||||
|
from bs4.testing import (
|
||||||
|
HTMLTreeBuilderSmokeTest,
|
||||||
|
XMLTreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT,
|
||||||
|
"lxml seems not to be present, not testing its tree builder.")
|
||||||
|
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return LXMLTreeBuilder
|
||||||
|
|
||||||
|
def test_out_of_range_entity(self):
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# We can't implement this case correctly because by the time we
|
||||||
|
# hear about markup like "“", it's been (incorrectly) converted into
|
||||||
|
# a string like u'\x93'
|
||||||
|
pass
|
||||||
|
|
||||||
|
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||||
|
# test if an old version of lxml is installed.
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||||
|
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||||
|
def test_empty_doctype(self):
|
||||||
|
soup = self.soup("<!DOCTYPE>")
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual("", doctype.strip())
|
||||||
|
|
||||||
|
def test_beautifulstonesoup_is_xml_parser(self):
|
||||||
|
# Make sure that the deprecated BSS class uses an xml builder
|
||||||
|
# if one is installed.
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = BeautifulStoneSoup("<b />")
|
||||||
|
self.assertEqual("<b/>", str(soup.b))
|
||||||
|
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The lxml TreeBuilder cannot keep track of line numbers from
|
||||||
|
# the original markup. Even if you ask for line numbers, we
|
||||||
|
# don't have 'em.
|
||||||
|
#
|
||||||
|
# This means that if you have a tag like <sourceline> or
|
||||||
|
# <sourcepos>, attribute access will find it rather than
|
||||||
|
# giving you a numeric answer.
|
||||||
|
soup = self.soup(
|
||||||
|
"\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
|
||||||
|
store_line_numbers=True
|
||||||
|
)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT,
|
||||||
|
"lxml seems not to be present, not testing its XML tree builder.")
|
||||||
|
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||||
|
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return LXMLTreeBuilderForXML
|
||||||
|
|
||||||
|
def test_namespace_indexing(self):
|
||||||
|
# We should not track un-prefixed namespaces as we can only hold one
|
||||||
|
# and it will be recognized as the default namespace by soupsieve,
|
||||||
|
# which may be confusing in some situations. When no namespace is provided
|
||||||
|
# for a selector, the default namespace (if defined) is assumed.
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
'<?xml version="1.1"?>\n'
|
||||||
|
'<root>'
|
||||||
|
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
|
||||||
|
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
|
||||||
|
'</root>'
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
soup._namespaces,
|
||||||
|
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
|
||||||
|
)
|
682
lib/bs4/tests/test_soup.py
Normal file
682
lib/bs4/tests/test_soup.py
Normal file
|
@ -0,0 +1,682 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Tests of Beautiful Soup as a whole."""
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
|
import logging
|
||||||
|
import unittest
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
BeautifulStoneSoup,
|
||||||
|
)
|
||||||
|
from bs4.builder import (
|
||||||
|
TreeBuilder,
|
||||||
|
ParserRejectedMarkup,
|
||||||
|
)
|
||||||
|
from bs4.element import (
|
||||||
|
CharsetMetaAttributeValue,
|
||||||
|
Comment,
|
||||||
|
ContentMetaAttributeValue,
|
||||||
|
SoupStrainer,
|
||||||
|
NamespacedAttribute,
|
||||||
|
Tag,
|
||||||
|
NavigableString,
|
||||||
|
)
|
||||||
|
|
||||||
|
import bs4.dammit
|
||||||
|
from bs4.dammit import (
|
||||||
|
EntitySubstitution,
|
||||||
|
UnicodeDammit,
|
||||||
|
EncodingDetector,
|
||||||
|
)
|
||||||
|
from bs4.testing import (
|
||||||
|
default_builder,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
|
LXML_PRESENT = True
|
||||||
|
except ImportError as e:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
|
||||||
|
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||||
|
|
||||||
|
class TestConstructor(SoupTest):
|
||||||
|
|
||||||
|
def test_short_unicode_input(self):
|
||||||
|
data = "<h1>éé</h1>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
self.assertEqual("éé", soup.h1.string)
|
||||||
|
|
||||||
|
def test_embedded_null(self):
|
||||||
|
data = "<h1>foo\0bar</h1>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
self.assertEqual("foo\0bar", soup.h1.string)
|
||||||
|
|
||||||
|
def test_exclude_encodings(self):
|
||||||
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||||
|
self.assertEqual("windows-1252", soup.original_encoding)
|
||||||
|
|
||||||
|
def test_custom_builder_class(self):
|
||||||
|
# Verify that you can pass in a custom Builder class and
|
||||||
|
# it'll be instantiated with the appropriate keyword arguments.
|
||||||
|
class Mock(object):
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self.called_with = kwargs
|
||||||
|
self.is_xml = True
|
||||||
|
self.store_line_numbers = False
|
||||||
|
self.cdata_list_attributes = []
|
||||||
|
self.preserve_whitespace_tags = []
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
pass
|
||||||
|
def feed(self, markup):
|
||||||
|
self.fed = markup
|
||||||
|
def reset(self):
|
||||||
|
pass
|
||||||
|
def ignore(self, ignore):
|
||||||
|
pass
|
||||||
|
set_up_substitutions = can_be_empty_element = ignore
|
||||||
|
def prepare_markup(self, *args, **kwargs):
|
||||||
|
yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
|
||||||
|
|
||||||
|
kwargs = dict(
|
||||||
|
var="value",
|
||||||
|
# This is a deprecated BS3-era keyword argument, which
|
||||||
|
# will be stripped out.
|
||||||
|
convertEntities=True,
|
||||||
|
)
|
||||||
|
with warnings.catch_warnings(record=True):
|
||||||
|
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
||||||
|
assert isinstance(soup.builder, Mock)
|
||||||
|
self.assertEqual(dict(var="value"), soup.builder.called_with)
|
||||||
|
self.assertEqual("prepared markup", soup.builder.fed)
|
||||||
|
|
||||||
|
# You can also instantiate the TreeBuilder yourself. In this
|
||||||
|
# case, that specific object is used and any keyword arguments
|
||||||
|
# to the BeautifulSoup constructor are ignored.
|
||||||
|
builder = Mock(**kwargs)
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = BeautifulSoup(
|
||||||
|
'', builder=builder, ignored_value=True,
|
||||||
|
)
|
||||||
|
msg = str(w[0].message)
|
||||||
|
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
||||||
|
self.assertEqual(builder, soup.builder)
|
||||||
|
self.assertEqual(kwargs, builder.called_with)
|
||||||
|
|
||||||
|
def test_parser_markup_rejection(self):
|
||||||
|
# If markup is completely rejected by the parser, an
|
||||||
|
# explanatory ParserRejectedMarkup exception is raised.
|
||||||
|
class Mock(TreeBuilder):
|
||||||
|
def feed(self, *args, **kwargs):
|
||||||
|
raise ParserRejectedMarkup("Nope.")
|
||||||
|
|
||||||
|
def prepare_markup(self, *args, **kwargs):
|
||||||
|
# We're going to try two different ways of preparing this markup,
|
||||||
|
# but feed() will reject both of them.
|
||||||
|
yield markup, None, None, False
|
||||||
|
yield markup, None, None, False
|
||||||
|
|
||||||
|
import re
|
||||||
|
self.assertRaisesRegex(
|
||||||
|
ParserRejectedMarkup,
|
||||||
|
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
|
||||||
|
BeautifulSoup, '', builder=Mock,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_cdata_list_attributes(self):
|
||||||
|
# Most attribute values are represented as scalars, but the
|
||||||
|
# HTML standard says that some attributes, like 'class' have
|
||||||
|
# space-separated lists as values.
|
||||||
|
markup = '<a id=" an id " class=" a class "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
|
||||||
|
# Note that the spaces are stripped for 'class' but not for 'id'.
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(" an id ", a['id'])
|
||||||
|
self.assertEqual(["a", "class"], a['class'])
|
||||||
|
|
||||||
|
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
||||||
|
# you customize or disable this. As always, you can customize the TreeBuilder
|
||||||
|
# by passing in a keyword argument to the BeautifulSoup constructor.
|
||||||
|
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
||||||
|
self.assertEqual(" a class ", soup.a['class'])
|
||||||
|
|
||||||
|
# Here are two ways of saying that `id` is a multi-valued
|
||||||
|
# attribute in this context, but 'class' is not.
|
||||||
|
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
# This will create a warning about not explicitly
|
||||||
|
# specifying a parser, but we'll ignore it.
|
||||||
|
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(["an", "id"], a['id'])
|
||||||
|
self.assertEqual(" a class ", a['class'])
|
||||||
|
|
||||||
|
def test_replacement_classes(self):
|
||||||
|
# Test the ability to pass in replacements for element classes
|
||||||
|
# which will be used when building the tree.
|
||||||
|
class TagPlus(Tag):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class StringPlus(NavigableString):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class CommentPlus(Comment):
|
||||||
|
pass
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
"<a><b>foo</b>bar</a><!--whee-->",
|
||||||
|
element_classes = {
|
||||||
|
Tag: TagPlus,
|
||||||
|
NavigableString: StringPlus,
|
||||||
|
Comment: CommentPlus,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# The tree was built with TagPlus, StringPlus, and CommentPlus objects,
|
||||||
|
# rather than Tag, String, and Comment objects.
|
||||||
|
assert all(
|
||||||
|
isinstance(x, (TagPlus, StringPlus, CommentPlus))
|
||||||
|
for x in soup.recursiveChildGenerator()
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
|
def _no_parser_specified(self, s, is_there=True):
|
||||||
|
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
|
||||||
|
self.assertTrue(v)
|
||||||
|
|
||||||
|
def test_warning_if_no_parser_specified(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self._assert_no_parser_specified(msg)
|
||||||
|
|
||||||
|
def test_warning_if_parser_specified_too_vague(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", "html")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self._assert_no_parser_specified(msg)
|
||||||
|
|
||||||
|
def test_no_warning_if_explicit_parser_specified(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", "html.parser")
|
||||||
|
self.assertEqual([], w)
|
||||||
|
|
||||||
|
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("parseOnlyThese" in msg)
|
||||||
|
self.assertTrue("parse_only" in msg)
|
||||||
|
self.assertEqual(b"<b></b>", soup.encode())
|
||||||
|
|
||||||
|
def test_fromEncoding_renamed_to_from_encoding(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
utf8 = b"\xc3\xa9"
|
||||||
|
soup = self.soup(utf8, fromEncoding="utf8")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("fromEncoding" in msg)
|
||||||
|
self.assertTrue("from_encoding" in msg)
|
||||||
|
self.assertEqual("utf8", soup.original_encoding)
|
||||||
|
|
||||||
|
def test_unrecognized_keyword_argument(self):
|
||||||
|
self.assertRaises(
|
||||||
|
TypeError, self.soup, "<a>", no_such_argument=True)
|
||||||
|
|
||||||
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
|
def test_disk_file_warning(self):
|
||||||
|
filehandle = tempfile.NamedTemporaryFile()
|
||||||
|
filename = filehandle.name
|
||||||
|
try:
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("looks like a filename" in msg)
|
||||||
|
finally:
|
||||||
|
filehandle.close()
|
||||||
|
|
||||||
|
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
self.assertEqual(0, len(w))
|
||||||
|
|
||||||
|
def test_url_warning_with_bytes_url(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup(b"http://www.crummybytes.com/")
|
||||||
|
# Be aware this isn't the only warning that can be raised during
|
||||||
|
# execution..
|
||||||
|
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_unicode_url(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
# note - this url must differ from the bytes one otherwise
|
||||||
|
# python's warnings system swallows the second warning
|
||||||
|
soup = self.soup("http://www.crummyunicode.com/")
|
||||||
|
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_bytes_and_space(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||||
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_unicode_and_space(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup("http://www.crummyuncode.com/ is great")
|
||||||
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
|
||||||
|
class TestSelectiveParsing(SoupTest):
|
||||||
|
|
||||||
|
def test_parse_with_soupstrainer(self):
|
||||||
|
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
||||||
|
strainer = SoupStrainer("b")
|
||||||
|
soup = self.soup(markup, parse_only=strainer)
|
||||||
|
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
||||||
|
|
||||||
|
|
||||||
|
class TestEntitySubstitution(unittest.TestCase):
|
||||||
|
"""Standalone tests of the EntitySubstitution class."""
|
||||||
|
def setUp(self):
|
||||||
|
self.sub = EntitySubstitution
|
||||||
|
|
||||||
|
def test_simple_html_substitution(self):
|
||||||
|
# Unicode characters corresponding to named HTML entites
|
||||||
|
# are substituted, and no others.
|
||||||
|
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
|
||||||
|
self.assertEqual(self.sub.substitute_html(s),
|
||||||
|
"foo∀\N{SNOWMAN}õbar")
|
||||||
|
|
||||||
|
def test_smart_quote_substitution(self):
|
||||||
|
# MS smart quotes are a common source of frustration, so we
|
||||||
|
# give them a special test.
|
||||||
|
quotes = b"\x91\x92foo\x93\x94"
|
||||||
|
dammit = UnicodeDammit(quotes)
|
||||||
|
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
||||||
|
"‘’foo“”")
|
||||||
|
|
||||||
|
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
||||||
|
s = 'Welcome to "my bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
||||||
|
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
||||||
|
'"Welcome"')
|
||||||
|
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
||||||
|
'"Bob\'s Bar"')
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
||||||
|
s = 'Welcome to "my bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(s, True),
|
||||||
|
"'Welcome to \"my bar\"'")
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
||||||
|
s = 'Welcome to "Bob\'s Bar"'
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml(s, True),
|
||||||
|
'"Welcome to "Bob\'s Bar""')
|
||||||
|
|
||||||
|
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
||||||
|
quoted = 'Welcome to "Bob\'s Bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
||||||
|
|
||||||
|
def test_xml_quoting_handles_angle_brackets(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml("foo<bar>"),
|
||||||
|
"foo<bar>")
|
||||||
|
|
||||||
|
def test_xml_quoting_handles_ampersands(self):
|
||||||
|
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||||
|
|
||||||
|
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml("ÁT&T"),
|
||||||
|
"&Aacute;T&T")
|
||||||
|
|
||||||
|
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||||
|
"ÁT&T")
|
||||||
|
|
||||||
|
def test_quotes_not_html_substituted(self):
|
||||||
|
"""There's no need to do this except inside attribute values."""
|
||||||
|
text = 'Bob\'s "bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_html(text), text)
|
||||||
|
|
||||||
|
|
||||||
|
class TestEncodingConversion(SoupTest):
|
||||||
|
# Test Beautiful Soup's ability to decode and encode from various
|
||||||
|
# encodings.
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
super(TestEncodingConversion, self).setUp()
|
||||||
|
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||||
|
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||||
|
# Just so you know what it looks like.
|
||||||
|
self.assertEqual(
|
||||||
|
self.utf8_data,
|
||||||
|
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
||||||
|
|
||||||
|
def test_ascii_in_unicode_out(self):
|
||||||
|
# ASCII input is converted to Unicode. The original_encoding
|
||||||
|
# attribute is set to 'utf-8', a superset of ASCII.
|
||||||
|
chardet = bs4.dammit.chardet_dammit
|
||||||
|
logging.disable(logging.WARNING)
|
||||||
|
try:
|
||||||
|
def noop(str):
|
||||||
|
return None
|
||||||
|
# Disable chardet, which will realize that the ASCII is ASCII.
|
||||||
|
bs4.dammit.chardet_dammit = noop
|
||||||
|
ascii = b"<foo>a</foo>"
|
||||||
|
soup_from_ascii = self.soup(ascii)
|
||||||
|
unicode_output = soup_from_ascii.decode()
|
||||||
|
self.assertTrue(isinstance(unicode_output, str))
|
||||||
|
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
||||||
|
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
||||||
|
finally:
|
||||||
|
logging.disable(logging.NOTSET)
|
||||||
|
bs4.dammit.chardet_dammit = chardet
|
||||||
|
|
||||||
|
def test_unicode_in_unicode_out(self):
|
||||||
|
# Unicode input is left alone. The original_encoding attribute
|
||||||
|
# is not set.
|
||||||
|
soup_from_unicode = self.soup(self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
|
||||||
|
self.assertEqual(soup_from_unicode.original_encoding, None)
|
||||||
|
|
||||||
|
def test_utf8_in_unicode_out(self):
|
||||||
|
# UTF-8 input is converted to Unicode. The original_encoding
|
||||||
|
# attribute is set.
|
||||||
|
soup_from_utf8 = self.soup(self.utf8_data)
|
||||||
|
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
|
||||||
|
|
||||||
|
def test_utf8_out(self):
|
||||||
|
# The internal data structures can be encoded as UTF-8.
|
||||||
|
soup_from_unicode = self.soup(self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
PYTHON_3_PRE_3_2,
|
||||||
|
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
||||||
|
def test_attribute_name_containing_unicode_characters(self):
|
||||||
|
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||||
|
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||||
|
|
||||||
|
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # Unicode input is returned as-is.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become their Unicode
        # equivalents.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # NOTE: the expected strings in this test and the next had been
        # corrupted by entity rendering (both showed literal curly quotes);
        # restored to the numeric and named entity forms respectively.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # UTF-8 input is detected and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        # An explicitly suggested encoding is honored.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Bytes that happen to overlap the Windows-1252 smart-quote range
        # are not touched when the document is valid UTF-8.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # A suggested encoding that isn't a real codec name is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # <meta charset> is recognized with every quoting style.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore global state even if the assertions above fail.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
|
class TestNamedspacedAttribute(SoupTest):
    """Tests of the NamespacedAttribute string subclass."""

    def test_name_may_be_none_or_missing(self):
        """With no local name, the attribute is just its prefix."""
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual("xmlns", attr)

        attr = NamespacedAttribute("xmlns")
        self.assertEqual("xmlns", attr)

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        """Prefix and name compare equal to the "prefix:name" string."""
        attr = NamespacedAttribute("a", "b")
        self.assertEqual(attr, "a:b")

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        """Equality depends only on prefix and name, not on namespace."""
        original = NamespacedAttribute("a", "b", "c")
        twin = NamespacedAttribute("a", "b", "c")
        self.assertEqual(original, twin)

        # The actual namespace is not considered.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(original, no_namespace)

        # But name and prefix are important.
        different_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(original, different_name)

        different_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(original, different_prefix)
|
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that substitute a new charset
    when a document is re-encoded."""

    def test_charset_meta_attribute_value(self):
        # Renamed from test_content_meta_attribute_value: both methods
        # previously shared that name, so the second definition shadowed
        # this one and it never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # encode() here returns the *new* charset name, not encoded bytes.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Only the charset portion of the content-type string is rewritten.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
2254
lib/bs4/tests/test_tree.py
Normal file
2254
lib/bs4/tests/test_tree.py
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue