Update bleach-4.1.0

2025-08-20 13:23:24 -07:00 · 2021-10-14 20:51:23 -07:00 · 2021-10-14 20:51:23 -07:00 · a4130d6c56
commit a4130d6c56
parent 4086529906
51 changed files with 17071 additions and 568 deletions
--- a/lib/bleach/init.py
+++ b/lib/bleach/init.py
@ -1,401 +1,131 @@
 # -*- coding: utf-8 -*-

-from __future__ import unicode_literals
-import logging
-import re
+import packaging.version

-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    Cleaner,
+)


-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
-        |\\[\001-011\013\014\016-\177])*"  # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
-    """,
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-
-NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
-
-ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
-# a simple routine that returns the tag name with the namespace prefix
-# as returned by etree's Element.tag attribute
-
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+# yyyymmdd
+__releasedate__ = "20210825"
+# x.y.z or x.y.z.dev0 -- semver
+__version__ = "4.1.0"
+VERSION = packaging.version.Version(__version__)


-def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
-          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
-          strip_comments=True):
-    """Clean an HTML fragment and return it
+__all__ = ["clean", "linkify"]

-    :arg text: the text to clean
-    :arg tags: whitelist of allowed tags; defaults to
-        ``bleach.ALLOWED_TAGS``
-    :arg attributes: whitelist of allowed attributes; defaults to
-        ``bleach.ALLOWED_ATTRIBUTES``
-    :arg styles: whitelist of allowed css; defaults to
-        ``bleach.ALLOWED_STYLES``
-    :arg protocols: whitelist of allowed protocols for links; defaults
-        to ``bleach.ALLOWED_PROTOCOLS``
-    :arg strip: whether or not to strip disallowed elements
-    :arg strip_comments: whether or not to strip HTML comments
+
+def clean(
+    text,
+    tags=ALLOWED_TAGS,
+    attributes=ALLOWED_ATTRIBUTES,
+    styles=ALLOWED_STYLES,
+    protocols=ALLOWED_PROTOCOLS,
+    strip=False,
+    strip_comments=True,
+):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to use to transform content to be used in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode

    """
-    if not text:
-        return ''
-
-    text = force_unicode(text)
-
-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
-
-    parser = html5lib.HTMLParser(tokenizer=s)
-
-    return _render(parser.parseFragment(text))
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)


-def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
-    """Convert URL-like strings in an HTML fragment to links.
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode

-    linkify() converts strings that look like URLs or domain names in a
-    blob of text that may be an HTML fragment to links, while preserving
-    (a) links already in the string, (b) urls found in attributes, and
-    (c) email addresses.
    """
-    text = force_unicode(text)
-
-    if not text:
-        return ''
-
-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
-
-    forest = parser.parseFragment(text)
-    _seen = set([])
-
-    def replace_nodes(tree, new_frag, node, index=0):
-        """
-        Doesn't really replace nodes, but inserts the nodes contained in
-        new_frag into the treee at position index and returns the number
-        of nodes inserted.
-        If node is passed in, it is removed from the tree
-        """
-        count = 0
-        new_tree = parser.parseFragment(new_frag)
-        # capture any non-tag text at the start of the fragment
-        if new_tree.text:
-            if index == 0:
-                tree.text = tree.text or ''
-                tree.text += new_tree.text
-            else:
-                tree[index - 1].tail = tree[index - 1].tail or ''
-                tree[index - 1].tail += new_tree.text
-        # the put in the tagged elements into the old tree
-        for n in new_tree:
-            if n.tag == ETREE_TAG('a'):
-                _seen.add(n)
-            tree.insert(index + count, n)
-            count += 1
-        # if we got a node to remove...
-        if node is not None:
-            tree.remove(node)
-        return count
-
-    def strip_wrapping_parentheses(fragment):
-        """Strips wrapping parentheses.
-
-        Returns a tuple of the following format::
-
-            (string stripped from wrapping parentheses,
-             count of stripped opening parentheses,
-             count of stripped closing parentheses)
-        """
-        opening_parentheses = closing_parentheses = 0
-        # Count consecutive opening parentheses
-        # at the beginning of the fragment (string).
-        for char in fragment:
-            if char == '(':
-                opening_parentheses += 1
-            else:
-                break
-
-        if opening_parentheses:
-            newer_frag = ''
-            # Cut the consecutive opening brackets from the fragment.
-            fragment = fragment[opening_parentheses:]
-            # Reverse the fragment for easier detection of parentheses
-            # inside the URL.
-            reverse_fragment = fragment[::-1]
-            skip = False
-            for char in reverse_fragment:
-                # Remove the closing parentheses if it has a matching
-                # opening parentheses (they are balanced).
-                if (char == ')' and
-                        closing_parentheses < opening_parentheses and
-                        not skip):
-                    closing_parentheses += 1
-                    continue
-                # Do not remove ')' from the URL itself.
-                elif char != ')':
-                    skip = True
-                newer_frag += char
-            fragment = newer_frag[::-1]
-
-        return fragment, opening_parentheses, closing_parentheses
-
-    def apply_callbacks(attrs, new):
-        for cb in callbacks:
-            attrs = cb(attrs, new)
-            if attrs is None:
-                return None
-        return attrs
-
-    def _render_inner(node):
-        out = ['' if node.text is None else node.text]
-        for subnode in node:
-            out.append(_render(subnode))
-            if subnode.tail:
-                out.append(subnode.tail)
-        return ''.join(out)
-
-    def linkify_nodes(tree, parse_text=True):
-        children = len(tree)
-        current_child = -1
-        # start at -1 to process the parent first
-        while current_child < len(tree):
-            if current_child < 0:
-                node = tree
-                if parse_text and node.text:
-                    new_txt = old_txt = node.text
-                    if parse_email:
-                        new_txt = re.sub(email_re, email_repl, node.text)
-                        if new_txt and new_txt != node.text:
-                            node.text = ''
-                            adj = replace_nodes(tree, new_txt, None, 0)
-                            children += adj
-                            current_child += adj
-                            linkify_nodes(tree, True)
-                            continue
-
-                    new_txt = re.sub(url_re, link_repl, new_txt)
-                    if new_txt != old_txt:
-                        node.text = ''
-                        adj = replace_nodes(tree, new_txt, None, 0)
-                        children += adj
-                        current_child += adj
-                        continue
-            else:
-                node = tree[current_child]
-
-            if parse_text and node.tail:
-                new_tail = old_tail = node.tail
-                if parse_email:
-                    new_tail = re.sub(email_re, email_repl, new_tail)
-                    if new_tail != node.tail:
-                        node.tail = ''
-                        adj = replace_nodes(tree, new_tail, None,
-                                            current_child + 1)
-                        # Insert the new nodes made from my tail into
-                        # the tree right after me. current_child+1
-                        children += adj
-                        continue
-
-                new_tail = re.sub(url_re, link_repl, new_tail)
-                if new_tail != old_tail:
-                    node.tail = ''
-                    adj = replace_nodes(tree, new_tail, None,
-                                        current_child + 1)
-                    children += adj
-
-            if node.tag == ETREE_TAG('a') and not (node in _seen):
-                if not node.get('href', None) is None:
-                    attrs = dict(node.items())
-
-                    _text = attrs['_text'] = _render_inner(node)
-
-                    attrs = apply_callbacks(attrs, False)
-
-                    if attrs is None:
-                        # <a> tag replaced by the text within it
-                        adj = replace_nodes(tree, _text, node,
-                                            current_child)
-                        current_child -= 1
-                        # pull back current_child by 1 to scan the
-                        # new nodes again.
-                    else:
-                        text = force_unicode(attrs.pop('_text'))
-                        for attr_key, attr_val in attrs.items():
-                            node.set(attr_key, attr_val)
-
-                        for n in reversed(list(node)):
-                            node.remove(n)
-                        text = parser.parseFragment(text)
-                        node.text = text.text
-                        for n in text:
-                            node.append(n)
-                        _seen.add(node)
-
-            elif current_child >= 0:
-                if node.tag == ETREE_TAG('pre') and skip_pre:
-                    linkify_nodes(node, False)
-                elif not (node in _seen):
-                    linkify_nodes(node, True)
-
-            current_child += 1
-
-    def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
-        link = {
-            '_text': addr,
-            'href': 'mailto:{0!s}'.format(addr),
-        }
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return addr
-
-        _href = link.pop('href')
-        _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-        return repl.format(_href, attribs, _text)
-
-    def link_repl(match):
-        url = match.group(0)
-        open_brackets = close_brackets = 0
-        if url.startswith('('):
-            _wrapping = strip_wrapping_parentheses(url)
-            url, open_brackets, close_brackets = _wrapping
-        end = ''
-        m = re.search(punct_re, url)
-        if m:
-            end = m.group(0)
-            url = url[0:m.start()]
-        if re.search(proto_re, url):
-            href = url
-        else:
-            href = ''.join(['http://', url])
-
-        link = {
-            '_text': url,
-            'href': href,
-        }
-
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return '(' * open_brackets + url + ')' * close_brackets
-
-        _text = link.pop('_text')
-        _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
-        return repl.format('(' * open_brackets,
-                           _href, attribs, _text, end,
-                           ')' * close_brackets)
-
-    try:
-        linkify_nodes(forest)
-    except RuntimeError as e:
-        # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: {0!r}'.format(e))
-
-    return _render(forest)
-
-
-def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('etree')
-    stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
-                                alphabetical_attributes=True,
-                                omit_optional_tags=False)
-    return serializer.render(stream)
+    linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email)
+    return linker.linkify(text)
--- a/lib/bleach/_vendor/README.rst
+++ b/lib/bleach/_vendor/README.rst
@ -0,0 +1,61 @@
+=======================
+Vendored library policy
+=======================
+
+To simplify Bleach development, we're now vendoring certain libraries that
+we use.
+
+Vendored libraries must follow these rules:
+
+1. Vendored libraries must be pure Python--no compiling.
+2. Source code for the library is included in this directory.
+3. License must be included in this repo and in the Bleach distribution.
+4. Requirements of the library become requirements of Bleach.
+5. No modifications to the library may be made.
+
+
+Adding/Updating a vendored library
+==================================
+
+To vendor a library or update a version:
+
+1. Update ``vendor.txt`` with the library, version, and hash. You can use
+   `hashin <https://pypi.org/project/hashin/>`_.
+2. Remove all old files and directories of the old version.
+3. Run ``pip_install_vendor.sh`` and check in everything it produced, including
+   the ``.dist-info`` directory and contents.
+4. Update the bleach minor version in the next release.
+
+
+Reviewing a change involving a vendored library
+===============================================
+
+To verify a vendored library addition/update:
+
+1. Pull down the branch.
+2. Delete all the old files and directories of the old version.
+3. Run ``pip_install_vendor.sh``.
+4. Run ``git diff`` and verify there are no changes.
+
+
+NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be necessary to reproduce the dist-info
+
+
+Removing/Unvendoring a vendored library
+=======================================
+
+A vendored library might be removed for any of the following reasons:
+
+* it violates the vendoring policy (e.g. an incompatible license
+  change)
+* a suitable replacement is found
+* bleach has the resources to test and QA new bleach releases against
+  multiple versions of the previously vendored library
+
+To unvendor a library:
+
+1. Remove the library and its hashes from ``vendor.txt``.
+2. Remove library files and directories from this directory.
+3. Run ``pip_install_vendor.sh`` and check that the previously vendored library,
+   including the ``.dist-info`` directory and contents, is not installed.
+4. Update the bleach minor version in the next release.
--- a/lib/bleach/_vendor/init.py
+++ b/lib/bleach/_vendor/init.py
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst
@ -0,0 +1,66 @@
+Credits
+=======
+
+``html5lib`` is written and maintained by:
+
+- James Graham
+- Sam Sneddon
+- Łukasz Langa
+- Will Kahn-Greene
+
+
+Patches and suggestions
+-----------------------
+(In chronological order, by first commit:)
+
+- Anne van Kesteren
+- Lachlan Hunt
+- lantis63
+- Sam Ruby
+- Thomas Broyer
+- Tim Fletcher
+- Mark Pilgrim
+- Ryan King
+- Philip Taylor
+- Edward Z. Yang
+- fantasai
+- Philip Jägenstedt
+- Ms2ger
+- Mohammad Taha Jahangir
+- Andy Wingo
+- Andreas Madsack
+- Karim Valiev
+- Juan Carlos Garcia Segovia
+- Mike West
+- Marc DM
+- Simon Sapin
+- Michael[tm] Smith
+- Ritwik Gupta
+- Marc Abramowitz
+- Tony Lopes
+- lilbludevil
+- Kevin
+- Drew Hubl
+- Austin Kumbera
+- Jim Baker
+- Jon Dufresne
+- Donald Stufft
+- Alex Gaynor
+- Nik Nyby
+- Jakub Wilk
+- Sigmund Cherem
+- Gabi Davar
+- Florian Mounier
+- neumond
+- Vitalik Verhovodov
+- Kovid Goyal
+- Adam Chainz
+- John Vandenberg
+- Eric Amorde
+- Benedikt Morbach
+- Jonathan Vanasco
+- Tom Most
+- Ville Skyttä
+- Hugo van Kemenade
+- Mark Vasilkov
+
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER
@ -0,0 +1 @@
+pip
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA
@ -0,0 +1,552 @@
+Metadata-Version: 2.1
+Name: html5lib
+Version: 1.1
+Summary: HTML parser based on the WHATWG HTML specification
+Home-page: https://github.com/html5lib/html5lib-python
+Maintainer: James Graham
+Maintainer-email: james@hoppipolla.co.uk
+License: MIT License
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
+Requires-Dist: six (>=1.9)
+Requires-Dist: webencodings
+Provides-Extra: all
+Requires-Dist: genshi ; extra == 'all'
+Requires-Dist: chardet (>=2.2) ; extra == 'all'
+Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all'
+Provides-Extra: chardet
+Requires-Dist: chardet (>=2.2) ; extra == 'chardet'
+Provides-Extra: genshi
+Requires-Dist: genshi ; extra == 'genshi'
+Provides-Extra: lxml
+Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml'
+
+html5lib
+========
+
+.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master
+    :target: https://travis-ci.org/html5lib/html5lib-python
+
+
+html5lib is a pure-python library for parsing HTML. It is designed to
+conform to the WHATWG HTML specification, as is implemented by all major
+web browsers.
+
+
+Usage
+-----
+
+Simple usage follows this pattern:
+
+.. code-block:: python
+
+  import html5lib
+  with open("mydocument.html", "rb") as f:
+      document = html5lib.parse(f)
+
+or:
+
+.. code-block:: python
+
+  import html5lib
+  document = html5lib.parse("<p>Hello World!")
+
+By default, the ``document`` will be an ``xml.etree`` element instance.
+Whenever possible, html5lib chooses the accelerated ``ElementTree``
+implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+
+Two other tree types are supported: ``xml.dom.minidom`` and
+``lxml.etree``. To use an alternative format, specify the name of
+a treebuilder:
+
+.. code-block:: python
+
+  import html5lib
+  with open("mydocument.html", "rb") as f:
+      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
+
+When using with ``urllib2`` (Python 2), the charset from HTTP should be
+passed into html5lib as follows:
+
+.. code-block:: python
+
+  from contextlib import closing
+  from urllib2 import urlopen
+  import html5lib
+
+  with closing(urlopen("http://example.com/")) as f:
+      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
+
+When using with ``urllib.request`` (Python 3), the charset from HTTP
+should be passed into html5lib as follows:
+
+.. code-block:: python
+
+  from urllib.request import urlopen
+  import html5lib
+
+  with urlopen("http://example.com/") as f:
+      document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
+
+To have more control over the parser, create a parser object explicitly.
+For instance, to make the parser raise exceptions on parse errors, use:
+
+.. code-block:: python
+
+  import html5lib
+  with open("mydocument.html", "rb") as f:
+      parser = html5lib.HTMLParser(strict=True)
+      document = parser.parse(f)
+
+When you're instantiating parser objects explicitly, pass a treebuilder
+class as the ``tree`` keyword argument to use an alternative document
+format:
+
+.. code-block:: python
+
+  import html5lib
+  parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
+  minidom_document = parser.parse("<p>Hello World!")
+
+More documentation is available at https://html5lib.readthedocs.io/.
+
+
+Installation
+------------
+
+html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+
+.. code-block:: bash
+
+    $ pip install html5lib
+
+The goal is to support a (non-strict) superset of the versions that `pip
+supports
+<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.
+
+Optional Dependencies
+---------------------
+
+The following third-party libraries may be used for additional
+functionality:
+
+- ``lxml`` is supported as a tree format (for both building and
+  walking) under CPython (but *not* PyPy where it is known to cause
+  segfaults);
+
+- ``genshi`` has a treewalker (but not builder); and
+
+- ``chardet`` can be used as a fallback when character encoding cannot
+  be determined.
+
+
+Bugs
+----
+
+Please report any bugs on the `issue tracker
+<https://github.com/html5lib/html5lib-python/issues>`_.
+
+
+Tests
+-----
+
+Unit tests require the ``pytest`` and ``mock`` libraries and can be
+run using the ``py.test`` command in the root directory.
+
+Test data are contained in a separate `html5lib-tests
+<https://github.com/html5lib/html5lib-tests>`_ repository and included
+as a submodule, thus for git checkouts they must be initialized::
+
+  $ git submodule init
+  $ git submodule update
+
+If you have all compatible Python implementations available on your
+system, you can run tests on all of them using the ``tox`` utility,
+which can be found on PyPI.
+
+
+Questions?
+----------
+
+There's a mailing list available for support on Google Groups,
+`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
+though you may get a quicker response asking on IRC in `#whatwg on
+irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.
+
+Change Log
+----------
+
+1.1
+~~~
+
+UNRELEASED
+
+Breaking changes:
+
+* Drop support for Python 3.3. (#358)
+* Drop support for Python 3.4. (#421)
+
+Deprecations:
+
+* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
+  ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
+  <https://github.com/mozilla/bleach>`_. Please let us know if Bleach doesn't suffice for your
+  use. (#443)
+
+Other changes:
+
+* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
+  ``html5lib`` keeps working in future Python versions. (#403)
+* Drop optional ``datrie`` dependency. (#442)
+
+
+1.0.1
+~~~~~
+
+Released on December 7, 2017
+
+Breaking changes:
+
+* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
+* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)
+
+Features:
+
+* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
+  Will Kahn-Greene!)
+* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
+* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
+* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
+* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
+* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
+  Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
+* Semver-compliant version number.
+
+Bug fixes:
+
+* Add support for setuptools < 18.5 to support environment markers. (Thank you,
+  John Vandenberg!)
+* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
+* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
+  you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
+* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
+  Kahn-Greene!)
+* Include license file in generated wheel package. (#350) (Thank you, Jon
+  Dufresne!)
+* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
+* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
+  Komal Dembla, Hugo!)
+
+
+1.0
+~~~
+
+Released and unreleased on December 7, 2017. Badly packaged release.
+
+
+0.999999999/1.0b10
+~~~~~~~~~~~~~~~~~~
+
+Released on July 15, 2016
+
+* Fix attribute order going to the tree builder to be document order
+  instead of reverse document order(!).
+
+
+0.99999999/1.0b9
+~~~~~~~~~~~~~~~~
+
+Released on July 14, 2016
+
+* **Added ordereddict as a mandatory dependency on Python 2.6.**
+
+* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
+  extras that will do the right thing based on the specific
+  interpreter implementation.
+
+* Now requires the ``mock`` package for the testsuite.
+
+* Cease supporting DATrie under PyPy.
+
+* **Remove PullDOM support, as this hasn't ever been properly
+  tested, doesn't entirely work, and as far as I can tell is
+  completely unused by anyone.**
+
+* Move testsuite to ``py.test``.
+
+* **Fix #124: move to webencodings for decoding the input byte stream;
+  this makes html5lib compliant with the Encoding Standard, and
+  introduces a required dependency on webencodings.**
+
+* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**
+
+* **Fix comments containing double-dash with lxml 3.5 and above.**
+
+* **Use scripting disabled by default (as we don't implement
+  scripting).**
+
+* **Fix #11, avoiding the XSS bug potentially caused by serializer
+  allowing attribute values to be escaped out of in old browser versions,
+  changing the quote_attr_values option on serializer to take one of
+  three values, "always" (the old True value), "legacy" (the new option,
+  and the new default), and "spec" (the old False value, and the old
+  default).**
+
+* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
+  (instead of the tokenizer); as such, this will require amending all
+  callers of it to use it via the treewalker API.**
+
+* **Drop support of charade, now that chardet is supported once more.**
+
+* **Replace the charset keyword argument on parse and related methods
+  with a set of keyword arguments: override_encoding, transport_encoding,
+  same_origin_parent_encoding, likely_encoding, and default_encoding.**
+
+* **Move filters._base, treebuilder._base, and treewalkers._base to .base
+  to clarify their status as public.**
+
+* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
+  sanitizer.htmlsanitizer module and move that to sanitizer. This means
+  anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
+  code changes.**
+
+* **Rename treewalkers.lxmletree to .etree_lxml and
+  treewalkers.genshistream to .genshi to have a consistent API.**
+
+* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
+  utils) to be underscore prefixed to clarify their status as private.
+
+
+0.9999999/1.0b8
+~~~~~~~~~~~~~~~
+
+Released on September 10, 2015
+
+* Fix #195: fix the sanitizer to drop broken URLs (it threw an
+  exception between 0.9999 and 0.999999).
+
+
+0.999999/1.0b7
+~~~~~~~~~~~~~~
+
+Released on July 7, 2015
+
+* Fix #189: fix the sanitizer to allow relative URLs again (as it did
+  prior to 0.9999/1.0b5).
+
+
+0.99999/1.0b6
+~~~~~~~~~~~~~
+
+Released on April 30, 2015
+
+* Fix #188: fix the sanitizer to not throw an exception when sanitizing
+  bogus data URLs.
+
+
+0.9999/1.0b5
+~~~~~~~~~~~~
+
+Released on April 29, 2015
+
+* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
+  this sounds, this has no known security implications.  No known version
+  of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
+  Chrome (1 to current), or Opera (12 to current) will run any script
+  provided in these attributes.
+
+* Pass error message to the ParseError exception in strict parsing mode.
+
+* Allow data URIs in the sanitizer, with a whitelist of content-types.
+
+* Add support for Python implementations that don't support lone
+  surrogates (read: Jython). Fixes #2.
+
+* Remove localization of error messages. This functionality was totally
+  unused (and untested that everything was localizable), so we may as
+  well follow numerous browsers in not supporting translating technical
+  strings.
+
+* Expose treewalkers.pprint as a public API.
+
+* Add a documentEncoding property to HTML5Parser, fix #121.
+
+
+0.999
+~~~~~
+
+Released on December 23, 2013
+
+* Fix #127: add work-around for CPython issue #20007: .read(0) on
+  http.client.HTTPResponse drops the rest of the content.
+
+* Fix #115: lxml treewalker can now deal with fragments containing, at
+  their root level, text nodes with non-ASCII characters on Python 2.
+
+
+0.99
+~~~~
+
+Released on September 10, 2013
+
+* No library changes from 1.0b3; released as 0.99 as pip has changed
+  behaviour from 1.4 to avoid installing pre-release versions per
+  PEP 440.
+
+
+1.0b3
+~~~~~
+
+Released on July 24, 2013
+
+* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
+  implementation using it should be moved to
+  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
+  for years.
+
+* Fix #67 so that ``BufferedStream`` correctly returns a bytes
+  object, thereby fixing any case where html5lib is passed a
+  non-seekable RawIOBase-like object.
+
+
+1.0b2
+~~~~~
+
+Released on June 27, 2013
+
+* Removed reordering of attributes within the serializer. There is now
+  an ``alphabetical_attributes`` option which preserves the previous
+  behaviour through a new filter. This allows attribute order to be
+  preserved through html5lib if the tree builder preserves order.
+
+* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
+  ``treeadapters.sax.to_sax`` which is generic and supports any
+  treewalker; it also resolves all known bugs with ``dom2sax``.
+
+* Fix treewalker assertions on hitting bytes strings on
+  Python 2. Previous to 1.0b1, treewalkers coped with mixed
+  bytes/unicode data on Python 2; this reintroduces this prior
+  behaviour on Python 2. Behaviour is unchanged on Python 3.
+
+
+1.0b1
+~~~~~
+
+Released on May 17, 2013
+
+* Implementation updated to implement the `HTML specification
+  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
+  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
+
+* Python 3.2+ supported in a single codebase using the ``six`` library.
+
+* Removed support for Python 2.5 and older.
+
+* Removed the deprecated Beautiful Soup 3 treebuilder.
+  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
+  since it doesn't support namespaces, foreign content like SVG and
+  MathML is parsed incorrectly.
+
+* Removed ``simpletree`` from the package. The default tree builder is
+  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
+  available, and ``xml.etree.ElementTree`` otherwise).
+
+* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
+  output was well-formed XML, and hence provided little of use.
+
+* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
+  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
+  return the default DOM treebuilder, which uses ``xml.dom.minidom``.
+
+* Optional heuristic character encoding detection now based on
+  ``charade`` for Python 2.6 - 3.3 compatibility.
+
+* Optional ``Genshi`` treewalker support fixed.
+
+* Many bugfixes, including:
+
+  * #33: null in attribute value breaks XML AttValue;
+
+  * #4: nested, indirect descendant, <button> causes infinite loop;
+
+  * `Google Code 215
+    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
+    detect seekable streams;
+
+  * `Google Code 206
+    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
+    support for <video preload=...>, <audio preload=...>;
+
+  * `Google Code 205
+    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
+    support for <video poster=...>;
+
+  * `Google Code 202
+    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
+    file breaks InputStream.
+
+* Source code is now mostly PEP 8 compliant.
+
+* Test harness has been improved and now depends on ``nose``.
+
+* Documentation updated and moved to https://html5lib.readthedocs.io/.
+
+
+0.95
+~~~~
+
+Released on February 11, 2012
+
+
+0.90
+~~~~
+
+Released on January 17, 2010
+
+
+0.11.1
+~~~~~~
+
+Released on June 12, 2008
+
+
+0.11
+~~~~
+
+Released on June 10, 2008
+
+
+0.10
+~~~~
+
+Released on October 7, 2007
+
+
+0.9
+~~~
+
+Released on March 11, 2007
+
+
+0.2
+~~~
+
+Released on January 8, 2007
+
+
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD
@ -0,0 +1,41 @@
+html5lib-1.1.dist-info/AUTHORS.rst,sha256=DrNAMifoDpuQyJn-KW-H6K8Tt2a5rKnV2UF4-DRrGUI,983
+html5lib-1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+html5lib-1.1.dist-info/LICENSE,sha256=FqOZkWGekvGGgJMtoqkZn999ld8-yu3FLqBiGKq6_W8,1084
+html5lib-1.1.dist-info/METADATA,sha256=Y3w-nd_22HQnQRy3yypVsV_ke2FF94uUD4-vGpc2DnI,16076
+html5lib-1.1.dist-info/RECORD,,
+html5lib-1.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html5lib-1.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+html5lib-1.1.dist-info/top_level.txt,sha256=XEX6CHpskSmvjJB4tP6m4Q5NYXhIf_0ceMc0PNbzJPQ,9
+html5lib/__init__.py,sha256=pWnYcfZ69wNLrdQL7bpr49FUi8O8w0KhKCOHsyRgYGQ,1143
+html5lib/_ihatexml.py,sha256=ifOwF7pXqmyThIXc3boWc96s4MDezqRrRVp7FwDYUFs,16728
+html5lib/_inputstream.py,sha256=IKuMiY8rzb7pqIGCpbvTqsxysLEpgEHWYvYEFu4LUAI,32300
+html5lib/_tokenizer.py,sha256=WvJQa2Mli4NtTmhLXkX8Jy5FcWttqCaiDTiKyaw8D-k,77028
+html5lib/_trie/__init__.py,sha256=nqfgO910329BEVJ5T4psVwQtjd2iJyEXQ2-X8c1YxwU,109
+html5lib/_trie/_base.py,sha256=CaybYyMro8uERQYjby2tTeSUatnWDfWroUN9N7ety5w,1013
+html5lib/_trie/py.py,sha256=zg7RZSHxJ8mLmuI_7VEIV8AomISrgkvqCP477AgXaG0,1763
+html5lib/_utils.py,sha256=AxAJSG15eyarCgKMnlUwzs1X6jFHXqEvhlYEOxAFmis,4919
+html5lib/constants.py,sha256=Ll-yzLU_jcjyAI_h57zkqZ7aQWE5t5xA4y_jQgoUUhw,83464
+html5lib/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html5lib/filters/alphabeticalattributes.py,sha256=lViZc2JMCclXi_5gduvmdzrRxtO5Xo9ONnbHBVCsykU,919
+html5lib/filters/base.py,sha256=z-IU9ZAYjpsVsqmVt7kuWC63jR11hDMr6CVrvuao8W0,286
+html5lib/filters/inject_meta_charset.py,sha256=egDXUEHXmAG9504xz0K6ALDgYkvUrC2q15YUVeNlVQg,2945
+html5lib/filters/lint.py,sha256=upXATs6By7cot7o0bnNqR15sPq2Fn6Vnjvoy3gyO_rY,3631
+html5lib/filters/optionaltags.py,sha256=8lWT75J0aBOHmPgfmqTHSfPpPMp01T84NKu0CRedxcE,10588
+html5lib/filters/sanitizer.py,sha256=XGNSdzIqDTaHot1V-rRj1V_XOolApJ7n95tHP9JcgNU,26885
+html5lib/filters/whitespace.py,sha256=8eWqZxd4UC4zlFGW6iyY6f-2uuT8pOCSALc3IZt7_t4,1214
+html5lib/html5parser.py,sha256=w5hZJh0cvD3g4CS196DiTmuGpSKCMYe1GS46-yf_WZQ,117174
+html5lib/serializer.py,sha256=K2kfoLyMPMFPfdusfR30SrxNkf0mJB92-P5_RntyaaI,15747
+html5lib/treeadapters/__init__.py,sha256=18hyI-at2aBsdKzpwRwa5lGF1ipgctaTYXoU9En2ZQg,650
+html5lib/treeadapters/genshi.py,sha256=CH27pAsDKmu4ZGkAUrwty7u0KauGLCZRLPMzaO3M5vo,1715
+html5lib/treeadapters/sax.py,sha256=BKS8woQTnKiqeffHsxChUqL4q2ZR_wb5fc9MJ3zQC8s,1776
+html5lib/treebuilders/__init__.py,sha256=AysSJyvPfikCMMsTVvaxwkgDieELD5dfR8FJIAuq7hY,3592
+html5lib/treebuilders/base.py,sha256=oeZNGEB-kt90YJGVH05gb5a8E7ids2AbYwGRsVCieWk,14553
+html5lib/treebuilders/dom.py,sha256=22whb0C71zXIsai5mamg6qzBEiigcBIvaDy4Asw3at0,8925
+html5lib/treebuilders/etree.py,sha256=EbmHx-wQ-11MVucTPtF7Ul92-mQGN3Udu_KfDn-Ifhk,12824
+html5lib/treebuilders/etree_lxml.py,sha256=OazDHZGO_q4FnVs4Dhs4hzzn2JwGAOs-rfV8LAlUGW4,14754
+html5lib/treewalkers/__init__.py,sha256=OBPtc1TU5mGyy18QDMxKEyYEz0wxFUUNj5v0-XgmYhY,5719
+html5lib/treewalkers/base.py,sha256=ouiOsuSzvI0KgzdWP8PlxIaSNs9falhbiinAEc_UIJY,7476
+html5lib/treewalkers/dom.py,sha256=EHyFR8D8lYNnyDU9lx_IKigVJRyecUGua0mOi7HBukc,1413
+html5lib/treewalkers/etree.py,sha256=gkD4tfEfRWPsEGvgHHJxZmKZXUvBzVVGz3v5C_MIiOE,4539
+html5lib/treewalkers/etree_lxml.py,sha256=eLedbn6nPjlpebibsWVijey7WEpzDwxU3ubwUoudBuA,6345
+html5lib/treewalkers/genshi.py,sha256=4D2PECZ5n3ZN3qu3jMl9yY7B81jnQApBQSVlfaIuYbA,2309
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL
@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: bdist_wheel (0.34.2)
+Root-Is-Purelib: true
+Tag: py2-none-any
+Tag: py3-none-any
+
--- a/lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt
+++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt
@ -0,0 +1 @@
+html5lib
--- a/lib/bleach/_vendor/html5lib/init.py
+++ b/lib/bleach/_vendor/html5lib/init.py
@ -0,0 +1,35 @@
+"""
+HTML parsing library based on the `WHATWG HTML specification
+<https://whatwg.org/html>`_. The parser is designed to be compatible with
+existing HTML found in the wild and implements well-defined error recovery that
+is largely compatible with modern desktop web browsers.
+
+Example usage::
+
+    import html5lib
+    with open("my_document.html", "rb") as f:
+        tree = html5lib.parse(f)
+
+For convenience, this module re-exports the following names:
+
+* :func:`~.html5parser.parse`
+* :func:`~.html5parser.parseFragment`
+* :class:`~.html5parser.HTMLParser`
+* :func:`~.treebuilders.getTreeBuilder`
+* :func:`~.treewalkers.getTreeWalker`
+* :func:`~.serializer.serialize`
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .html5parser import HTMLParser, parse, parseFragment
+from .treebuilders import getTreeBuilder
+from .treewalkers import getTreeWalker
+from .serializer import serialize
+
+__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
+           "getTreeWalker", "serialize"]
+
+# this has to be at the top level, see how setup.py parses this
+#: Distribution version number.
+__version__ = "1.1"
--- a/lib/bleach/_vendor/html5lib/_ihatexml.py
+++ b/lib/bleach/_vendor/html5lib/_ihatexml.py
@ -0,0 +1,289 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+import warnings
+
+from .constants import DataLossWarning
+
+baseChar = """
+[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
+[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
+[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
+[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
+[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
+[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
+[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
+[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
+[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
+[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
+[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
+[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
+[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
+[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
+[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
+[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
+[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
+[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
+[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
+[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
+[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
+[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
+[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
+[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
+[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
+[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
+[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
+[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
+[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
+[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
+#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
+#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
+#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
+[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
+[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
+#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
+[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
+[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
+[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
+[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
+[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
+#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
+[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
+[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
+[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
+[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+
+ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
+
+combiningCharacter = """
+[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
+[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
+[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
+[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
+#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
+[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
+[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
+#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
+[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
+[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
+#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
+[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
+[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
+[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
+[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
+[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
+#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
+[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
+#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
+[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
+[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
+#x3099 | #x309A"""
+
+digit = """
+[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
+[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
+[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
+[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+
+# Extender characters, in the same "#xXXXX | [#xXXXX-#xYYYY]" notation as the
+# tables above. Every item must be parseable by reChar / reCharRange, so no
+# item may carry a stray leading "#" in front of a bracketed range.
+extender = """
+#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
+[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+
+letter = " | ".join([baseChar, ideographic])
+
+# Without the colon (":"): it is not included among the name characters below
+name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
+                   extender])
+nameFirst = " | ".join([letter, "_"])
+
+# Match a single code point ("#xXXXX") or a bracketed inclusive range
+# ("[#xXXXX-#xYYYY]") as written in the character tables above. Inside a
+# character class "|" is a literal, not alternation, so it must not appear
+# there or "|" would be accepted as a hex digit.
+reChar = re.compile(r"#x([\dA-F]{4})")
+reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
+
+
+def charStringToList(chars):
+    """Parse a " | "-separated character table into normalised ranges.
+
+    Each item is either a ``#xXXXX`` code point, a ``[#xXXXX-#xYYYY]`` range,
+    or a single literal character. Every item becomes an inclusive
+    ``[start, end]`` pair of integers, and the collected pairs are passed
+    through ``normaliseCharList`` before being returned.
+    """
+    ranges = []
+    for token in chars.split(" | "):
+        token = token.strip()
+        for pattern in (reChar, reCharRange):
+            found = pattern.match(token)
+            if found is None:
+                continue
+            bounds = [hexToInt(group) for group in found.groups()]
+            if len(bounds) == 1:
+                # A lone code point is stored as the range [c, c].
+                bounds = bounds * 2
+            ranges.append(bounds)
+            break
+        else:
+            # Neither notation matched: the item must be a literal character.
+            assert len(token) == 1
+
+            ranges.append([ord(token)] * 2)
+    return normaliseCharList(ranges)
+
+
+def normaliseCharList(charList):
+    """Sort inclusive ``[start, end]`` ranges and merge the ones that touch.
+
+    Ranges that overlap or are directly adjacent (next start <= previous
+    end + 1) are collapsed into one. The result is a new list of new
+    two-item lists; the ranges passed in are left unmodified.
+
+    Raises ``AssertionError`` if any range has ``end < start``.
+    """
+    ordered = sorted(charList)
+    for item in ordered:
+        assert item[1] >= item[0]
+    rv = []
+    for start, end in ordered:
+        if rv and start <= rv[-1][1] + 1:
+            # A range nested inside the previous one has a smaller end, so
+            # take the maximum: merging may only ever extend the range.
+            rv[-1][1] = max(rv[-1][1], end)
+        else:
+            # Store a copy so later merges never write into the caller's data.
+            rv.append([start, end])
+    return rv
+
+
+# We don't really support characters above the BMP :(
+max_unicode = int("FFFF", 16)
+
+
+def missingRanges(charList):
+    """Return the gaps between the given ranges, within 0..max_unicode.
+
+    ``charList`` is a list of inclusive ``[start, end]`` ranges; the gap
+    arithmetic below relies on them being sorted and non-adjacent, as
+    produced by ``normaliseCharList``. The result lists, in order, every
+    inclusive range of code points not covered by ``charList``. An empty
+    ``charList`` yields the whole ``[0, max_unicode]`` span.
+    """
+    if not charList:
+        return [[0, max_unicode]]
+    rv = []
+    # Compare the *start* of the first range with 0; comparing the range
+    # itself to 0 is always unequal and would emit a bogus [0, -1] gap.
+    if charList[0][0] != 0:
+        rv.append([0, charList[0][0] - 1])
+    for current, following in zip(charList, charList[1:]):
+        rv.append([current[1] + 1, following[0] - 1])
+    if charList[-1][1] != max_unicode:
+        rv.append([charList[-1][1] + 1, max_unicode])
+    return rv
+
+
+def listToRegexpStr(charList):
+    """Render ``[start, end]`` code point ranges as a regex character class.
+
+    A range whose two bounds are equal is written as a single escaped
+    character; any other range is written as ``first-last``. The pieces are
+    concatenated and wrapped in square brackets.
+    """
+    def render(bounds):
+        first = escapeRegexp(chr(bounds[0]))
+        if bounds[0] == bounds[1]:
+            return first
+        return first + "-" + escapeRegexp(chr(bounds[1]))
+
+    return "[%s]" % "".join([render(bounds) for bounds in charList])
+
+
+def hexToInt(hex_str):
+    """Interpret *hex_str* as a base-16 number and return it as an int."""
+    value = int(hex_str, 16)
+    return value
+
+
+def escapeRegexp(string):
+    """Return *string* with each listed regex metacharacter backslash-escaped.
+
+    The characters handled are ``. ^ $ * + ? { } [ ] | ( ) -``; every other
+    character (including the backslash itself) is copied through unchanged.
+    """
+    metacharacters = frozenset(".^$*+?{}[]|()-")
+    return "".join(["\\" + ch if ch in metacharacters else ch
+                    for ch in string])
+
+# output from the above
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
+
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
+
+# Simpler things
+nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
+
+
+class InfosetFilter(object):
+    """Coerce names, comments, text and public identifiers.
+
+    Each ``coerce*`` method returns the value to use in place of its input
+    and emits a ``DataLossWarning`` whenever it had to change something.
+    Characters rejected in names and public identifiers are replaced by an
+    escape of the form ``UXXXXX`` (the code point in upper-case hex,
+    zero-padded to five digits); :meth:`fromXmlName` reverses that escape.
+    """
+
+    # Matches the escapes produced by escapeChar().
+    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
+
+    def __init__(self,
+                 dropXmlnsLocalName=False,
+                 dropXmlnsAttrNs=False,
+                 preventDoubleDashComments=False,
+                 preventDashAtCommentEnd=False,
+                 replaceFormFeedCharacters=True,
+                 preventSingleQuotePubid=False):
+        """Configure which coercions are applied.
+
+        :arg dropXmlnsLocalName: drop attributes whose name starts with
+            ``xmlns:`` (``coerceAttribute`` returns ``None`` for them)
+        :arg dropXmlnsAttrNs: drop attributes in the
+            ``http://www.w3.org/2000/xmlns/`` namespace
+        :arg preventDoubleDashComments: rewrite ``--`` inside comments as
+            ``- -``, and pad a trailing ``-`` with a space
+        :arg preventDashAtCommentEnd: pad a trailing ``-`` in comments with
+            a space, even when ``preventDoubleDashComments`` is off
+        :arg replaceFormFeedCharacters: replace U+000C in text with a space
+        :arg preventSingleQuotePubid: escape ``'`` in public identifiers
+        """
+        self.dropXmlnsLocalName = dropXmlnsLocalName
+        self.dropXmlnsAttrNs = dropXmlnsAttrNs
+
+        self.preventDoubleDashComments = preventDoubleDashComments
+        self.preventDashAtCommentEnd = preventDashAtCommentEnd
+
+        self.replaceFormFeedCharacters = replaceFormFeedCharacters
+
+        self.preventSingleQuotePubid = preventSingleQuotePubid
+
+        # Maps an offending character to its "UXXXXX" escape.
+        self.replaceCache = {}
+
+    def coerceAttribute(self, name, namespace=None):
+        """Return the coerced attribute name, or ``None`` to drop it."""
+        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
+            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
+            return None
+        elif (self.dropXmlnsAttrNs and
+              namespace == "http://www.w3.org/2000/xmlns/"):
+            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
+            return None
+        else:
+            return self.toXmlName(name)
+
+    def coerceElement(self, name):
+        """Return the coerced element name."""
+        return self.toXmlName(name)
+
+    def coerceComment(self, data):
+        """Return comment text with dash sequences coerced as configured."""
+        if self.preventDoubleDashComments:
+            while "--" in data:
+                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
+                data = data.replace("--", "- -")
+        # The trailing-dash fix has always run together with the double-dash
+        # fix; it is additionally honoured on its own when requested through
+        # preventDashAtCommentEnd, which used to be stored but never read.
+        if ((self.preventDoubleDashComments or self.preventDashAtCommentEnd) and
+                data.endswith("-")):
+            warnings.warn("Comments cannot end in a dash", DataLossWarning)
+            data += " "
+        return data
+
+    def coerceCharacters(self, data):
+        """Return text with form feeds replaced by spaces, if enabled."""
+        if self.replaceFormFeedCharacters:
+            # One warning per form feed that is about to be replaced.
+            for _ in range(data.count("\x0C")):
+                warnings.warn("Text cannot contain U+000C", DataLossWarning)
+            data = data.replace("\x0C", " ")
+        # Other non-xml characters
+        return data
+
+    def coercePubid(self, data):
+        """Return a public identifier with disallowed characters escaped."""
+        dataOutput = data
+        for char in nonPubidCharRegexp.findall(data):
+            warnings.warn("Coercing non-XML pubid", DataLossWarning)
+            replacement = self.getReplacementCharacter(char)
+            dataOutput = dataOutput.replace(char, replacement)
+        if self.preventSingleQuotePubid and "'" in dataOutput:
+            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
+            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
+        return dataOutput
+
+    def toXmlName(self, name):
+        """Return *name* with every disallowed character escaped.
+
+        The first character is checked against ``nonXmlNameFirstBMPRegexp``
+        and the remaining ones against ``nonXmlNameBMPRegexp``. *name* must
+        be non-empty: its first character is indexed unconditionally.
+        """
+        nameFirst = name[0]
+        nameRest = name[1:]
+        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
+        if m:
+            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
+            nameFirstOutput = self.getReplacementCharacter(nameFirst)
+        else:
+            nameFirstOutput = nameFirst
+
+        nameRestOutput = nameRest
+        # De-duplicate so each offending character is warned about and
+        # replaced once, however often it occurs.
+        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
+        for char in replaceChars:
+            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
+            replacement = self.getReplacementCharacter(char)
+            nameRestOutput = nameRestOutput.replace(char, replacement)
+        return nameFirstOutput + nameRestOutput
+
+    def getReplacementCharacter(self, char):
+        """Return the escape for *char*, creating and caching it if needed."""
+        if char in self.replaceCache:
+            replacement = self.replaceCache[char]
+        else:
+            replacement = self.escapeChar(char)
+        return replacement
+
+    def fromXmlName(self, name):
+        """Return *name* with every ``UXXXXX`` escape turned back into a character.
+
+        Any substring matching ``replacementRegexp`` is unescaped, whether or
+        not it was produced by this filter.
+        """
+        for item in set(self.replacementRegexp.findall(name)):
+            name = name.replace(item, self.unescapeChar(item))
+        return name
+
+    def escapeChar(self, char):
+        """Build the ``UXXXXX`` escape for *char* and record it in the cache."""
+        replacement = "U%05X" % ord(char)
+        self.replaceCache[char] = replacement
+        return replacement
+
+    def unescapeChar(self, charcode):
+        """Return the character for an escape such as ``U0003A``."""
+        return chr(int(charcode[1:], 16))
--- a/lib/bleach/_vendor/html5lib/_inputstream.py
+++ b/lib/bleach/_vendor/html5lib/_inputstream.py
@ -0,0 +1,918 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import text_type
+from six.moves import http_client, urllib
+
+import codecs
+import re
+from io import BytesIO, StringIO
+
+import webencodings
+
+from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from .constants import _ReparseException
+from . import _utils
+
+# Byte-string counterparts of the character-class constants, for the
+# encoding pre-parser, which works on undecoded input
+spaceCharactersBytes = frozenset(item.encode("ascii") for item in spaceCharacters)
+asciiLettersBytes = frozenset(item.encode("ascii") for item in asciiLetters)
+asciiUppercaseBytes = frozenset(item.encode("ascii") for item in asciiUppercase)
+spacesAngleBrackets = spaceCharactersBytes | frozenset((b">", b"<"))
+
+
+# Character class of the code points reported as "invalid-codepoint"; the
+# surrogate range is left out here and appended below where supported
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa
+
+if _utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # eval. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    # The assert protects the slicing below: the single closing "]" is
+    # stripped, the surrogate range appended, and the class closed again.
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
+
+# Invalid code points above U+FFFF; characterErrorsUCS2 checks a combined
+# surrogate pair against this set
+non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+                              0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
+                              0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
+                              0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
+                              0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+                              0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+                              0x10FFFE, 0x10FFFF}
+
+# Matches one character from U+0009-U+000D, U+0020-U+002F, U+003A-U+0040,
+# U+005B-U+0060 or U+007B-U+007E (the separate U+005C entry is already
+# inside the U+005B-U+0060 range)
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
+
+# Cache for charsUntil(): maps (characters, opposite) to the compiled regex
+charsUntilRegEx = {}
+
+
+class BufferedStream(object):
+    """Adds replayable buffering to a stream that has none of its own.
+
+    Everything read from the wrapped stream is kept as a list of chunks
+    (rather than one growing string, on the assumption that repeated
+    joining would be O(n**2)), so earlier data can be re-read after seek().
+    """
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.buffer = []
+        # [chunk number, offset inside that chunk]; -1 means nothing read yet
+        self.position = [-1, 0]
+
+    def tell(self):
+        """Return the absolute offset of the current position."""
+        chunkIndex, chunkOffset = self.position
+        precedingBytes = sum(len(chunk) for chunk in self.buffer[:chunkIndex])
+        return precedingBytes + chunkOffset
+
+    def seek(self, pos):
+        """Move to absolute offset ``pos``, which must already be buffered."""
+        assert pos <= self._bufferedBytes()
+        remaining = pos
+        index = 0
+        # Walk forward through the chunks until the offset falls inside one
+        while len(self.buffer[index]) < remaining:
+            remaining -= len(self.buffer[index])
+            index += 1
+        self.position = [index, remaining]
+
+    def read(self, bytes):
+        """Read ``bytes`` bytes, replaying buffered data before the stream."""
+        if not self.buffer:
+            return self._readStream(bytes)
+        atEnd = (self.position[0] == len(self.buffer) and
+                 self.position[1] == len(self.buffer[-1]))
+        if atEnd:
+            return self._readStream(bytes)
+        return self._readFromBuffer(bytes)
+
+    def _bufferedBytes(self):
+        """Return the total number of bytes held in the buffer."""
+        return sum(len(chunk) for chunk in self.buffer)
+
+    def _readStream(self, bytes):
+        """Read from the wrapped stream and record the data as a new chunk."""
+        data = self.stream.read(bytes)
+        self.buffer.append(data)
+        self.position[0] += 1
+        self.position[1] = len(data)
+        return data
+
+    def _readFromBuffer(self, bytes):
+        """Serve a read from buffered chunks, topping up from the stream."""
+        wanted = bytes
+        pieces = []
+        chunkIndex, chunkOffset = self.position
+        while chunkIndex < len(self.buffer) and wanted != 0:
+            assert wanted > 0
+            chunk = self.buffer[chunkIndex]
+            available = len(chunk) - chunkOffset
+
+            if wanted <= available:
+                # The request ends inside this chunk
+                take = wanted
+                self.position = [chunkIndex, chunkOffset + take]
+            else:
+                # Use up the rest of this chunk and carry on with the next
+                take = available
+                self.position = [chunkIndex, len(chunk)]
+                chunkIndex += 1
+            pieces.append(chunk[chunkOffset:chunkOffset + take])
+            wanted -= take
+
+            # Every chunk after the first is consumed from its start
+            chunkOffset = 0
+
+        if wanted:
+            pieces.append(self._readStream(wanted))
+
+        return b"".join(pieces)
+
+
+def HTMLInputStream(source, **kwargs):
+    """Build the input stream class that matches the type of ``source``.
+
+    Text input yields an HTMLUnicodeInputStream and rejects any keyword
+    argument ending in ``_encoding`` with a TypeError; everything else
+    yields an HTMLBinaryInputStream.
+    """
+    # Work around Python bug #20007: read(0) closes the connection.
+    # http://bugs.python.org/issue20007
+    # HTTP responses are therefore classed as binary without probing them.
+    if isinstance(source, http_client.HTTPResponse):
+        isUnicode = False
+    elif (isinstance(source, urllib.response.addbase) and
+          isinstance(source.fp, http_client.HTTPResponse)):
+        # addinfourl wrapping an HTTPResponse
+        isUnicode = False
+    elif hasattr(source, "read"):
+        # A zero-length read reveals whether the object yields text or bytes
+        isUnicode = isinstance(source.read(0), text_type)
+    else:
+        isUnicode = isinstance(source, text_type)
+
+    if not isUnicode:
+        return HTMLBinaryInputStream(source, **kwargs)
+
+    encodings = [x for x in kwargs if x.endswith("_encoding")]
+    if encodings:
+        raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
+
+    return HTMLUnicodeInputStream(source, **kwargs)
+
+
+class HTMLUnicodeInputStream(object):
+    """Provides a unicode stream of characters to the HTMLTokenizer.
+
+    This class takes care of character encoding and removing or replacing
+    incorrect byte-sequences and also provides column and line tracking.
+
+    """
+
+    _defaultChunkSize = 10240
+
+    def __init__(self, source):
+        """Initialises the HTMLUnicodeInputStream.
+
+        source is either a file-like object (anything with a ``read``
+        attribute) or a string; see openStream.
+
+        No encoding argument is accepted here: charEncoding is set to the
+        result of lookupEncoding("utf-8") with confidence "certain".
+
+        """
+
+        # Pick the routine readChunk uses to report invalid code points
+        if not _utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+        elif len("\U0010FFFF") == 1:
+            # A non-BMP character is a single item in a string
+            self.reportCharacterErrors = self.characterErrorsUCS4
+        else:
+            # A non-BMP character is stored as two items (a surrogate pair)
+            self.reportCharacterErrors = self.characterErrorsUCS2
+
+        # List of where new lines occur
+        self.newLines = [0]
+
+        self.charEncoding = (lookupEncoding("utf-8"), "certain")
+        self.dataStream = self.openStream(source)
+
+        self.reset()
+
+    def reset(self):
+        """Discard the current chunk, recorded errors and position bookkeeping."""
+        # Nothing has been read yet: empty chunk with the cursor at its start
+        self.chunk = ""
+        self.chunkOffset = 0
+        self.chunkSize = 0
+        self.errors = []
+
+        # Carried over from previous chunks: the number of (complete) lines
+        # and the number of columns in the last line
+        self.prevNumLines, self.prevNumCols = 0, 0
+
+        # A trailing CR or lead surrogate held back at a chunk boundary
+        # until the next chunk is read
+        self._bufferedCharacter = None
+
+    def openStream(self, source):
+        """Return a file-like object for ``source``.
+
+        An object with a ``read`` attribute is handed back untouched;
+        anything else is wrapped in a ``StringIO``.
+
+        """
+        if hasattr(source, 'read'):
+            return source
+        return StringIO(source)
+
+    def _position(self, offset):
+        """Return the zero-based (line, column) of ``offset`` in the chunk.
+
+        Lines and columns consumed by earlier chunks (``prevNumLines`` and
+        ``prevNumCols``) are added in.
+        """
+        chunk = self.chunk
+        newlineCount = chunk.count('\n', 0, offset)
+        lastNewline = chunk.rfind('\n', 0, offset)
+        line = self.prevNumLines + newlineCount
+        if lastNewline != -1:
+            column = offset - (lastNewline + 1)
+        else:
+            # Still on the line the previous chunk ended on
+            column = self.prevNumCols + offset
+        return (line, column)
+
+    def position(self):
+        """Returns (line, col) of the current position in the stream.
+
+        The line is one-based; the column is passed through from _position.
+        """
+        zeroBasedLine, column = self._position(self.chunkOffset)
+        return (zeroBasedLine + 1, column)
+
+    def char(self):
+        """ Return the next character from the stream and advance past it.
+            EOF is returned once no further chunk can be read.
+        """
+        # Current chunk exhausted: try to load another one
+        if self.chunkOffset >= self.chunkSize and not self.readChunk():
+            return EOF
+
+        offset = self.chunkOffset
+        current = self.chunk[offset]
+        self.chunkOffset = offset + 1
+        return current
+
+    def readChunk(self, chunkSize=None):
+        """Load the next chunk of text into self.chunk.
+
+        chunkSize defaults to self._defaultChunkSize.  Returns False when
+        the stream yields no data and no character is held back in
+        self._bufferedCharacter; returns True otherwise.
+        """
+        if chunkSize is None:
+            chunkSize = self._defaultChunkSize
+
+        # Record where the chunk being discarded ended, so line/column
+        # reporting carries on across the chunk boundary
+        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
+
+        self.chunk = ""
+        self.chunkSize = 0
+        self.chunkOffset = 0
+
+        data = self.dataStream.read(chunkSize)
+
+        # Deal with CR LF and surrogates broken across chunks
+        if self._bufferedCharacter:
+            data = self._bufferedCharacter + data
+            self._bufferedCharacter = None
+        elif not data:
+            # We have no more data, bye-bye stream
+            return False
+
+        # Hold back a trailing CR or lead surrogate (0xD800-0xDBFF) for the
+        # next call.  A lone one-character chunk is not held back.
+        if len(data) > 1:
+            lastv = ord(data[-1])
+            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
+                self._bufferedCharacter = data[-1]
+                data = data[:-1]
+
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)
+
+        # Normalise newlines: CR LF and lone CR both become LF
+        data = data.replace("\r\n", "\n")
+        data = data.replace("\r", "\n")
+
+        self.chunk = data
+        self.chunkSize = len(data)
+
+        return True
+
+    def characterErrorsUCS4(self, data):
+        """Record one "invalid-codepoint" error per invalid character in ``data``."""
+        invalidCount = len(invalid_unicode_re.findall(data))
+        self.errors.extend(["invalid-codepoint"] * invalidCount)
+
+    def characterErrorsUCS2(self, data):
+        """Record "invalid-codepoint" errors when non-BMP text is stored as pairs.
+
+        __init__ selects this variant when a non-BMP character occupies two
+        string items.  invalid_unicode_re then matches each half of a
+        surrogate pair separately, so a pair is evaluated as a whole on its
+        first half and the match for its second half is skipped.
+        """
+        # True only while the second half of a just-handled pair is pending
+        skip = False
+        for match in invalid_unicode_re.finditer(data):
+            if skip:
+                # This match is the second half of the pair handled on the
+                # previous iteration.  Clear the flag, otherwise every later
+                # invalid character in the chunk would be ignored as well.
+                skip = False
+                continue
+            codepoint = ord(match.group())
+            pos = match.start()
+            # Pretty sure there should be endianness issues here
+            if _utils.isSurrogatePair(data[pos:pos + 2]):
+                # We have a surrogate pair!
+                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
+                if char_val in non_bmp_invalid_codepoints:
+                    self.errors.append("invalid-codepoint")
+                skip = True
+            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
+                  pos == len(data) - 1):
+                # Unpaired surrogate as the very last item of the chunk
+                self.errors.append("invalid-codepoint")
+            else:
+                skip = False
+                self.errors.append("invalid-codepoint")
+
+    def charsUntil(self, characters, opposite=False):
+        """ Returns a string of characters from the stream up to but not
+        including any character in 'characters' or EOF. 'characters' must be
+        a container that supports the 'in' method and iteration over its
+        characters.
+
+        With opposite=True the sense is inverted: the returned string is
+        the run of characters that ARE in 'characters'.
+
+        'characters' is used in a dictionary key, so it must be hashable,
+        and every character must be ASCII (checked by an assert).
+        """
+
+        # Use a cache of regexps to find the required characters
+        try:
+            chars = charsUntilRegEx[(characters, opposite)]
+        except KeyError:
+            if __debug__:
+                for c in characters:
+                    assert(ord(c) < 128)
+            # Build a character class from \xNN escapes; negate it unless
+            # 'opposite' was requested
+            regex = "".join(["\\x%02x" % ord(c) for c in characters])
+            if not opposite:
+                regex = "^%s" % regex
+            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
+
+        rv = []
+
+        while True:
+            # Find the longest matching prefix
+            m = chars.match(self.chunk, self.chunkOffset)
+            if m is None:
+                # If nothing matched, and it wasn't because we ran out of chunk,
+                # then stop
+                if self.chunkOffset != self.chunkSize:
+                    break
+            else:
+                end = m.end()
+                # If not the whole chunk matched, return everything
+                # up to the part that didn't match
+                if end != self.chunkSize:
+                    rv.append(self.chunk[self.chunkOffset:end])
+                    self.chunkOffset = end
+                    break
+            # If the whole remainder of the chunk matched,
+            # use it all and read the next chunk
+            rv.append(self.chunk[self.chunkOffset:])
+            if not self.readChunk():
+                # Reached EOF
+                break
+
+        r = "".join(rv)
+        return r
+
+    def unget(self, char):
+        """Push ``char`` back so that the next read returns it again.
+
+        Only one character is allowed to be ungotten at once - it must be
+        consumed again before any further call to unget.  EOF is ignored.
+        """
+        if char is EOF:
+            return
+
+        if self.chunkOffset != 0:
+            # Common case: step the cursor back over the character
+            self.chunkOffset -= 1
+            assert self.chunk[self.chunkOffset] == char
+            return
+
+        # At the start of the chunk there is nothing to step back over.
+        # unget is called quite rarely, so doing more work here keeps the
+        # frequently called char and charsUntil simple: prepend the
+        # character to the current chunk.
+        self.chunk = char + self.chunk
+        self.chunkSize += 1
+
+
+class HTMLBinaryInputStream(HTMLUnicodeInputStream):
+    """Provides a unicode stream of characters to the HTMLTokenizer.
+
+    This class takes care of character encoding and removing or replacing
+    incorrect byte-sequences and also provides column and line tracking.
+
+    """
+
+    def __init__(self, source, override_encoding=None, transport_encoding=None,
+                 same_origin_parent_encoding=None, likely_encoding=None,
+                 default_encoding="windows-1252", useChardet=True):
+        """Initialises the HTMLBinaryInputStream.
+
+        source is either a file-like object (anything with a ``read``
+        attribute) or a bytes string; see openStream.
+
+        The ``*_encoding`` arguments are encoding labels consulted by
+        determineEncoding, which documents their precedence and confidence.
+        useChardet is passed to determineEncoding and enables guessing
+        with chardet when that package can be imported.
+
+        """
+        # Raw Stream - the undecoded byte stream, as returned by openStream
+        self.rawStream = self.openStream(source)
+
+        HTMLUnicodeInputStream.__init__(self, self.rawStream)
+
+        # Encoding Information
+        # Number of bytes to use when looking for a meta element with
+        # encoding information
+        self.numBytesMeta = 1024
+        # Number of bytes read per step when detecting the encoding with chardet
+        self.numBytesChardet = 100
+        # Things from args
+        self.override_encoding = override_encoding
+        self.transport_encoding = transport_encoding
+        self.same_origin_parent_encoding = same_origin_parent_encoding
+        self.likely_encoding = likely_encoding
+        self.default_encoding = default_encoding
+
+        # Determine encoding
+        self.charEncoding = self.determineEncoding(useChardet)
+        assert self.charEncoding[0] is not None
+
+        # Reset again now that the encoding is known, so that dataStream
+        # is rebuilt with the matching stream reader
+        self.reset()
+
+    def reset(self):
+        """Rebuild the decoding stream for the current encoding, then reset state."""
+        # 'replace' substitutes undecodable input instead of raising
+        makeReader = self.charEncoding[0].codec_info.streamreader
+        self.dataStream = makeReader(self.rawStream, 'replace')
+        HTMLUnicodeInputStream.reset(self)
+
+    def openStream(self, source):
+        """Return a seekable file-like object for ``source``.
+
+        An object with a ``read`` attribute is used as is; anything else is
+        wrapped in a ``BytesIO``.  If the stream cannot tell()/seek() it is
+        wrapped in a BufferedStream.
+
+        """
+        stream = source if hasattr(source, 'read') else BytesIO(source)
+
+        # Probe for working tell()/seek(); any failure means wrapping
+        try:
+            stream.seek(stream.tell())
+        except Exception:
+            stream = BufferedStream(stream)
+
+        return stream
+
+    def determineEncoding(self, chardet=True):
+        """Work out the encoding to decode the raw stream with.
+
+        Sources are tried in order and the first usable one wins:
+        BOM, override_encoding and transport_encoding (confidence
+        "certain"), then a meta element, same_origin_parent_encoding
+        (unless it is a utf-16 variant), likely_encoding, chardet (if
+        ``chardet`` is true and the package imports), default_encoding and
+        finally windows-1252 (confidence "tentative").
+
+        Returns an (encoding, confidence) tuple.
+        """
+        # BOMs take precedence over everything
+        # This will also read past the BOM if present
+        charEncoding = self.detectBOM(), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # If we've been overridden, we've been overridden
+        charEncoding = lookupEncoding(self.override_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Now check the transport layer
+        charEncoding = lookupEncoding(self.transport_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Look for meta elements with encoding information
+        charEncoding = self.detectEncodingMeta(), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Parent document encoding
+        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
+        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
+            return charEncoding
+
+        # "likely" encoding
+        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Guess with chardet, if available
+        if chardet:
+            try:
+                from chardet.universaldetector import UniversalDetector
+            except ImportError:
+                pass
+            else:
+                detector = UniversalDetector()
+                # Feed the detector in small steps until it is confident or
+                # the stream runs out; the data itself is not kept
+                while not detector.done:
+                    buffer = self.rawStream.read(self.numBytesChardet)
+                    assert isinstance(buffer, bytes)
+                    if not buffer:
+                        break
+                    detector.feed(buffer)
+                detector.close()
+                encoding = lookupEncoding(detector.result['encoding'])
+                # Rewind so that decoding starts from the beginning
+                self.rawStream.seek(0)
+                if encoding is not None:
+                    return encoding, "tentative"
+
+        # Try the default encoding
+        charEncoding = lookupEncoding(self.default_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Fallback to html5lib's default if even that hasn't worked
+        return lookupEncoding("windows-1252"), "tentative"
+
+    def changeEncoding(self, newEncoding):
+        """Switch to ``newEncoding`` while the current encoding is tentative.
+
+        An unknown label is ignored.  utf-16be/utf-16le are replaced by
+        utf-8 before anything else is decided.  If the resulting encoding
+        equals the one in use, its confidence becomes "certain".  Otherwise
+        the raw stream is rewound, the stream is reset with the new
+        encoding and _ReparseException is raised.
+
+        Raises:
+            _ReparseException: the encoding changed and the input must be
+                processed again from the start.
+        """
+        assert self.charEncoding[1] != "certain"
+        newEncoding = lookupEncoding(newEncoding)
+        if newEncoding is None:
+            return
+        if newEncoding.name in ("utf-16be", "utf-16le"):
+            # The substituted utf-8 must go through the comparison below
+            # rather than ending the method
+            newEncoding = lookupEncoding("utf-8")
+            assert newEncoding is not None
+
+        # Captured before charEncoding is overwritten, so that the
+        # exception message names the encoding that was actually replaced
+        oldEncoding = self.charEncoding[0]
+        if newEncoding == oldEncoding:
+            self.charEncoding = (oldEncoding, "certain")
+            return
+
+        self.rawStream.seek(0)
+        self.charEncoding = (newEncoding, "certain")
+        self.reset()
+        raise _ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))
+
+    def detectBOM(self):
+        """Attempts to detect a BOM at the start of the stream.
+
+        If one is recognised the stream is positioned just past it and the
+        result of lookupEncoding for the matching encoding is returned.
+        Otherwise the stream is rewound to offset 0 and None is returned."""
+        bomDict = {
+            codecs.BOM_UTF8: 'utf-8',
+            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
+            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
+        }
+
+        # Read up to 4 bytes, the length of the longest BOM looked for
+        head = self.rawStream.read(4)
+        assert isinstance(head, bytes)
+
+        # Probe by BOM length: UTF-8 (3 bytes) first, then UTF-32 (4 bytes),
+        # which needs to be detected before UTF-16 (2 bytes)
+        encoding = None
+        for bomLength in (3, 4, 2):
+            encoding = bomDict.get(head[:bomLength])
+            if encoding:
+                break
+
+        if not encoding:
+            self.rawStream.seek(0)
+            return None
+
+        self.rawStream.seek(bomLength)
+        return lookupEncoding(encoding)
+
+    def detectEncodingMeta(self):
+        """Report the encoding declared by the meta element
+
+        Only the first ``numBytesMeta`` bytes are examined and the stream is
+        rewound afterwards.  A declared utf-16be/utf-16le is reported as
+        utf-8.
+        """
+        head = self.rawStream.read(self.numBytesMeta)
+        assert isinstance(head, bytes)
+        prescanner = EncodingParser(head)
+        self.rawStream.seek(0)
+        encoding = prescanner.getEncoding()
+
+        if encoding is None or encoding.name not in ("utf-16be", "utf-16le"):
+            return encoding
+        return lookupEncoding("utf-8")
+
+
+class EncodingBytes(bytes):
+    """Lower-cased ``bytes`` with a read cursor and scanning helpers.
+
+    Reading or writing the cursor through ``position`` raises
+    StopIteration once the cursor has reached or passed the end of the data.
+    """
+    def __new__(cls, value):
+        assert isinstance(value, bytes)
+        return bytes.__new__(cls, value.lower())
+
+    def __init__(self, value):
+        # pylint:disable=unused-argument
+        # The cursor starts before the first byte
+        self._position = -1
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Advance the cursor by one byte and return that byte."""
+        self._position += 1
+        cursor = self._position
+        if cursor >= len(self):
+            raise StopIteration
+        if cursor < 0:
+            raise TypeError
+        return self[cursor:cursor + 1]
+
+    def next(self):
+        # Py2 compat
+        return self.__next__()
+
+    def previous(self):
+        """Move the cursor back by one byte and return that byte."""
+        cursor = self._position
+        if cursor >= len(self):
+            raise StopIteration
+        if cursor < 0:
+            raise TypeError
+        cursor -= 1
+        self._position = cursor
+        return self[cursor:cursor + 1]
+
+    def setPosition(self, position):
+        # The check is made against the cursor as it was before the update
+        if self._position >= len(self):
+            raise StopIteration
+        self._position = position
+
+    def getPosition(self):
+        cursor = self._position
+        if cursor >= len(self):
+            raise StopIteration
+        # A cursor still before the first byte is reported as None
+        return cursor if cursor >= 0 else None
+
+    position = property(getPosition, setPosition)
+
+    def getCurrentByte(self):
+        start = self.position
+        return self[start:start + 1]
+
+    currentByte = property(getCurrentByte)
+
+    def skip(self, chars=spaceCharactersBytes):
+        """Skip past a list of characters
+
+        Returns the first byte not in ``chars`` with the cursor left on it,
+        or None with the cursor at the end if there is no such byte."""
+        index = self.position               # use property for the error-checking
+        total = len(self)
+        while index < total:
+            current = self[index:index + 1]
+            if current not in chars:
+                self._position = index
+                return current
+            index += 1
+        self._position = index
+        return None
+
+    def skipUntil(self, chars):
+        """Advance to the first byte in ``chars`` and return it, or None
+        with the cursor at the end if no such byte follows."""
+        index = self.position
+        total = len(self)
+        while index < total:
+            current = self[index:index + 1]
+            if current in chars:
+                self._position = index
+                return current
+            index += 1
+        self._position = index
+        return None
+
+    def matchBytes(self, bytes):
+        """Look for a sequence of bytes at the current position. If the bytes
+        are found return True and advance the position to the byte after the
+        match. Otherwise return False and leave the position alone"""
+        matched = self.startswith(bytes, self.position)
+        if matched:
+            self.position += len(bytes)
+        return matched
+
+    def jumpTo(self, bytes):
+        """Look for the next sequence of bytes matching a given sequence. If
+        a match is found advance the position to the last byte of the match
+        and return True; otherwise raise StopIteration"""
+        try:
+            found = self.index(bytes, self.position)
+        except ValueError:
+            raise StopIteration
+        self._position = found + len(bytes) - 1
+        return True
+
+
+class EncodingParser(object):
+    """Mini parser for detecting character encoding from meta elements"""
+
+    def __init__(self, data):
+        """string - the data to work on for encoding detection"""
+        # EncodingBytes lower-cases the data and tracks the read cursor
+        self.data = EncodingBytes(data)
+        self.encoding = None
+
+    def getEncoding(self):
+        """Scan the data and return the encoding a meta element declares.
+
+        Returns None when no usable declaration is found.
+        """
+        # Cheap early exit: without "<meta" there is nothing to find
+        if b"<meta" not in self.data:
+            return None
+
+        # Order matters: the first prefix that matches at the cursor wins,
+        # so the more specific prefixes come before the bare "<"
+        methodDispatch = (
+            (b"<!--", self.handleComment),
+            (b"<meta", self.handleMeta),
+            (b"</", self.handlePossibleEndTag),
+            (b"<!", self.handleOther),
+            (b"<?", self.handleOther),
+            (b"<", self.handlePossibleStartTag))
+        # Iterating over self.data advances its cursor by one byte per pass
+        for _ in self.data:
+            keepParsing = True
+            try:
+                self.data.jumpTo(b"<")
+            except StopIteration:
+                break
+            for key, method in methodDispatch:
+                if self.data.matchBytes(key):
+                    try:
+                        keepParsing = method()
+                        break
+                    except StopIteration:
+                        # The handler ran off the end of the data
+                        keepParsing = False
+                        break
+            if not keepParsing:
+                break
+
+        return self.encoding
+
+    def handleComment(self):
+        """Skip over comments"""
+        return self.data.jumpTo(b"-->")
+
+    def handleMeta(self):
+        """Examine the attributes of a meta element for an encoding.
+
+        Returns False once self.encoding has been set (stop parsing) and
+        True when scanning should continue.
+        """
+        if self.data.currentByte not in spaceCharactersBytes:
+            # if we have <meta not followed by a space so just keep going
+            return True
+        # We have a valid meta element we want to search for attributes
+        hasPragma = False
+        # Encoding found in a content attribute before http-equiv was seen
+        pendingEncoding = None
+        while True:
+            # Try to find the next attribute after the current position
+            attr = self.getAttribute()
+            if attr is None:
+                return True
+            else:
+                if attr[0] == b"http-equiv":
+                    hasPragma = attr[1] == b"content-type"
+                    if hasPragma and pendingEncoding is not None:
+                        self.encoding = pendingEncoding
+                        return False
+                elif attr[0] == b"charset":
+                    tentativeEncoding = attr[1]
+                    codec = lookupEncoding(tentativeEncoding)
+                    if codec is not None:
+                        self.encoding = codec
+                        return False
+                elif attr[0] == b"content":
+                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
+                    tentativeEncoding = contentParser.parse()
+                    if tentativeEncoding is not None:
+                        codec = lookupEncoding(tentativeEncoding)
+                        if codec is not None:
+                            if hasPragma:
+                                self.encoding = codec
+                                return False
+                            else:
+                                pendingEncoding = codec
+
+    def handlePossibleStartTag(self):
+        return self.handlePossibleTag(False)
+
+    def handlePossibleEndTag(self):
+        # Step over one more byte before examining the tag name
+        next(self.data)
+        return self.handlePossibleTag(True)
+
+    def handlePossibleTag(self, endTag):
+        """Skip over a start or end tag together with its attributes.
+
+        Always returns True (keep parsing).
+        """
+        data = self.data
+        if data.currentByte not in asciiLettersBytes:
+            # If the next byte is not an ascii letter either ignore this
+            # fragment (possible start tag case) or treat it according to
+            # handleOther
+            if endTag:
+                data.previous()
+                self.handleOther()
+            return True
+
+        c = data.skipUntil(spacesAngleBrackets)
+        if c == b"<":
+            # return to the first step in the overall "two step" algorithm
+            # reprocessing the < byte
+            data.previous()
+        else:
+            # Read all attributes
+            attr = self.getAttribute()
+            while attr is not None:
+                attr = self.getAttribute()
+        return True
+
+    def handleOther(self):
+        """Skip ahead to the next ">"."""
+        return self.data.jumpTo(b">")
+
+    def getAttribute(self):
+        """Return a name,value pair for the next attribute in the stream,
+        if one is found, or None
+
+        Names and values are returned as lower-cased bytes.  Running off
+        the end of the data inside next() raises StopIteration, which
+        getEncoding catches."""
+        data = self.data
+        # Step 1 (skip chars)
+        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
+        assert c is None or len(c) == 1
+        # Step 2
+        if c in (b">", None):
+            return None
+        # Step 3
+        attrName = []
+        attrValue = []
+        # Step 4 attribute name
+        while True:
+            if c == b"=" and attrName:
+                break
+            elif c in spaceCharactersBytes:
+                # Step 6!
+                c = data.skip()
+                break
+            elif c in (b"/", b">"):
+                return b"".join(attrName), b""
+            elif c in asciiUppercaseBytes:
+                attrName.append(c.lower())
+            elif c is None:
+                return None
+            else:
+                attrName.append(c)
+            # Step 5
+            c = next(data)
+        # Step 7
+        if c != b"=":
+            # No value follows: step back so the byte is seen again
+            data.previous()
+            return b"".join(attrName), b""
+        # Step 8
+        next(data)
+        # Step 9
+        c = data.skip()
+        # Step 10
+        if c in (b"'", b'"'):
+            # 10.1
+            quoteChar = c
+            while True:
+                # 10.2
+                c = next(data)
+                # 10.3
+                if c == quoteChar:
+                    next(data)
+                    return b"".join(attrName), b"".join(attrValue)
+                # 10.4
+                elif c in asciiUppercaseBytes:
+                    attrValue.append(c.lower())
+                # 10.5
+                else:
+                    attrValue.append(c)
+        elif c == b">":
+            return b"".join(attrName), b""
+        elif c in asciiUppercaseBytes:
+            attrValue.append(c.lower())
+        elif c is None:
+            return None
+        else:
+            attrValue.append(c)
+        # Step 11
+        while True:
+            c = next(data)
+            if c in spacesAngleBrackets:
+                return b"".join(attrName), b"".join(attrValue)
+            elif c in asciiUppercaseBytes:
+                attrValue.append(c.lower())
+            elif c is None:
+                return None
+            else:
+                attrValue.append(c)
+
+
+class ContentAttrParser(object):
+    """Extracts the charset value from a meta element's content attribute."""
+
+    def __init__(self, data):
+        # parse() relies on the cursor helpers (jumpTo, skip, position,
+        # ...) that EncodingBytes adds; handleMeta passes such an object
+        assert isinstance(data, bytes)
+        self.data = data
+
+    def parse(self):
+        """Return the bytes of the charset value, or None if there is none.
+
+        StopIteration raised by the cursor helpers on reaching the end of
+        the data is turned into a None result.
+        """
+        try:
+            # Check if the attr name is charset
+            # otherwise return
+            self.data.jumpTo(b"charset")
+            # jumpTo leaves the cursor on the last byte of "charset"
+            self.data.position += 1
+            self.data.skip()
+            if not self.data.currentByte == b"=":
+                # No = sign after the first "charset": give up on this
+                # value (later occurrences are not examined)
+                return None
+            self.data.position += 1
+            self.data.skip()
+            # Look for an encoding between matching quote marks
+            if self.data.currentByte in (b'"', b"'"):
+                quoteMark = self.data.currentByte
+                self.data.position += 1
+                oldPosition = self.data.position
+                # jumpTo returns True or raises StopIteration; a missing
+                # closing quote is handled by the outer except clause
+                if self.data.jumpTo(quoteMark):
+                    return self.data[oldPosition:self.data.position]
+                else:
+                    return None
+            else:
+                # Unquoted value
+                oldPosition = self.data.position
+                try:
+                    self.data.skipUntil(spaceCharactersBytes)
+                    # Reading position raises StopIteration when skipUntil
+                    # ran to the end without finding a space character
+                    return self.data[oldPosition:self.data.position]
+                except StopIteration:
+                    # Return the whole remaining value
+                    return self.data[oldPosition:]
+        except StopIteration:
+            return None
+
+
+def lookupEncoding(encoding):
+    """Return the result of ``webencodings.lookup`` for an encoding label.
+
+    None is returned when ``encoding`` is None, is bytes that are not
+    ASCII, or makes the lookup raise AttributeError."""
+    if isinstance(encoding, bytes):
+        try:
+            encoding = encoding.decode("ascii")
+        except UnicodeDecodeError:
+            return None
+
+    if encoding is None:
+        return None
+
+    try:
+        return webencodings.lookup(encoding)
+    except AttributeError:
+        return None
--- a/lib/bleach/_vendor/html5lib/_tokenizer.py
+++ b/lib/bleach/_vendor/html5lib/_tokenizer.py
--- a/lib/bleach/_vendor/html5lib/_trie/init.py
+++ b/lib/bleach/_vendor/html5lib/_trie/init.py
@ -0,0 +1,5 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from .py import Trie
+
+__all__ = ["Trie"]
--- a/lib/bleach/_vendor/html5lib/_trie/_base.py
+++ b/lib/bleach/_vendor/html5lib/_trie/_base.py
@ -0,0 +1,40 @@
+from __future__ import absolute_import, division, unicode_literals
+
+try:
+    from collections.abc import Mapping
+except ImportError:  # Python 2.7
+    from collections import Mapping
+
+
+class Trie(Mapping):
+    """Abstract base class for tries"""
+
+    def keys(self, prefix=None):
+        """Return the set of keys, limited to those starting with ``prefix``
+        when one is given."""
+        # pylint:disable=arguments-differ
+        all_keys = super(Trie, self).keys()
+
+        if prefix is not None:
+            return {key for key in all_keys if key.startswith(prefix)}
+
+        return set(all_keys)
+
+    def has_keys_with_prefix(self, prefix):
+        """Return True if at least one key starts with ``prefix``."""
+        return any(key.startswith(prefix) for key in self.keys())
+
+    def longest_prefix(self, prefix):
+        """Return the longest leading slice of ``prefix`` that is a key.
+
+        Raises ``KeyError`` when no slice (down to the empty string) is a key.
+        """
+        if prefix in self:
+            return prefix
+
+        # Try successively shorter leading slices, ending with the empty one.
+        for end in range(len(prefix) - 1, -1, -1):
+            candidate = prefix[:end]
+            if candidate in self:
+                return candidate
+
+        raise KeyError(prefix)
+
+    def longest_prefix_item(self, prefix):
+        """Return ``(key, value)`` for the key found by ``longest_prefix``."""
+        match = self.longest_prefix(prefix)
+        return (match, self[match])
--- a/lib/bleach/_vendor/html5lib/_trie/py.py
+++ b/lib/bleach/_vendor/html5lib/_trie/py.py
@ -0,0 +1,67 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from bisect import bisect_left
+
+from ._base import Trie as ABCTrie
+
+
+class Trie(ABCTrie):
+    """Trie backed by a mapping plus a sorted list of its keys.
+
+    Prefix queries bisect the sorted key list. The range matched by the most
+    recent ``keys(prefix)`` scan is remembered so that a query for a longer
+    prefix can search inside that range only.
+    """
+
+    def __init__(self, data):
+        """Build a trie over ``data``.
+
+        :arg data: mapping whose keys are all text strings
+
+        :raises TypeError: if any key is not a ``text_type`` instance
+
+        """
+        if not all(isinstance(x, text_type) for x in data.keys()):
+            raise TypeError("All keys must be strings")
+
+        self._data = data
+        self._keys = sorted(data.keys())
+        # Prefix of the last keys() scan and the [start, stop) index range of
+        # self._keys that it matched.
+        self._cachestr = ""
+        self._cachepoints = (0, len(data))
+
+    def __contains__(self, key):
+        return key in self._data
+
+    def __len__(self):
+        return len(self._data)
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __getitem__(self, key):
+        return self._data[key]
+
+    def keys(self, prefix=None):
+        """Return the set of keys starting with ``prefix``.
+
+        With no prefix, an empty prefix, or an empty trie, every key is
+        returned.
+        """
+        if prefix is None or prefix == "" or not self._keys:
+            return set(self._keys)
+
+        if prefix.startswith(self._cachestr):
+            lo, hi = self._cachepoints
+            start = i = bisect_left(self._keys, prefix, lo, hi)
+        else:
+            start = i = bisect_left(self._keys, prefix)
+
+        keys = set()
+        total = len(self._keys)
+        if start == total:
+            return keys
+
+        # Matching keys are contiguous in the sorted list. The index must be
+        # bounded: when the matches run through the last key, an unbounded
+        # scan reads one past the end and raises IndexError.
+        while i < total and self._keys[i].startswith(prefix):
+            keys.add(self._keys[i])
+            i += 1
+
+        self._cachestr = prefix
+        self._cachepoints = (start, i)
+
+        return keys
+
+    def has_keys_with_prefix(self, prefix):
+        """Return True if ``prefix`` is a key or a leading part of one."""
+        if prefix in self._data:
+            return True
+
+        if prefix.startswith(self._cachestr):
+            lo, hi = self._cachepoints
+            i = bisect_left(self._keys, prefix, lo, hi)
+        else:
+            i = bisect_left(self._keys, prefix)
+
+        if i == len(self._keys):
+            return False
+
+        # The first key not less than the prefix is the only candidate.
+        return self._keys[i].startswith(prefix)
--- a/lib/bleach/_vendor/html5lib/_utils.py
+++ b/lib/bleach/_vendor/html5lib/_utils.py
@ -0,0 +1,159 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from types import ModuleType
+
+try:
+    from collections.abc import Mapping
+except ImportError:
+    from collections import Mapping
+
+from six import text_type, PY3
+
+if PY3:
+    import xml.etree.ElementTree as default_etree
+else:
+    try:
+        import xml.etree.cElementTree as default_etree
+    except ImportError:
+        import xml.etree.ElementTree as default_etree
+
+
+__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    # The literal is built through eval(), so a failure to create it shows up
+    # as an exception inside this try block.
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
+        assert isinstance(_x, text_type)
+except Exception:
+    # Any failure above is taken to mean lone surrogates are unsupported.
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True
+
+
+class MethodDispatcher(dict):
+    """Dict with two extra behaviours.
+
+    It is built from an iterable of ``(name, value)`` pairs. A name that is a
+    list, tuple, set or frozenset is expanded so that each of its members
+    becomes a key for the same value:
+
+    md = MethodDispatcher([(("foo", "bar"), "baz")])
+    md["foo"] == "baz"
+
+    Looking up a missing key returns the ``default`` attribute (initially
+    ``None``) instead of raising ``KeyError``.
+    """
+
+    def __init__(self, items=()):
+        expanded = []
+        for name, value in items:
+            if isinstance(name, (list, tuple, frozenset, set)):
+                group = name
+            else:
+                group = (name,)
+            expanded.extend((member, value) for member in group)
+        dict.__init__(self, expanded)
+        # Guards against the same key being supplied more than once.
+        assert len(self) == len(expanded)
+        self.default = None
+
+    def __getitem__(self, key):
+        fallback = self.default
+        return dict.get(self, key, fallback)
+
+    def __get__(self, instance, owner=None):
+        # Descriptor hook: attribute access yields a bound view.
+        return BoundMethodDispatcher(instance, self)
+
+
+class BoundMethodDispatcher(Mapping):
+    """Mapping view of a dispatcher whose values are bound to `instance`"""
+
+    def __init__(self, instance, dispatcher):
+        self.dispatcher = dispatcher
+        self.instance = instance
+
+    def __getitem__(self, key):
+        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
+        # calling __get__ on a function binds it to the instance as a method
+        handler = self.dispatcher[key]
+        return handler.__get__(self.instance)
+
+    def get(self, key, default):
+        return self[key] if key in self.dispatcher else default
+
+    def __iter__(self):
+        return iter(self.dispatcher)
+
+    def __len__(self):
+        return len(self.dispatcher)
+
+    def __contains__(self, key):
+        return key in self.dispatcher
+
+
+# Some utility functions to deal with weirdness around UCS2 vs UCS4
+# python builds
+
+def isSurrogatePair(data):
+    """Return True if ``data`` is exactly a high surrogate followed by a low
+    surrogate."""
+    return (len(data) == 2 and
+            0xD800 <= ord(data[0]) <= 0xDBFF and
+            0xDC00 <= ord(data[1]) <= 0xDFFF)
+
+
+def surrogatePairToCodepoint(data):
+    """Combine the surrogate pair in ``data[0]`` and ``data[1]`` into the code
+    point it encodes."""
+    lead = ord(data[0]) - 0xD800
+    trail = ord(data[1]) - 0xDC00
+    return 0x10000 + lead * 0x400 + trail
+
+# Module Factory Factory (no, this isn't Java, I know)
+# Here to stop this being duplicated all over the place.
+
+
+def moduleFactoryFactory(factory):
+    """Wrap ``factory`` so that the modules it describes are built once.
+
+    ``factory(baseModule, *args, **kwargs)`` must return a mapping of names
+    to objects. The returned ``moduleFactory`` puts those objects into a new
+    module named ``_<baseModule.__name__>_factory`` and caches that module
+    per (module name, args, kwargs) combination, so a repeated call with the
+    same arguments returns the same module object.
+    """
+    moduleCache = {}
+
+    def moduleFactory(baseModule, *args, **kwargs):
+        # The module name must be of the same string type as __name__ itself.
+        if isinstance(ModuleType.__name__, type("")):
+            name = "_%s_factory" % baseModule.__name__
+        else:
+            name = b"_%s_factory" % baseModule.__name__
+
+        kwargs_tuple = tuple(kwargs.items())
+
+        try:
+            return moduleCache[name][args][kwargs_tuple]
+        except KeyError:
+            mod = ModuleType(name)
+            objs = factory(baseModule, *args, **kwargs)
+            mod.__dict__.update(objs)
+            # Create only the cache levels that are missing. Testing for the
+            # literal strings "name"/"args"/"kwargs" reset the existing
+            # levels on every miss and threw away previously cached modules.
+            per_name = moduleCache.setdefault(name, {})
+            per_args = per_name.setdefault(args, {})
+            per_args[kwargs_tuple] = mod
+            return mod
+
+    return moduleFactory
+
+
+def memoize(func):
+    """Return a wrapper that caches ``func`` results per call arguments.
+
+    All positional and keyword argument values must be hashable. Keyword
+    arguments are sorted by name when building the cache key, so the order
+    in which they are written at the call site does not cause a second
+    computation.
+    """
+    cache = {}
+
+    def wrapped(*args, **kwargs):
+        # Keyword names are unique, so sorting never compares the values.
+        key = (tuple(args), tuple(sorted(kwargs.items())))
+        if key not in cache:
+            cache[key] = func(*args, **kwargs)
+        return cache[key]
+
+    return wrapped
--- a/lib/bleach/_vendor/html5lib/constants.py
+++ b/lib/bleach/_vendor/html5lib/constants.py
--- a/lib/bleach/_vendor/html5lib/filters/init.py
+++ b/lib/bleach/_vendor/html5lib/filters/init.py
--- a/lib/bleach/_vendor/html5lib/filters/alphabeticalattributes.py
+++ b/lib/bleach/_vendor/html5lib/filters/alphabeticalattributes.py
@ -0,0 +1,29 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+from collections import OrderedDict
+
+
+def _attr_key(attr):
+    """Build the sort key for one ``((namespace, name), value)`` item
+
+    The namespace is either ``None`` or a string. Those two types cannot be
+    ordered against each other, so a falsy namespace is replaced by an empty
+    string before it goes into the key.
+
+    """
+    qualified_name = attr[0]
+    namespace = qualified_name[0] or ''
+    return namespace, qualified_name[1]
+
+
+class Filter(base.Filter):
+    """Alphabetizes attributes for elements"""
+    def __iter__(self):
+        for token in base.Filter.__iter__(self):
+            if token["type"] in ("StartTag", "EmptyTag"):
+                # Rebuild the attribute mapping in sorted order; other token
+                # types pass through untouched.
+                ordered = sorted(token["data"].items(), key=_attr_key)
+                token["data"] = OrderedDict(ordered)
+            yield token
--- a/lib/bleach/_vendor/html5lib/filters/base.py
+++ b/lib/bleach/_vendor/html5lib/filters/base.py
@ -0,0 +1,12 @@
+from __future__ import absolute_import, division, unicode_literals
+
+
+class Filter(object):
+    """Base class for filters: wraps a source and delegates to it.
+
+    Iterating a filter iterates its source, and any attribute the filter
+    does not define itself is looked up on the source.
+    """
+
+    def __init__(self, source):
+        """Creates a Filter
+
+        :arg source: the object to wrap and iterate over
+
+        """
+        self.source = source
+
+    def __iter__(self):
+        return iter(self.source)
+
+    def __getattr__(self, name):
+        # Only reached when normal lookup fails. If ``source`` itself is
+        # missing (an instance created without __init__, as copy and pickle
+        # do), reading self.source below would re-enter this method without
+        # end; report the attribute as absent instead.
+        if name == "source":
+            raise AttributeError(name)
+        return getattr(self.source, name)
--- a/lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py
+++ b/lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py
@ -0,0 +1,73 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+
+class Filter(base.Filter):
+    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
+    def __init__(self, source, encoding):
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg encoding: the encoding to set
+
+        """
+        base.Filter.__init__(self, source)
+        self.encoding = encoding
+
+    def __iter__(self):
+        """Yield the source tokens with the charset declaration applied.
+
+        An existing ``meta`` charset (or ``http-equiv`` content-type) is
+        rewritten to ``self.encoding``. If none has been seen by the time
+        the head ends, a new ``meta`` tag is emitted right after the head
+        start tag.
+        """
+        state = "pre_head"
+        # With no encoding there is nothing to inject, so start out as if the
+        # declaration had already been found.
+        meta_found = (self.encoding is None)
+        # Tokens held back while inside <head>, starting with the head start
+        # tag itself.
+        pending = []
+
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type == "StartTag":
+                if token["name"].lower() == "head":
+                    state = "in_head"
+
+            elif type == "EmptyTag":
+                if token["name"].lower() == "meta":
+                    # replace charset with actual encoding
+                    has_http_equiv_content_type = False
+                    for (namespace, name), value in token["data"].items():
+                        if namespace is not None:
+                            continue
+                        elif name.lower() == 'charset':
+                            token["data"][(namespace, name)] = self.encoding
+                            meta_found = True
+                            break
+                        elif name == 'http-equiv' and value.lower() == 'content-type':
+                            has_http_equiv_content_type = True
+                    else:
+                        # Runs only when no charset attribute ended the loop:
+                        # fall back to rewriting the content attribute.
+                        if has_http_equiv_content_type and (None, "content") in token["data"]:
+                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
+                            meta_found = True
+
+                elif token["name"].lower() == "head" and not meta_found:
+                    # insert meta into empty head
+                    yield {"type": "StartTag", "name": "head",
+                           "data": token["data"]}
+                    yield {"type": "EmptyTag", "name": "meta",
+                           "data": {(None, "charset"): self.encoding}}
+                    yield {"type": "EndTag", "name": "head"}
+                    meta_found = True
+                    # The empty head token has been replaced by the three
+                    # tokens above, so it must not be emitted again.
+                    continue
+
+            elif type == "EndTag":
+                if token["name"].lower() == "head" and pending:
+                    # insert meta into head (if necessary) and flush pending queue
+                    yield pending.pop(0)
+                    if not meta_found:
+                        yield {"type": "EmptyTag", "name": "meta",
+                               "data": {(None, "charset"): self.encoding}}
+                    while pending:
+                        yield pending.pop(0)
+                    meta_found = True
+                    state = "post_head"
+
+            # Inside the head, tokens are buffered until its end tag arrives;
+            # everywhere else they are passed straight through.
+            if state == "in_head":
+                pending.append(token)
+            else:
+                yield token
--- a/lib/bleach/_vendor/html5lib/filters/lint.py
+++ b/lib/bleach/_vendor/html5lib/filters/lint.py
@ -0,0 +1,93 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import text_type
+
+from . import base
+from ..constants import namespaces, voidElements
+
+from ..constants import spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+
+class Filter(base.Filter):
+    """Lints the token stream for errors
+
+    If it finds any errors, it'll raise an ``AssertionError``.
+
+    """
+    def __init__(self, source, require_matching_tags=True):
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg require_matching_tags: whether or not to require matching tags
+
+        """
+        super(Filter, self).__init__(source)
+        self.require_matching_tags = require_matching_tags
+
+    def __iter__(self):
+        """Check each source token, then yield it unchanged."""
+        # Stack of (namespace, name) for start tags still awaiting an end tag.
+        open_elements = []
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type in ("StartTag", "EmptyTag"):
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                assert isinstance(token["data"], dict)
+                # HTML void elements must be EmptyTag; everything else
+                # must be StartTag.
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert type == "EmptyTag"
+                else:
+                    assert type == "StartTag"
+                if type == "StartTag" and self.require_matching_tags:
+                    open_elements.append((namespace, name))
+                for (namespace, name), value in token["data"].items():
+                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace != ""
+                    assert isinstance(name, text_type)
+                    assert name != ""
+                    assert isinstance(value, text_type)
+
+            elif type == "EndTag":
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+                elif self.require_matching_tags:
+                    start = open_elements.pop()
+                    assert start == (namespace, name)
+
+            elif type == "Comment":
+                data = token["data"]
+                assert isinstance(data, text_type)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                data = token["data"]
+                assert isinstance(data, text_type)
+                assert data != ""
+                if type == "SpaceCharacters":
+                    assert data.strip(spaceCharacters) == ""
+
+            elif type == "Doctype":
+                name = token["name"]
+                assert name is None or isinstance(name, text_type)
+                # Each identifier is checked against its own value; testing
+                # ``name`` here let non-text identifiers through and rejected
+                # valid ones whenever the doctype had no name.
+                assert token["publicId"] is None or isinstance(token["publicId"], text_type)
+                assert token["systemId"] is None or isinstance(token["systemId"], text_type)
+
+            elif type == "Entity":
+                assert isinstance(token["name"], text_type)
+
+            elif type == "SerializerError":
+                assert isinstance(token["data"], text_type)
+
+            else:
+                assert False, "Unknown token type: %(type)s" % {"type": type}
+
+            yield token
--- a/lib/bleach/_vendor/html5lib/filters/optionaltags.py
+++ b/lib/bleach/_vendor/html5lib/filters/optionaltags.py
@ -0,0 +1,207 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+
+class Filter(base.Filter):
+    """Removes optional tags from the token stream"""
+    def slider(self):
+        """Yield ``(previous, current, next)`` triples over the source.
+
+        ``previous`` is ``None`` for the first token and ``next`` is ``None``
+        for the last one.
+        """
+        before = current = None
+        for upcoming in self.source:
+            if current is not None:
+                yield before, current, upcoming
+            before, current = current, upcoming
+        if current is not None:
+            yield before, current, None
+
+    def __iter__(self):
+        """Yield the source tokens, leaving out omissible start and end tags.
+
+        A start tag is only ever dropped when it carries no attributes.
+        """
+        for previous, token, following in self.slider():
+            kind = token["type"]
+            if kind == "StartTag":
+                droppable = (not token["data"] and
+                             self.is_optional_start(token["name"], previous, following))
+                if not droppable:
+                    yield token
+            elif kind == "EndTag":
+                if not self.is_optional_end(token["name"], following):
+                    yield token
+            else:
+                yield token
+
+    def is_optional_start(self, tagname, previous, next):
+        """Return whether the start tag of ``tagname`` may be left out.
+
+        ``previous`` and ``next`` are the tokens on either side of the start
+        tag; each is ``None`` at the matching end of the stream. Only
+        ``html``, ``head``, ``body``, ``colgroup`` and ``tbody`` start tags
+        are ever reported as optional.
+        """
+        type = next and next["type"] or None
+        # Compare for equality: a substring test ('in') would also accept
+        # names such as 'h', 'tm' or the empty string.
+        if tagname == 'html':
+            # An html element's start tag may be omitted if the first thing
+            # inside the html element is not a space character or a comment.
+            return type not in ("Comment", "SpaceCharacters")
+        elif tagname == 'head':
+            # A head element's start tag may be omitted if the first thing
+            # inside the head element is an element.
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
+        elif tagname == 'body':
+            # A body element's start tag may be omitted if the first thing
+            # inside the body element is not a space character or a comment,
+            # except if the first thing inside the body element is a script
+            # or style element and the node immediately preceding the body
+            # element is a head element whose end tag has been omitted.
+            if type in ("Comment", "SpaceCharacters"):
+                return False
+            elif type == "StartTag":
+                # XXX: we do not look at the preceding event, so we never omit
+                # the body element's start tag if it's followed by a script or
+                # a style element.
+                return next["name"] not in ('script', 'style')
+            else:
+                return True
+        elif tagname == 'colgroup':
+            # A colgroup element's start tag may be omitted if the first thing
+            # inside the colgroup element is a col element, and if the element
+            # is not immediately preceded by another colgroup element whose
+            # end tag has been omitted.
+            if type in ("StartTag", "EmptyTag"):
+                # XXX: we do not look at the preceding event, so instead we never
+                # omit the colgroup element's end tag when it is immediately
+                # followed by another colgroup element. See is_optional_end.
+                return next["name"] == "col"
+            else:
+                return False
+        elif tagname == 'tbody':
+            # A tbody element's start tag may be omitted if the first thing
+            # inside the tbody element is a tr element, and if the element is
+            # not immediately preceded by a tbody, thead, or tfoot element
+            # whose end tag has been omitted.
+            if type == "StartTag":
+                # omit the thead and tfoot elements' end tag when they are
+                # immediately followed by a tbody element. See is_optional_end.
+                if previous and previous['type'] == 'EndTag' and \
+                        previous['name'] in ('tbody', 'thead', 'tfoot'):
+                    return False
+                return next["name"] == 'tr'
+            else:
+                return False
+        # Every other tag, and a head start tag followed by anything but an
+        # element or an end tag, must be kept.
+        return False
+
+    def is_optional_end(self, tagname, next):
+        """Return whether the end tag of ``tagname`` may be left out.
+
+        ``next`` is the token that follows the end tag, or ``None`` when the
+        end tag is the last token of the stream.
+        """
+        kind = (next["type"] or None) if next else None
+        # True when the parent element has no more content: either its own
+        # end tag follows or the stream is over.
+        parent_done = kind == "EndTag" or kind is None
+
+        if tagname in ('html', 'head', 'body'):
+            # Omissible unless a space character or a comment comes next.
+            return kind not in ("Comment", "SpaceCharacters")
+
+        if tagname in ('li', 'optgroup', 'tr'):
+            # Omissible when immediately followed by another element of the
+            # same name, or when the parent has no more content.
+            if kind == "StartTag":
+                return next["name"] == tagname
+            return parent_done
+
+        if tagname in ('dt', 'dd'):
+            # Both are omissible before another dt or dd element; only dd
+            # is also omissible when the parent has no more content.
+            if kind == "StartTag":
+                return next["name"] in ('dt', 'dd')
+            if tagname == 'dd':
+                return parent_done
+            return False
+
+        if tagname == 'p':
+            # Omissible when immediately followed by one of the elements
+            # listed below, or when the parent has no more content.
+            if kind in ("StartTag", "EmptyTag"):
+                return next["name"] in ('address', 'article', 'aside',
+                                        'blockquote', 'datagrid', 'dialog',
+                                        'dir', 'div', 'dl', 'fieldset', 'footer',
+                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                        'header', 'hr', 'menu', 'nav', 'ol',
+                                        'p', 'pre', 'section', 'table', 'ul')
+            return parent_done
+
+        if tagname == 'option':
+            # Omissible before another option or an optgroup element, or
+            # when the parent has no more content.
+            if kind == "StartTag":
+                return next["name"] in ('option', 'optgroup')
+            return parent_done
+
+        if tagname in ('rt', 'rp'):
+            # Omissible before an rt or rp element, or when the parent has
+            # no more content.
+            if kind == "StartTag":
+                return next["name"] in ('rt', 'rp')
+            return parent_done
+
+        if tagname == 'colgroup':
+            # Omissible unless a space character or a comment comes next.
+            if kind in ("Comment", "SpaceCharacters"):
+                return False
+            if kind == "StartTag":
+                # XXX: kept when another colgroup follows directly, because
+                # is_optional_start does not look at the preceding token.
+                return next["name"] != 'colgroup'
+            return True
+
+        if tagname in ('thead', 'tbody'):
+            # Both are omissible before a tbody or tfoot element; only
+            # tbody is also omissible when the parent has no more content.
+            if kind == "StartTag":
+                return next["name"] in ['tbody', 'tfoot']
+            if tagname == 'tbody':
+                return parent_done
+            return False
+
+        if tagname == 'tfoot':
+            # Omissible before a tbody element, or when the parent has no
+            # more content.
+            if kind == "StartTag":
+                return next["name"] == 'tbody'
+            return parent_done
+
+        if tagname in ('td', 'th'):
+            # Omissible before a td or th element, or when the parent has
+            # no more content.
+            if kind == "StartTag":
+                return next["name"] in ('td', 'th')
+            return parent_done
+
+        return False
--- a/lib/bleach/_vendor/html5lib/filters/sanitizer.py
+++ b/lib/bleach/_vendor/html5lib/filters/sanitizer.py
@ -0,0 +1,916 @@
+"""Deprecated from html5lib 1.1.
+
+See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
+information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
+is recommended as a replacement. Please let us know in the aforementioned issue
+if Bleach is unsuitable for your needs.
+
+"""
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+import warnings
+from xml.sax.saxutils import escape, unescape
+
+from six.moves import urllib_parse as urlparse
+
+from . import base
+from ..constants import namespaces, prefixes
+
+__all__ = ["Filter"]
+
+
+# Text of the DeprecationWarning; used both by the module-level warning
+# below and by Filter.__init__.
+_deprecation_msg = (
+    "html5lib's sanitizer is deprecated; see " +
+    "https://github.com/html5lib/html5lib-python/issues/443 and please let " +
+    "us know if Bleach is unsuitable for your needs"
+)
+
+# Warn as soon as this module is imported, before any Filter is created.
+warnings.warn(_deprecation_msg, DeprecationWarning)
+
+allowed_elements = frozenset((
+    (namespaces['html'], 'a'),
+    (namespaces['html'], 'abbr'),
+    (namespaces['html'], 'acronym'),
+    (namespaces['html'], 'address'),
+    (namespaces['html'], 'area'),
+    (namespaces['html'], 'article'),
+    (namespaces['html'], 'aside'),
+    (namespaces['html'], 'audio'),
+    (namespaces['html'], 'b'),
+    (namespaces['html'], 'big'),
+    (namespaces['html'], 'blockquote'),
+    (namespaces['html'], 'br'),
+    (namespaces['html'], 'button'),
+    (namespaces['html'], 'canvas'),
+    (namespaces['html'], 'caption'),
+    (namespaces['html'], 'center'),
+    (namespaces['html'], 'cite'),
+    (namespaces['html'], 'code'),
+    (namespaces['html'], 'col'),
+    (namespaces['html'], 'colgroup'),
+    (namespaces['html'], 'command'),
+    (namespaces['html'], 'datagrid'),
+    (namespaces['html'], 'datalist'),
+    (namespaces['html'], 'dd'),
+    (namespaces['html'], 'del'),
+    (namespaces['html'], 'details'),
+    (namespaces['html'], 'dfn'),
+    (namespaces['html'], 'dialog'),
+    (namespaces['html'], 'dir'),
+    (namespaces['html'], 'div'),
+    (namespaces['html'], 'dl'),
+    (namespaces['html'], 'dt'),
+    (namespaces['html'], 'em'),
+    (namespaces['html'], 'event-source'),
+    (namespaces['html'], 'fieldset'),
+    (namespaces['html'], 'figcaption'),
+    (namespaces['html'], 'figure'),
+    (namespaces['html'], 'footer'),
+    (namespaces['html'], 'font'),
+    (namespaces['html'], 'form'),
+    (namespaces['html'], 'header'),
+    (namespaces['html'], 'h1'),
+    (namespaces['html'], 'h2'),
+    (namespaces['html'], 'h3'),
+    (namespaces['html'], 'h4'),
+    (namespaces['html'], 'h5'),
+    (namespaces['html'], 'h6'),
+    (namespaces['html'], 'hr'),
+    (namespaces['html'], 'i'),
+    (namespaces['html'], 'img'),
+    (namespaces['html'], 'input'),
+    (namespaces['html'], 'ins'),
+    (namespaces['html'], 'keygen'),
+    (namespaces['html'], 'kbd'),
+    (namespaces['html'], 'label'),
+    (namespaces['html'], 'legend'),
+    (namespaces['html'], 'li'),
+    (namespaces['html'], 'm'),
+    (namespaces['html'], 'map'),
+    (namespaces['html'], 'menu'),
+    (namespaces['html'], 'meter'),
+    (namespaces['html'], 'multicol'),
+    (namespaces['html'], 'nav'),
+    (namespaces['html'], 'nextid'),
+    (namespaces['html'], 'ol'),
+    (namespaces['html'], 'output'),
+    (namespaces['html'], 'optgroup'),
+    (namespaces['html'], 'option'),
+    (namespaces['html'], 'p'),
+    (namespaces['html'], 'pre'),
+    (namespaces['html'], 'progress'),
+    (namespaces['html'], 'q'),
+    (namespaces['html'], 's'),
+    (namespaces['html'], 'samp'),
+    (namespaces['html'], 'section'),
+    (namespaces['html'], 'select'),
+    (namespaces['html'], 'small'),
+    (namespaces['html'], 'sound'),
+    (namespaces['html'], 'source'),
+    (namespaces['html'], 'spacer'),
+    (namespaces['html'], 'span'),
+    (namespaces['html'], 'strike'),
+    (namespaces['html'], 'strong'),
+    (namespaces['html'], 'sub'),
+    (namespaces['html'], 'sup'),
+    (namespaces['html'], 'table'),
+    (namespaces['html'], 'tbody'),
+    (namespaces['html'], 'td'),
+    (namespaces['html'], 'textarea'),
+    (namespaces['html'], 'time'),
+    (namespaces['html'], 'tfoot'),
+    (namespaces['html'], 'th'),
+    (namespaces['html'], 'thead'),
+    (namespaces['html'], 'tr'),
+    (namespaces['html'], 'tt'),
+    (namespaces['html'], 'u'),
+    (namespaces['html'], 'ul'),
+    (namespaces['html'], 'var'),
+    (namespaces['html'], 'video'),
+    (namespaces['mathml'], 'maction'),
+    (namespaces['mathml'], 'math'),
+    (namespaces['mathml'], 'merror'),
+    (namespaces['mathml'], 'mfrac'),
+    (namespaces['mathml'], 'mi'),
+    (namespaces['mathml'], 'mmultiscripts'),
+    (namespaces['mathml'], 'mn'),
+    (namespaces['mathml'], 'mo'),
+    (namespaces['mathml'], 'mover'),
+    (namespaces['mathml'], 'mpadded'),
+    (namespaces['mathml'], 'mphantom'),
+    (namespaces['mathml'], 'mprescripts'),
+    (namespaces['mathml'], 'mroot'),
+    (namespaces['mathml'], 'mrow'),
+    (namespaces['mathml'], 'mspace'),
+    (namespaces['mathml'], 'msqrt'),
+    (namespaces['mathml'], 'mstyle'),
+    (namespaces['mathml'], 'msub'),
+    (namespaces['mathml'], 'msubsup'),
+    (namespaces['mathml'], 'msup'),
+    (namespaces['mathml'], 'mtable'),
+    (namespaces['mathml'], 'mtd'),
+    (namespaces['mathml'], 'mtext'),
+    (namespaces['mathml'], 'mtr'),
+    (namespaces['mathml'], 'munder'),
+    (namespaces['mathml'], 'munderover'),
+    (namespaces['mathml'], 'none'),
+    (namespaces['svg'], 'a'),
+    (namespaces['svg'], 'animate'),
+    (namespaces['svg'], 'animateColor'),
+    (namespaces['svg'], 'animateMotion'),
+    (namespaces['svg'], 'animateTransform'),
+    (namespaces['svg'], 'clipPath'),
+    (namespaces['svg'], 'circle'),
+    (namespaces['svg'], 'defs'),
+    (namespaces['svg'], 'desc'),
+    (namespaces['svg'], 'ellipse'),
+    (namespaces['svg'], 'font-face'),
+    (namespaces['svg'], 'font-face-name'),
+    (namespaces['svg'], 'font-face-src'),
+    (namespaces['svg'], 'g'),
+    (namespaces['svg'], 'glyph'),
+    (namespaces['svg'], 'hkern'),
+    (namespaces['svg'], 'linearGradient'),
+    (namespaces['svg'], 'line'),
+    (namespaces['svg'], 'marker'),
+    (namespaces['svg'], 'metadata'),
+    (namespaces['svg'], 'missing-glyph'),
+    (namespaces['svg'], 'mpath'),
+    (namespaces['svg'], 'path'),
+    (namespaces['svg'], 'polygon'),
+    (namespaces['svg'], 'polyline'),
+    (namespaces['svg'], 'radialGradient'),
+    (namespaces['svg'], 'rect'),
+    (namespaces['svg'], 'set'),
+    (namespaces['svg'], 'stop'),
+    (namespaces['svg'], 'svg'),
+    (namespaces['svg'], 'switch'),
+    (namespaces['svg'], 'text'),
+    (namespaces['svg'], 'title'),
+    (namespaces['svg'], 'tspan'),
+    (namespaces['svg'], 'use'),
+))
+
+allowed_attributes = frozenset((
+    # HTML attributes
+    (None, 'abbr'),
+    (None, 'accept'),
+    (None, 'accept-charset'),
+    (None, 'accesskey'),
+    (None, 'action'),
+    (None, 'align'),
+    (None, 'alt'),
+    (None, 'autocomplete'),
+    (None, 'autofocus'),
+    (None, 'axis'),
+    (None, 'background'),
+    (None, 'balance'),
+    (None, 'bgcolor'),
+    (None, 'bgproperties'),
+    (None, 'border'),
+    (None, 'bordercolor'),
+    (None, 'bordercolordark'),
+    (None, 'bordercolorlight'),
+    (None, 'bottompadding'),
+    (None, 'cellpadding'),
+    (None, 'cellspacing'),
+    (None, 'ch'),
+    (None, 'challenge'),
+    (None, 'char'),
+    (None, 'charoff'),
+    (None, 'choff'),
+    (None, 'charset'),
+    (None, 'checked'),
+    (None, 'cite'),
+    (None, 'class'),
+    (None, 'clear'),
+    (None, 'color'),
+    (None, 'cols'),
+    (None, 'colspan'),
+    (None, 'compact'),
+    (None, 'contenteditable'),
+    (None, 'controls'),
+    (None, 'coords'),
+    (None, 'data'),
+    (None, 'datafld'),
+    (None, 'datapagesize'),
+    (None, 'datasrc'),
+    (None, 'datetime'),
+    (None, 'default'),
+    (None, 'delay'),
+    (None, 'dir'),
+    (None, 'disabled'),
+    (None, 'draggable'),
+    (None, 'dynsrc'),
+    (None, 'enctype'),
+    (None, 'end'),
+    (None, 'face'),
+    (None, 'for'),
+    (None, 'form'),
+    (None, 'frame'),
+    (None, 'galleryimg'),
+    (None, 'gutter'),
+    (None, 'headers'),
+    (None, 'height'),
+    (None, 'hidefocus'),
+    (None, 'hidden'),
+    (None, 'high'),
+    (None, 'href'),
+    (None, 'hreflang'),
+    (None, 'hspace'),
+    (None, 'icon'),
+    (None, 'id'),
+    (None, 'inputmode'),
+    (None, 'ismap'),
+    (None, 'keytype'),
+    (None, 'label'),
+    (None, 'leftspacing'),
+    (None, 'lang'),
+    (None, 'list'),
+    (None, 'longdesc'),
+    (None, 'loop'),
+    (None, 'loopcount'),
+    (None, 'loopend'),
+    (None, 'loopstart'),
+    (None, 'low'),
+    (None, 'lowsrc'),
+    (None, 'max'),
+    (None, 'maxlength'),
+    (None, 'media'),
+    (None, 'method'),
+    (None, 'min'),
+    (None, 'multiple'),
+    (None, 'name'),
+    (None, 'nohref'),
+    (None, 'noshade'),
+    (None, 'nowrap'),
+    (None, 'open'),
+    (None, 'optimum'),
+    (None, 'pattern'),
+    (None, 'ping'),
+    (None, 'point-size'),
+    (None, 'poster'),
+    (None, 'pqg'),
+    (None, 'preload'),
+    (None, 'prompt'),
+    (None, 'radiogroup'),
+    (None, 'readonly'),
+    (None, 'rel'),
+    (None, 'repeat-max'),
+    (None, 'repeat-min'),
+    (None, 'replace'),
+    (None, 'required'),
+    (None, 'rev'),
+    (None, 'rightspacing'),
+    (None, 'rows'),
+    (None, 'rowspan'),
+    (None, 'rules'),
+    (None, 'scope'),
+    (None, 'selected'),
+    (None, 'shape'),
+    (None, 'size'),
+    (None, 'span'),
+    (None, 'src'),
+    (None, 'start'),
+    (None, 'step'),
+    (None, 'style'),
+    (None, 'summary'),
+    (None, 'suppress'),
+    (None, 'tabindex'),
+    (None, 'target'),
+    (None, 'template'),
+    (None, 'title'),
+    (None, 'toppadding'),
+    (None, 'type'),
+    (None, 'unselectable'),
+    (None, 'usemap'),
+    (None, 'urn'),
+    (None, 'valign'),
+    (None, 'value'),
+    (None, 'variable'),
+    (None, 'volume'),
+    (None, 'vspace'),
+    (None, 'vrml'),
+    (None, 'width'),
+    (None, 'wrap'),
+    (namespaces['xml'], 'lang'),
+    # MathML attributes
+    (None, 'actiontype'),
+    (None, 'align'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnlines'),
+    (None, 'columnspacing'),
+    (None, 'columnspan'),
+    (None, 'depth'),
+    (None, 'display'),
+    (None, 'displaystyle'),
+    (None, 'equalcolumns'),
+    (None, 'equalrows'),
+    (None, 'fence'),
+    (None, 'fontstyle'),
+    (None, 'fontweight'),
+    (None, 'frame'),
+    (None, 'height'),
+    (None, 'linethickness'),
+    (None, 'lspace'),
+    (None, 'mathbackground'),
+    (None, 'mathcolor'),
+    (None, 'mathvariant'),
+    (None, 'mathvariant'),
+    (None, 'maxsize'),
+    (None, 'minsize'),
+    (None, 'other'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowlines'),
+    (None, 'rowspacing'),
+    (None, 'rowspan'),
+    (None, 'rspace'),
+    (None, 'scriptlevel'),
+    (None, 'selection'),
+    (None, 'separator'),
+    (None, 'stretchy'),
+    (None, 'width'),
+    (None, 'width'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'type'),
+    # SVG attributes
+    (None, 'accent-height'),
+    (None, 'accumulate'),
+    (None, 'additive'),
+    (None, 'alphabetic'),
+    (None, 'arabic-form'),
+    (None, 'ascent'),
+    (None, 'attributeName'),
+    (None, 'attributeType'),
+    (None, 'baseProfile'),
+    (None, 'bbox'),
+    (None, 'begin'),
+    (None, 'by'),
+    (None, 'calcMode'),
+    (None, 'cap-height'),
+    (None, 'class'),
+    (None, 'clip-path'),
+    (None, 'color'),
+    (None, 'color-rendering'),
+    (None, 'content'),
+    (None, 'cx'),
+    (None, 'cy'),
+    (None, 'd'),
+    (None, 'dx'),
+    (None, 'dy'),
+    (None, 'descent'),
+    (None, 'display'),
+    (None, 'dur'),
+    (None, 'end'),
+    (None, 'fill'),
+    (None, 'fill-opacity'),
+    (None, 'fill-rule'),
+    (None, 'font-family'),
+    (None, 'font-size'),
+    (None, 'font-stretch'),
+    (None, 'font-style'),
+    (None, 'font-variant'),
+    (None, 'font-weight'),
+    (None, 'from'),
+    (None, 'fx'),
+    (None, 'fy'),
+    (None, 'g1'),
+    (None, 'g2'),
+    (None, 'glyph-name'),
+    (None, 'gradientUnits'),
+    (None, 'hanging'),
+    (None, 'height'),
+    (None, 'horiz-adv-x'),
+    (None, 'horiz-origin-x'),
+    (None, 'id'),
+    (None, 'ideographic'),
+    (None, 'k'),
+    (None, 'keyPoints'),
+    (None, 'keySplines'),
+    (None, 'keyTimes'),
+    (None, 'lang'),
+    (None, 'marker-end'),
+    (None, 'marker-mid'),
+    (None, 'marker-start'),
+    (None, 'markerHeight'),
+    (None, 'markerUnits'),
+    (None, 'markerWidth'),
+    (None, 'mathematical'),
+    (None, 'max'),
+    (None, 'min'),
+    (None, 'name'),
+    (None, 'offset'),
+    (None, 'opacity'),
+    (None, 'orient'),
+    (None, 'origin'),
+    (None, 'overline-position'),
+    (None, 'overline-thickness'),
+    (None, 'panose-1'),
+    (None, 'path'),
+    (None, 'pathLength'),
+    (None, 'points'),
+    (None, 'preserveAspectRatio'),
+    (None, 'r'),
+    (None, 'refX'),
+    (None, 'refY'),
+    (None, 'repeatCount'),
+    (None, 'repeatDur'),
+    (None, 'requiredExtensions'),
+    (None, 'requiredFeatures'),
+    (None, 'restart'),
+    (None, 'rotate'),
+    (None, 'rx'),
+    (None, 'ry'),
+    (None, 'slope'),
+    (None, 'stemh'),
+    (None, 'stemv'),
+    (None, 'stop-color'),
+    (None, 'stop-opacity'),
+    (None, 'strikethrough-position'),
+    (None, 'strikethrough-thickness'),
+    (None, 'stroke'),
+    (None, 'stroke-dasharray'),
+    (None, 'stroke-dashoffset'),
+    (None, 'stroke-linecap'),
+    (None, 'stroke-linejoin'),
+    (None, 'stroke-miterlimit'),
+    (None, 'stroke-opacity'),
+    (None, 'stroke-width'),
+    (None, 'systemLanguage'),
+    (None, 'target'),
+    (None, 'text-anchor'),
+    (None, 'to'),
+    (None, 'transform'),
+    (None, 'type'),
+    (None, 'u1'),
+    (None, 'u2'),
+    (None, 'underline-position'),
+    (None, 'underline-thickness'),
+    (None, 'unicode'),
+    (None, 'unicode-range'),
+    (None, 'units-per-em'),
+    (None, 'values'),
+    (None, 'version'),
+    (None, 'viewBox'),
+    (None, 'visibility'),
+    (None, 'width'),
+    (None, 'widths'),
+    (None, 'x'),
+    (None, 'x-height'),
+    (None, 'x1'),
+    (None, 'x2'),
+    (namespaces['xlink'], 'actuate'),
+    (namespaces['xlink'], 'arcrole'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'role'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'title'),
+    (namespaces['xlink'], 'type'),
+    (namespaces['xml'], 'base'),
+    (namespaces['xml'], 'lang'),
+    (namespaces['xml'], 'space'),
+    (None, 'y'),
+    (None, 'y1'),
+    (None, 'y2'),
+    (None, 'zoomAndPan'),
+))
+
+attr_val_is_uri = frozenset((
+    (None, 'href'),
+    (None, 'src'),
+    (None, 'cite'),
+    (None, 'action'),
+    (None, 'longdesc'),
+    (None, 'poster'),
+    (None, 'background'),
+    (None, 'datasrc'),
+    (None, 'dynsrc'),
+    (None, 'lowsrc'),
+    (None, 'ping'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xml'], 'base'),
+))
+
+svg_attr_val_allows_ref = frozenset((
+    (None, 'clip-path'),
+    (None, 'color-profile'),
+    (None, 'cursor'),
+    (None, 'fill'),
+    (None, 'filter'),
+    (None, 'marker'),
+    (None, 'marker-start'),
+    (None, 'marker-mid'),
+    (None, 'marker-end'),
+    (None, 'mask'),
+    (None, 'stroke'),
+))
+
+svg_allow_local_href = frozenset((
+    (None, 'altGlyph'),
+    (None, 'animate'),
+    (None, 'animateColor'),
+    (None, 'animateMotion'),
+    (None, 'animateTransform'),
+    (None, 'cursor'),
+    (None, 'feImage'),
+    (None, 'filter'),
+    (None, 'linearGradient'),
+    (None, 'pattern'),
+    (None, 'radialGradient'),
+    (None, 'textpath'),
+    (None, 'tref'),
+    (None, 'set'),
+    (None, 'use')
+))
+
+allowed_css_properties = frozenset((
+    'azimuth',
+    'background-color',
+    'border-bottom-color',
+    'border-collapse',
+    'border-color',
+    'border-left-color',
+    'border-right-color',
+    'border-top-color',
+    'clear',
+    'color',
+    'cursor',
+    'direction',
+    'display',
+    'elevation',
+    'float',
+    'font',
+    'font-family',
+    'font-size',
+    'font-style',
+    'font-variant',
+    'font-weight',
+    'height',
+    'letter-spacing',
+    'line-height',
+    'overflow',
+    'pause',
+    'pause-after',
+    'pause-before',
+    'pitch',
+    'pitch-range',
+    'richness',
+    'speak',
+    'speak-header',
+    'speak-numeral',
+    'speak-punctuation',
+    'speech-rate',
+    'stress',
+    'text-align',
+    'text-decoration',
+    'text-indent',
+    'unicode-bidi',
+    'vertical-align',
+    'voice-family',
+    'volume',
+    'white-space',
+    'width',
+))
+
+allowed_css_keywords = frozenset((
+    'auto',
+    'aqua',
+    'black',
+    'block',
+    'blue',
+    'bold',
+    'both',
+    'bottom',
+    'brown',
+    'center',
+    'collapse',
+    'dashed',
+    'dotted',
+    'fuchsia',
+    'gray',
+    'green',
+    '!important',
+    'italic',
+    'left',
+    'lime',
+    'maroon',
+    'medium',
+    'none',
+    'navy',
+    'normal',
+    'nowrap',
+    'olive',
+    'pointer',
+    'purple',
+    'red',
+    'right',
+    'solid',
+    'silver',
+    'teal',
+    'top',
+    'transparent',
+    'underline',
+    'white',
+    'yellow',
+))
+
+allowed_svg_properties = frozenset((
+    'fill',
+    'fill-opacity',
+    'fill-rule',
+    'stroke',
+    'stroke-width',
+    'stroke-linecap',
+    'stroke-linejoin',
+    'stroke-opacity',
+))
+
+allowed_protocols = frozenset((
+    'ed2k',
+    'ftp',
+    'http',
+    'https',
+    'irc',
+    'mailto',
+    'news',
+    'gopher',
+    'nntp',
+    'telnet',
+    'webcal',
+    'xmpp',
+    'callto',
+    'feed',
+    'urn',
+    'aim',
+    'rsync',
+    'tag',
+    'ssh',
+    'sftp',
+    'rtsp',
+    'afs',
+    'data',
+))
+
+allowed_content_types = frozenset((
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/webp',
+    'image/bmp',
+    'text/plain',
+))
+
+
+# Matches the part of a ``data:`` URI that follows the scheme: a content
+# type (captured as the named group ``content_type``), optional charset and
+# base64 parameters in either order, a comma, and then the payload.
+# Filter.allowed_token applies it to the parsed URI's ``path``.
+data_content_type = re.compile(r'''
+                                ^
+                                # Match a content type <application>/<type>
+                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                # Match any character set and encoding
+                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                                # Assume the rest is data
+                                ,.*
+                                $
+                                ''',
+                               re.VERBOSE)
+
+
+class Filter(base.Filter):
+    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
+    def __init__(self,
+                 source,
+                 allowed_elements=allowed_elements,
+                 allowed_attributes=allowed_attributes,
+                 allowed_css_properties=allowed_css_properties,
+                 allowed_css_keywords=allowed_css_keywords,
+                 allowed_svg_properties=allowed_svg_properties,
+                 allowed_protocols=allowed_protocols,
+                 allowed_content_types=allowed_content_types,
+                 attr_val_is_uri=attr_val_is_uri,
+                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
+                 svg_allow_local_href=svg_allow_local_href):
+        """Creates a Filter
+
+        :arg allowed_elements: set of elements to allow--everything else will
+            be escaped
+
+        :arg allowed_attributes: set of attributes to allow in
+            elements--everything else will be stripped
+
+        :arg allowed_css_properties: set of CSS properties to allow--everything
+            else will be stripped
+
+        :arg allowed_css_keywords: set of CSS keywords to allow--everything
+            else will be stripped
+
+        :arg allowed_svg_properties: set of SVG properties to allow--everything
+            else will be removed
+
+        :arg allowed_protocols: set of allowed protocols for URIs
+
+        :arg allowed_content_types: set of allowed content types for ``data`` URIs.
+
+        :arg attr_val_is_uri: set of attributes that have URI values--values
+            that have a scheme not listed in ``allowed_protocols`` are removed
+
+        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
+            references
+
+        :arg svg_allow_local_href: set of SVG elements that can have local
+            hrefs--these are removed
+
+        """
+        super(Filter, self).__init__(source)
+
+        warnings.warn(_deprecation_msg, DeprecationWarning)
+
+        self.allowed_elements = allowed_elements
+        self.allowed_attributes = allowed_attributes
+        self.allowed_css_properties = allowed_css_properties
+        self.allowed_css_keywords = allowed_css_keywords
+        self.allowed_svg_properties = allowed_svg_properties
+        self.allowed_protocols = allowed_protocols
+        self.allowed_content_types = allowed_content_types
+        self.attr_val_is_uri = attr_val_is_uri
+        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+        self.svg_allow_local_href = svg_allow_local_href
+
+    def __iter__(self):
+        for token in base.Filter.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
+
+    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
+    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
+    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
+    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
+    # allowed.
+    #
+    #   sanitize_html('<script> do_nasty_stuff() </script>')
+    #    => &lt;script> do_nasty_stuff() &lt;/script>
+    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+    #    => <a>Click here for $100</a>
+    def sanitize_token(self, token):
+
+        # accommodate filters which use token_type differently
+        token_type = token["type"]
+        if token_type in ("StartTag", "EndTag", "EmptyTag"):
+            name = token["name"]
+            namespace = token["namespace"]
+            if ((namespace, name) in self.allowed_elements or
+                (namespace is None and
+                 (namespaces["html"], name) in self.allowed_elements)):
+                return self.allowed_token(token)
+            else:
+                return self.disallowed_token(token)
+        elif token_type == "Comment":
+            pass
+        else:
+            return token
+
+    def allowed_token(self, token):
+        if "data" in token:
+            attrs = token["data"]
+            attr_names = set(attrs.keys())
+
+            # Remove forbidden attributes
+            for to_remove in (attr_names - self.allowed_attributes):
+                del token["data"][to_remove]
+                attr_names.remove(to_remove)
+
+            # Remove attributes with disallowed URL values
+            for attr in (attr_names & self.attr_val_is_uri):
+                assert attr in attrs
+                # I don't have a clue where this regexp comes from or why it matches those
+                # characters, nor why we call unescape. I just know it's always been here.
+                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
+                # this will do is remove *more* than it otherwise would.
+                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                try:
+                    uri = urlparse.urlparse(val_unescaped)
+                except ValueError:
+                    uri = None
+                    del attrs[attr]
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = data_content_type.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
+                                                                     attrs[(namespaces['xlink'], 'href')])):
+                del attrs[(namespaces['xlink'], 'href')]
+            if (None, 'style') in attrs:
+                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
+            token["data"] = attrs
+        return token
+
+    def disallowed_token(self, token):
+        token_type = token["type"]
+        if token_type == "EndTag":
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            assert token_type in ("StartTag", "EmptyTag")
+            attrs = []
+            for (ns, name), v in token["data"].items():
+                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
+            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"] = token["data"][:-1] + "/>"
+
+        token["type"] = "Characters"
+
+        del token["name"]
+        return token
+
+    def sanitize_css(self, style):
+        # disallow urls
+        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+            return ''
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
+                                                'padding']:
+                for keyword in value.split():
+                    if keyword not in self.allowed_css_keywords and \
+                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
+                        break
+                else:
+                    clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
--- a/lib/bleach/_vendor/html5lib/filters/whitespace.py
+++ b/lib/bleach/_vendor/html5lib/filters/whitespace.py
@ -0,0 +1,38 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+
+from . import base
+from ..constants import rcdataElements, spaceCharacters
+# Rebind the imported collection as one string so it can be spliced into a
+# regex character class.
+spaceCharacters = "".join(spaceCharacters)
+
+# Matches a run of one or more characters from spaceCharacters.
+SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
+
+
+class Filter(base.Filter):
+    """Collapses whitespace except in pre, textarea, and script elements"""
+    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+    def __iter__(self):
+        preserve = 0
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type == "StartTag" \
+                    and (preserve or token["name"] in self.spacePreserveElements):
+                preserve += 1
+
+            elif type == "EndTag" and preserve:
+                preserve -= 1
+
+            elif not preserve and type == "SpaceCharacters" and token["data"]:
+                # Test on token["data"] above to not introduce spaces where there were not
+                token["data"] = " "
+
+            elif not preserve and type == "Characters":
+                token["data"] = collapse_spaces(token["data"])
+
+            yield token
+
+
+def collapse_spaces(text):
+    return SPACES_REGEX.sub(' ', text)
--- a/lib/bleach/_vendor/html5lib/html5parser.py
+++ b/lib/bleach/_vendor/html5lib/html5parser.py
--- a/lib/bleach/_vendor/html5lib/serializer.py
+++ b/lib/bleach/_vendor/html5lib/serializer.py
@ -0,0 +1,409 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+import re
+
+from codecs import register_error, xmlcharrefreplace_errors
+
+from .constants import voidElements, booleanAttributes, spaceCharacters
+from .constants import rcdataElements, entities, xmlEntities
+from . import treewalkers, _utils
+from xml.sax.saxutils import escape
+
+# Characters whose presence makes an attribute value need quoting in "spec"
+# mode: the space characters plus " ' = < > and `.
+_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
+_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
+# "legacy" mode quotes on everything "spec" mode does and additionally on the
+# code points U+0000-U+0020, "/", U+00A0 and the further non-ASCII code points
+# listed below (U+1680 ... U+3000).
+_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
+                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                   "\u3000]")
+
+
+# Reverse lookup from code point (int) to entity name, consulted by
+# htmlentityreplace_errors.
+_encode_entity_map = {}
+# True when a character outside the BMP is a single code unit, i.e. this
+# interpreter does not represent it as a surrogate pair.
+_is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+    # skip multi-character entities
+    if ((_is_ucs4 and len(v) > 1) or
+            (not _is_ucs4 and len(v) > 2)):
+        continue
+    # "&" itself is deliberately left out of the map
+    if v != "&":
+        if len(v) == 2:
+            # two code units that passed the check above: a surrogate pair
+            v = _utils.surrogatePairToCodepoint(v)
+        else:
+            v = ord(v)
+        if v not in _encode_entity_map or k.islower():
+            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
+            _encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+    """Codec error handler that substitutes HTML entities for failing text
+
+    For encode/translate errors every offending code point is replaced by its
+    named entity from ``_encode_entity_map`` when one exists, and by a
+    hexadecimal character reference otherwise. Any other error is handed to
+    ``xmlcharrefreplace_errors``.
+
+    :arg exc: the Unicode error raised by the codec
+
+    :returns: tuple of (replacement text, position at which to resume)
+
+    """
+    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+        return xmlcharrefreplace_errors(exc)
+
+    source = exc.object
+    replacement = []
+    pair_consumed = False
+    for index, char in enumerate(source[exc.start:exc.end], exc.start):
+        if pair_consumed:
+            # second half of a surrogate pair that was already handled
+            pair_consumed = False
+            continue
+        # never look past exc.end when probing for a surrogate pair
+        if _utils.isSurrogatePair(source[index:min(exc.end, index + 2)]):
+            codepoint = _utils.surrogatePairToCodepoint(source[index:index + 2])
+            pair_consumed = True
+        else:
+            codepoint = ord(char)
+
+        entity = _encode_entity_map.get(codepoint)
+        if entity:
+            # some entity names already carry their terminating semicolon
+            terminator = "" if entity.endswith(";") else ";"
+            replacement.append("&" + entity + terminator)
+        else:
+            replacement.append("&#x%s;" % (hex(codepoint)[2:]))
+    return ("".join(replacement), exc.end)
+
+
+# Make the handler selectable by name, e.g. as the ``errors`` argument of
+# ``str.encode``.
+register_error("htmlentityreplace", htmlentityreplace_errors)
+
+
+def serialize(input, tree="etree", encoding=None, **serializer_opts):
+    """Walks ``input`` with a treewalker and renders it with an HTMLSerializer
+
+    :arg input: the token stream to serialize
+
+    :arg tree: name of the treewalker to use
+
+    :arg encoding: the encoding to use; passed through to
+        :py:meth:`HTMLSerializer.render`
+
+    :arg serializer_opts: keyword options forwarded to the
+        :py:class:`html5lib.serializer.HTMLSerializer` that gets created
+
+    :returns: the serialized tree, as returned by
+        :py:meth:`HTMLSerializer.render`
+
+    Example:
+
+    >>> from html5lib.html5parser import parse
+    >>> from html5lib.serializer import serialize
+    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
+    >>> serialize(token_stream, omit_optional_tags=False)
+    '<html><head></head><body><p>Hi!</p></body></html>'
+
+    """
+    # XXX: the treewalker lookup is repeated on every call; should it be cached?
+    tree_walker = treewalkers.getTreeWalker(tree)
+    serializer = HTMLSerializer(**serializer_opts)
+    return serializer.render(tree_walker(input), encoding)
+
+
+class HTMLSerializer(object):
+    """Serializes a treewalker token stream to HTML
+
+    Every name listed in ``options`` is a class-level default that can be
+    overridden per instance with a keyword argument to ``__init__``.
+    """
+
+    # attribute quoting options
+    quote_attr_values = "legacy"  # be secure by default
+    quote_char = '"'
+    use_best_quote_char = True
+
+    # tag syntax options
+    omit_optional_tags = True
+    minimize_boolean_attributes = True
+    use_trailing_solidus = False
+    space_before_trailing_solidus = True
+
+    # escaping options
+    escape_lt_in_attrs = False
+    escape_rcdata = False
+    resolve_entities = True
+
+    # miscellaneous options
+    alphabetical_attributes = False
+    inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+
+    # The complete set of keyword arguments accepted by __init__.
+    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
+               "omit_optional_tags", "minimize_boolean_attributes",
+               "use_trailing_solidus", "space_before_trailing_solidus",
+               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
+               "alphabetical_attributes", "inject_meta_charset",
+               "strip_whitespace", "sanitize")
+
+    def __init__(self, **kwargs):
+        """Configure the serializer from keyword options
+
+        Only the names listed in ``options`` are accepted; any option that is
+        not passed keeps its class-level default.
+
+        :arg quote_attr_values: When attribute values get quoted: as legacy
+            browser behavior requires (``"legacy"``), as the standard requires
+            (``"spec"``), or unconditionally (``"always"``).
+
+            Defaults to ``"legacy"``.
+
+        :arg quote_char: Quote character for attribute values. Passing this
+            option turns ``use_best_quote_char`` off unless that option is
+            passed explicitly as well.
+
+            Defaults to ``"``.
+
+        :arg use_best_quote_char: When a quoted value contains only one of
+            ``"`` and ``'``, quote it with the other one.
+
+            Defaults to ``True``.
+
+        :arg omit_optional_tags: Omit start/end tags that are optional.
+
+            Defaults to ``True``.
+
+        :arg minimize_boolean_attributes: Write boolean attributes as the bare
+            attribute name, so that::
+
+              <input disabled="disabled">
+
+            becomes::
+
+              <input disabled>
+
+            Defaults to ``True``.
+
+        :arg use_trailing_solidus: End the start tag of void elements with a
+            slash, e.g. ``<hr/>``.
+
+            Defaults to ``False``.
+
+        :arg space_before_trailing_solidus: Put a space in front of that
+            slash, e.g. ``<hr />``. Only has an effect together with
+            ``use_trailing_solidus=True``.
+
+            Defaults to ``True``.
+
+        :arg escape_lt_in_attrs: Whether to escape ``<`` in attribute values.
+
+            Defaults to ``False``.
+
+        :arg escape_rcdata: Whether the text inside rcdata elements (such as
+            style) is escaped like the text of normal elements.
+
+            Defaults to ``False``.
+
+        :arg resolve_entities: Whether named character entities found in the
+            tree are replaced by the characters they stand for. The XML
+            predefined entities &lt; &gt; &amp; &quot; &apos; are never
+            resolved.
+
+            Defaults to ``True``.
+
+        :arg alphabetical_attributes: Reorder attributes alphabetically.
+
+            Defaults to ``False``.
+
+        :arg inject_meta_charset: Whether to inject the meta charset. Only
+            applies when an encoding is given at serialization time.
+
+            Defaults to ``True``.
+
+        :arg strip_whitespace: Whether to collapse semantically meaningless
+            whitespace by running the whitespace filter over the stream.
+
+            Defaults to ``False``.
+
+        :arg sanitize: Strip all unsafe or unknown constructs from output.
+            See :py:class:`html5lib.filters.sanitizer.Filter`.
+
+            Defaults to ``False``.
+
+        :raises TypeError: if a keyword that is not in ``options`` is passed
+
+        """
+        unknown = frozenset(kwargs) - frozenset(self.options)
+        if unknown:
+            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unknown)))
+        # An explicit quote character replaces the "best quote" default.
+        if 'quote_char' in kwargs:
+            self.use_best_quote_char = False
+        # Copy every option onto the instance, preferring the caller's value.
+        for option in self.options:
+            if option in kwargs:
+                value = kwargs[option]
+            else:
+                value = getattr(self, option)
+            setattr(self, option, value)
+        self.errors = []
+        self.strict = False
+
+    def encode(self, string):
+        """Encode ``string`` with the current encoding, if one is set
+
+        Characters the encoding cannot represent are handled by the
+        ``htmlentityreplace`` error handler. Without an encoding the text is
+        returned unchanged.
+        """
+        assert isinstance(string, text_type)
+        if not self.encoding:
+            return string
+        return string.encode(self.encoding, "htmlentityreplace")
+
+    def encodeStrict(self, string):
+        """Encode ``string`` with the current encoding, failing on bad characters
+
+        Uses the ``strict`` error handler, so text the encoding cannot
+        represent raises. Without an encoding the text is returned unchanged.
+        """
+        assert isinstance(string, text_type)
+        if not self.encoding:
+            return string
+        return string.encode(self.encoding, "strict")
+
+    def serialize(self, treewalker, encoding=None):
+        """Generate the serialized output for a token stream, piece by piece
+
+        The stream is first wrapped in the filters selected by the instance
+        options and then every token is turned into markup. Problems are
+        reported through :py:meth:`serializeError`, which collects them in
+        ``self.errors`` and raises only when ``self.strict`` is set.
+
+        :arg treewalker: iterable of token dicts to serialize
+
+        :arg encoding: the encoding for the output; when falsy the fragments
+            are yielded as text instead of being encoded
+
+        :returns: generator of output fragments
+
+        :raises ValueError: if ``quote_attr_values`` is not one of
+            ``"always"``, ``"spec"`` or ``"legacy"``
+
+        """
+        # pylint:disable=too-many-nested-blocks
+        self.encoding = encoding
+        in_cdata = False
+        self.errors = []
+
+        if encoding and self.inject_meta_charset:
+            from .filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
+        # Alphabetical attributes is here under the assumption that none of
+        # the later filters add or change order of attributes; it needs to be
+        # before the sanitizer so escaped elements come out correctly
+        if self.alphabetical_attributes:
+            from .filters.alphabeticalattributes import Filter
+            treewalker = Filter(treewalker)
+        # WhitespaceFilter should be used before OptionalTagFilter
+        # for maximum efficiency of the latter filter
+        if self.strip_whitespace:
+            from .filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from .filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
+        if self.omit_optional_tags:
+            from .filters.optionaltags import Filter
+            treewalker = Filter(treewalker)
+
+        for token in treewalker:
+            type = token["type"]
+            if type == "Doctype":
+                doctype = "<!DOCTYPE %s" % token["name"]
+
+                if token["publicId"]:
+                    doctype += ' PUBLIC "%s"' % token["publicId"]
+                elif token["systemId"]:
+                    doctype += " SYSTEM"
+                if token["systemId"]:
+                    # Quote the system identifier with whichever quote
+                    # character it does not contain.
+                    if token["systemId"].find('"') >= 0:
+                        if token["systemId"].find("'") >= 0:
+                            self.serializeError("System identifier contains both single and double quote characters")
+                        quote_char = "'"
+                    else:
+                        quote_char = '"'
+                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
+
+                doctype += ">"
+                yield self.encodeStrict(doctype)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                # Whitespace tokens and text inside an unescaped rcdata
+                # element are written verbatim; all other text is escaped.
+                if type == "SpaceCharacters" or in_cdata:
+                    if in_cdata and token["data"].find("</") >= 0:
+                        self.serializeError("Unexpected </ in CDATA")
+                    yield self.encode(token["data"])
+                else:
+                    yield self.encode(escape(token["data"]))
+
+            elif type in ("StartTag", "EmptyTag"):
+                name = token["name"]
+                yield self.encodeStrict("<%s" % name)
+                if name in rcdataElements and not self.escape_rcdata:
+                    in_cdata = True
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                for (_, attr_name), attr_value in token["data"].items():
+                    # TODO: Add namespace support here
+                    k = attr_name
+                    v = attr_value
+                    yield self.encodeStrict(' ')
+
+                    yield self.encodeStrict(k)
+                    # A minimized boolean attribute is written as the bare
+                    # name; everything else gets "=value".
+                    if not self.minimize_boolean_attributes or \
+                        (k not in booleanAttributes.get(name, tuple()) and
+                         k not in booleanAttributes.get("", tuple())):
+                        yield self.encodeStrict("=")
+                        if self.quote_attr_values == "always" or len(v) == 0:
+                            quote_attr = True
+                        elif self.quote_attr_values == "spec":
+                            quote_attr = _quoteAttributeSpec.search(v) is not None
+                        elif self.quote_attr_values == "legacy":
+                            quote_attr = _quoteAttributeLegacy.search(v) is not None
+                        else:
+                            raise ValueError("quote_attr_values must be one of: "
+                                             "'always', 'spec', or 'legacy'")
+                        v = v.replace("&", "&amp;")
+                        if self.escape_lt_in_attrs:
+                            v = v.replace("<", "&lt;")
+                        if quote_attr:
+                            quote_char = self.quote_char
+                            if self.use_best_quote_char:
+                                # Prefer the quote character that does not
+                                # occur in the value, when only one does.
+                                if "'" in v and '"' not in v:
+                                    quote_char = '"'
+                                elif '"' in v and "'" not in v:
+                                    quote_char = "'"
+                            if quote_char == "'":
+                                v = v.replace("'", "&#39;")
+                            else:
+                                v = v.replace('"', "&quot;")
+                            yield self.encodeStrict(quote_char)
+                            yield self.encode(v)
+                            yield self.encodeStrict(quote_char)
+                        else:
+                            yield self.encode(v)
+                if name in voidElements and self.use_trailing_solidus:
+                    if self.space_before_trailing_solidus:
+                        yield self.encodeStrict(" /")
+                    else:
+                        yield self.encodeStrict("/")
+                yield self.encode(">")
+
+            elif type == "EndTag":
+                name = token["name"]
+                if name in rcdataElements:
+                    in_cdata = False
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                yield self.encodeStrict("</%s>" % name)
+
+            elif type == "Comment":
+                data = token["data"]
+                if data.find("--") >= 0:
+                    self.serializeError("Comment contains --")
+                yield self.encodeStrict("<!--%s-->" % data)
+
+            elif type == "Entity":
+                name = token["name"]
+                key = name + ";"
+                known = key in entities
+                if not known:
+                    self.serializeError("Entity %s not recognized" % name)
+                # An unrecognized entity cannot be resolved: outside strict
+                # mode the error above does not raise, so it is written out
+                # as a reference instead of failing on the table lookup.
+                if known and self.resolve_entities and key not in xmlEntities:
+                    data = entities[key]
+                else:
+                    data = "&%s;" % name
+                yield self.encodeStrict(data)
+
+            else:
+                # Any other token type is reported as an error, using the
+                # token's data as the message.
+                self.serializeError(token["data"])
+
+    def render(self, treewalker, encoding=None):
+        """Serializes the stream from the treewalker into one string
+
+        :arg treewalker: the treewalker to serialize
+
+        :arg encoding: the string encoding to use
+
+        :returns: the serialized tree: bytes when an encoding is given, text
+            otherwise
+
+        Example:
+
+        >>> from html5lib import parse, getTreeWalker
+        >>> from html5lib.serializer import HTMLSerializer
+        >>> token_stream = parse('<html><body>Hi!</body></html>')
+        >>> walker = getTreeWalker('etree')
+        >>> serializer = HTMLSerializer(omit_optional_tags=False)
+        >>> serializer.render(walker(token_stream))
+        '<html><head></head><body>Hi!</body></html>'
+
+        """
+        if not encoding:
+            return "".join(self.serialize(treewalker))
+        return b"".join(self.serialize(treewalker, encoding))
+
+    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
+        """Record a serialization problem and, in strict mode, raise it
+
+        :arg data: the error message; appended to ``self.errors``
+
+        :raises SerializeError: carrying ``data``, when ``self.strict`` is set
+
+        """
+        # XXX The idea is to make data mandatory.
+        self.errors.append(data)
+        if self.strict:
+            # Pass the message along so the exception says what went wrong.
+            raise SerializeError(data)
+
+
+class SerializeError(Exception):
+    """Raised by the serializer, in strict mode, for an error in the tree"""
--- a/lib/bleach/_vendor/html5lib/treeadapters/init.py
+++ b/lib/bleach/_vendor/html5lib/treeadapters/init.py
@ -0,0 +1,30 @@
+"""Tree adapters let you convert from one tree structure to another
+
+Example:
+
+.. code-block:: python
+
+   import html5lib
+   from html5lib.treeadapters import genshi
+
+   doc = '<html><body>Hi!</body></html>'
+   treebuilder = html5lib.getTreeBuilder('etree')
+   parser = html5lib.HTMLParser(tree=treebuilder)
+   tree = parser.parse(doc)
+   TreeWalker = html5lib.getTreeWalker('etree')
+
+   genshi_tree = genshi.to_genshi(TreeWalker(tree))
+
+"""
+from __future__ import absolute_import, division, unicode_literals
+
+from . import sax
+
+__all__ = ["sax"]
+
+# The genshi adapter is optional: it is only exported when importing it
+# succeeds.
+try:
+    from . import genshi  # noqa
+except ImportError:
+    pass
+else:
+    __all__.append("genshi")
--- a/lib/bleach/_vendor/html5lib/treeadapters/genshi.py
+++ b/lib/bleach/_vendor/html5lib/treeadapters/genshi.py
@ -0,0 +1,54 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from genshi.core import QName, Attrs
+from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
+
+
+def to_genshi(walker):
+    """Translate a treewalker token stream into genshi events
+
+    Consecutive character tokens are merged into a single TEXT event.
+
+    :arg walker: the treewalker to use to walk the tree to convert it
+
+    :returns: generator of genshi nodes
+
+    """
+    # No position information is available, so every event carries this.
+    no_position = (None, -1, -1)
+
+    def qualified_name(token):
+        """Build the ``{namespace}name`` form genshi uses for tag names"""
+        if token["namespace"]:
+            return "{%s}%s" % (token["namespace"], token["name"])
+        return token["name"]
+
+    pending_text = []
+    for token in walker:
+        kind = token["type"]
+        if kind in ("Characters", "SpaceCharacters"):
+            pending_text.append(token["data"])
+        elif pending_text:
+            # any non-text token flushes the text collected so far
+            yield TEXT, "".join(pending_text), no_position
+            pending_text = []
+
+        if kind in ("StartTag", "EmptyTag"):
+            name = qualified_name(token)
+            attrs = Attrs([(QName(("{%s}%s" % attr) if attr[0] is not None else attr[1]), value)
+                           for attr, value in token["data"].items()])
+            yield (START, (QName(name), attrs), no_position)
+
+        # An empty tag is closed again right after it was opened.
+        if kind in ("EndTag", "EmptyTag"):
+            yield END, QName(qualified_name(token)), no_position
+
+        elif kind == "Comment":
+            yield COMMENT, token["data"], no_position
+
+        elif kind == "Doctype":
+            yield DOCTYPE, (token["name"], token["publicId"],
+                            token["systemId"]), no_position
+
+        # FIXME: every other token type is silently ignored. What to do?
+
+    if pending_text:
+        yield TEXT, "".join(pending_text), no_position
--- a/lib/bleach/_vendor/html5lib/treeadapters/sax.py
+++ b/lib/bleach/_vendor/html5lib/treeadapters/sax.py
@ -0,0 +1,50 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.sax.xmlreader import AttributesNSImpl
+
+from ..constants import adjustForeignAttributes, unadjustForeignAttributes
+
+# Maps every non-None prefix found in adjustForeignAttributes to its
+# namespace; to_sax announces these mappings to the handler.
+prefix_mapping = {}
+for prefix, localName, namespace in adjustForeignAttributes.values():
+    if prefix is not None:
+        prefix_mapping[prefix] = namespace
+
+
+def to_sax(walker, handler):
+    """Drive a SAX-like content handler from a treewalker token stream
+
+    Doctype and comment tokens produce no handler calls.
+
+    :arg walker: the treewalker to use to walk the tree to convert it
+
+    :arg handler: SAX handler to use
+
+    """
+    handler.startDocument()
+    for prefix, namespace in prefix_mapping.items():
+        handler.startPrefixMapping(prefix, namespace)
+
+    for token in walker:
+        kind = token["type"]
+        if kind in ("StartTag", "EmptyTag"):
+            attrs = AttributesNSImpl(token["data"],
+                                     unadjustForeignAttributes)
+            handler.startElementNS((token["namespace"], token["name"]),
+                                   token["name"],
+                                   attrs)
+            if kind == "EmptyTag":
+                # an empty tag is closed immediately
+                handler.endElementNS((token["namespace"], token["name"]),
+                                     token["name"])
+        elif kind == "EndTag":
+            handler.endElementNS((token["namespace"], token["name"]),
+                                 token["name"])
+        elif kind in ("Characters", "SpaceCharacters"):
+            handler.characters(token["data"])
+        elif kind not in ("Doctype", "Comment"):
+            assert False, "Unknown token type"
+
+    for prefix in prefix_mapping:
+        handler.endPrefixMapping(prefix)
+    handler.endDocument()
--- a/lib/bleach/_vendor/html5lib/treebuilders/init.py
+++ b/lib/bleach/_vendor/html5lib/treebuilders/init.py
@ -0,0 +1,88 @@
+"""A collection of modules for building different kinds of trees from HTML
+documents.
+
+To create a treebuilder for a new type of tree, you need to implement
+several things:
+
+1. A set of classes for various types of elements: Document, Doctype, Comment,
+   Element. These must implement the interface of ``treebuilders.base.Node``
+   (although comment nodes have a different signature for their constructor,
+   see ``treebuilders.etree.Comment``) Textual content may also be implemented
+   as another node type, or not, as your tree implementation requires.
+
+2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
+   from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
+
+   * ``documentClass`` - the class to use for the bottommost node of a document
+   * ``elementClass`` - the class to use for HTML Elements
+   * ``commentClass`` - the class to use for comments
+   * ``doctypeClass`` - the class to use for doctypes
+
+   It also has one required method:
+
+   * ``getDocument`` - Returns the root node of the complete document tree
+
+3. If you wish to run the unit tests, you must also create a ``testSerializer``
+   method on your treebuilder which accepts a node and returns a string
+   containing Node and its children serialized according to the format used in
+   the unittests
+
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .._utils import default_etree
+
+treeBuilderCache = {}
+
+
+def getTreeBuilder(treeType, implementation=None, **kwargs):
+    """Look up the TreeBuilder class for one of the built-in tree types
+
+    :arg treeType: the name of the tree type required (case-insensitive).
+        Supported values are:
+
+        * "dom" - A generic builder for DOM implementations, defaulting to a
+          xml.dom.minidom based implementation.
+        * "etree" - A generic builder for tree implementations exposing an
+          ElementTree-like interface, defaulting to ``default_etree``.
+        * "lxml" - A etree-based builder for lxml.etree, handling limitations
+          of lxml's implementation.
+
+    :arg implementation: (Only used for the "etree" and "dom" tree types.)
+        A module implementing the tree type e.g. xml.etree.ElementTree or
+        xml.etree.cElementTree.
+
+    :arg kwargs: Any additional options, handed to the "dom" / "etree"
+        submodule together with ``implementation``.
+
+    :returns: the TreeBuilder class for the requested tree type
+
+    :raises ValueError: if ``treeType`` is not one of the supported names
+
+    Example:
+
+    >>> from html5lib.treebuilders import getTreeBuilder
+    >>> builder = getTreeBuilder('etree')
+
+    """
+
+    treeType = treeType.lower()
+    if treeType in treeBuilderCache:
+        return treeBuilderCache.get(treeType)
+
+    if treeType == "dom":
+        from . import dom
+        # Come up with a sane default (pref. from the stdlib)
+        if implementation is None:
+            from xml.dom import minidom
+            implementation = minidom
+        # NEVER cache here, caching is done in the dom submodule
+        return dom.getDomModule(implementation, **kwargs).TreeBuilder
+
+    if treeType == "etree":
+        from . import etree
+        if implementation is None:
+            implementation = default_etree
+        # NEVER cache here, caching is done in the etree submodule
+        return etree.getETreeModule(implementation, **kwargs).TreeBuilder
+
+    if treeType == "lxml":
+        # the only tree type that is cached in this module
+        from . import etree_lxml
+        treeBuilderCache[treeType] = etree_lxml.TreeBuilder
+        return treeBuilderCache.get(treeType)
+
+    raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
--- a/lib/bleach/_vendor/html5lib/treebuilders/base.py
+++ b/lib/bleach/_vendor/html5lib/treebuilders/base.py
@ -0,0 +1,417 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from ..constants import scopingElements, tableInsertModeElements, namespaces
+
+# The scope markers are inserted when entering object elements,
+# marquees, table cells, and table captions, and are used to prevent formatting
+# from "leaking" into tables, object elements, and marquees.
+Marker = None
+
+# Scope definitions used by TreeBuilder.elementInScope, keyed by its
+# ``variant`` argument. Each value is a pair (element name tuples, invert):
+# with invert False the search stops at an element that is in the set, with
+# invert True it stops at an element that is NOT in the set.
+listElementsMap = {
+    None: (frozenset(scopingElements), False),
+    "button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
+    "list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
+                                          (namespaces["html"], "ul")}), False),
+    "table": (frozenset([(namespaces["html"], "html"),
+                         (namespaces["html"], "table")]), False),
+    "select": (frozenset([(namespaces["html"], "optgroup"),
+                          (namespaces["html"], "option")]), True)
+}
+
+
+class Node(object):
+    """Base class for a single item in the tree
+
+    Everything that touches the tree structure raises
+    ``NotImplementedError`` here and has to be supplied by a subclass.
+    """
+    def __init__(self, name):
+        """Creates an empty Node without parent, value, attributes or children
+
+        :arg name: The tag name associated with the node
+
+        """
+        # The tag name associated with the node
+        self.name = name
+        # The value of the current node (applies to text nodes and comments)
+        self.value = None
+        # The parent of the current node (or None for the document node)
+        self.parent = None
+        # A list of child nodes of the current node. This must include all
+        # elements but not necessarily other node types.
+        self.childNodes = []
+        # A dict holding name -> value pairs for attributes of the node
+        self.attributes = {}
+        # A list of miscellaneous flags that can be set on the node.
+        self._flags = []
+
+    def __str__(self):
+        rendered = " ".join('%s="%s"' % item
+                            for item in self.attributes.items())
+        if not rendered:
+            return "<%s>" % (self.name)
+        return "<%s %s>" % (self.name, rendered)
+
+    def __repr__(self):
+        return "<%s>" % (self.name)
+
+    def appendChild(self, node):
+        """Add ``node`` as a child of this node
+
+        :arg node: the node to insert
+
+        """
+        raise NotImplementedError
+
+    def insertText(self, data, insertBefore=None):
+        """Add ``data`` as text inside this node
+
+        :arg data: the data to insert
+
+        :arg insertBefore: the node before whose start the text is placed;
+            with the default of ``None`` the text goes to the end of this
+            node's text
+
+        """
+        raise NotImplementedError
+
+    def insertBefore(self, node, refNode):
+        """Add ``node`` as a child of this node, directly in front of
+        ``refNode`` in the list of child nodes. Raises ValueError if refNode
+        is not a child of the current node
+
+        :arg node: the node to insert
+
+        :arg refNode: the child node to insert the node before
+
+        """
+        raise NotImplementedError
+
+    def removeChild(self, node):
+        """Take ``node`` out of this node's children
+
+        :arg node: the child node to remove
+
+        """
+        raise NotImplementedError
+
+    def reparentChildren(self, newParent):
+        """Hand all children of this node over to ``newParent``.
+        Trees that don't store text as nodes need this so that the text is
+        moved in the correct way
+
+        :arg newParent: the node to move all this node's children to
+
+        """
+        # XXX - should this method be made more general?
+        for moved in self.childNodes:
+            newParent.appendChild(moved)
+        self.childNodes = []
+
+    def cloneNode(self):
+        """Return a shallow copy of this node: same name and attributes,
+        but neither parent nor child nodes
+        """
+        raise NotImplementedError
+
+    def hasContent(self):
+        """Return true if the node has children or text, false otherwise
+        """
+        raise NotImplementedError
+
+
+class ActiveFormattingElements(list):
+    """List of active formatting elements that limits repeated entries
+
+    Appending a node when three equal nodes (same ``nameTuple`` and
+    attributes) already follow the last Marker removes one of them first.
+    """
+    def append(self, node):
+        if node != Marker:
+            matches = 0
+            # walk back from the end, but never past the last Marker
+            for element in reversed(self):
+                if element == Marker:
+                    break
+                if self.nodesEqual(element, node):
+                    matches += 1
+                    if matches == 3:
+                        self.remove(element)
+                        break
+        list.append(self, node)
+
+    def nodesEqual(self, node1, node2):
+        """Return True when both nodes share ``nameTuple`` and ``attributes``"""
+        same_name = node1.nameTuple == node2.nameTuple
+        return bool(same_name and node1.attributes == node2.attributes)
+
+
+class TreeBuilder(object):
+    """Base treebuilder implementation
+
+    * documentClass - the class to use for the bottommost node of a document
+    * elementClass - the class to use for HTML Elements
+    * commentClass - the class to use for comments
+    * doctypeClass - the class to use for doctypes
+
+    """
+    # pylint:disable=not-callable
+
+    # Document class
+    documentClass = None
+
+    # The class to use for creating a node
+    elementClass = None
+
+    # The class to use for creating comments
+    commentClass = None
+
+    # The class to use for creating doctypes
+    doctypeClass = None
+
+    # Fragment class
+    fragmentClass = None
+
+    def __init__(self, namespaceHTMLElements):
+        """Create a TreeBuilder and reset it to its initial state
+
+        :arg namespaceHTMLElements: whether or not to namespace HTML
+            elements; when true ``defaultNamespace`` is the XHTML namespace
+            URI, otherwise it is ``None``
+
+        """
+        self.defaultNamespace = (
+            "http://www.w3.org/1999/xhtml" if namespaceHTMLElements else None)
+        self.reset()
+
+    def reset(self):
+        """Put the builder back into its initial, empty state"""
+        self.openElements = []
+        self.activeFormattingElements = ActiveFormattingElements()
+
+        # XXX - rename these to headElement, formElement
+        self.headPointer = None
+        self.formPointer = None
+
+        self.insertFromTable = False
+
+        # start over with a new document node built from documentClass
+        self.document = self.documentClass()
+
+    def elementInScope(self, target, variant=None):
+        """Report whether ``target`` is in scope among the open elements
+
+        The open elements are searched from the most recently opened one
+        backwards until a match or a scope boundary is found.
+
+        :arg target: a node (anything with a ``nameTuple`` attribute), which
+            is matched by equality, or a tag name / ``(namespace, name)``
+            tuple, which is matched against each element's ``nameTuple``; a
+            bare name is taken to be in the HTML namespace
+
+        :arg variant: key into ``listElementsMap`` selecting the elements
+            that bound the scope
+
+        :returns: ``True`` if a match is found before a scope boundary,
+            ``False`` if the boundary is reached first
+
+        """
+        matchNode = hasattr(target, "nameTuple")
+        if not matchNode:
+            if isinstance(target, text_type):
+                target = (namespaces["html"], target)
+            assert isinstance(target, tuple)
+
+        boundary, invert = listElementsMap[variant]
+
+        for node in reversed(self.openElements):
+            if matchNode:
+                found = node == target
+            else:
+                found = node.nameTuple == target
+            if found:
+                return True
+            if invert ^ (node.nameTuple in boundary):
+                return False
+
+        assert False  # We should never reach this point
+
+    def reconstructActiveFormattingElements(self):
+        """Re-insert trailing active formatting elements that are not open
+
+        Walks back from the end of the list of active formatting elements to
+        the last entry that is a Marker or still in ``openElements``, then
+        inserts a clone of every entry after it and stores the newly
+        inserted element in the list in place of the old entry.
+        """
+        # Within this algorithm the order of steps described in the
+        # specification is not quite the same as the order of steps in the
+        # code. It should still do the same though.
+
+        # Step 1: stop the algorithm when there's nothing to do.
+        if not self.activeFormattingElements:
+            return
+
+        # Step 2 and step 3: we start with the last element, so i is the
+        # index of the final entry.
+        i = len(self.activeFormattingElements) - 1
+        entry = self.activeFormattingElements[i]
+        if entry == Marker or entry in self.openElements:
+            return
+
+        # Step 6
+        while entry != Marker and entry not in self.openElements:
+            if i == 0:
+                # Ran off the front of the list; the increment at the top of
+                # the next loop brings i back to 0.
+                i = -1
+                break
+            i -= 1
+            # Step 5: let entry be one earlier in the list.
+            entry = self.activeFormattingElements[i]
+
+        while True:
+            # Step 7
+            i += 1
+
+            # Step 8
+            entry = self.activeFormattingElements[i]
+            clone = entry.cloneNode()  # Mainly to get a new copy of the attributes
+
+            # Step 9
+            element = self.insertElement({"type": "StartTag",
+                                          "name": clone.name,
+                                          "namespace": clone.namespace,
+                                          "data": clone.attributes})
+
+            # Step 10
+            self.activeFormattingElements[i] = element
+
+            # Step 11: stop once the last entry of the list has been replaced
+            if element == self.activeFormattingElements[-1]:
+                break
+
+    def clearActiveFormattingElements(self):
+        """Pop active formatting entries up to and including the last marker.
+
+        Entries are popped from the end of the list until a ``Marker`` has
+        been popped or the list has become empty.
+        """
+        formatting = self.activeFormattingElements
+        while True:
+            entry = formatting.pop()
+            if not (formatting and entry != Marker):
+                break
+
+    def elementInActiveFormattingElements(self, name):
+        """Find an element called ``name`` among the active formatting
+        elements, searching backwards from the end of the list and stopping at
+        the last marker.
+
+        :returns: the matching element, or ``False`` when a marker or the
+            start of the list is reached first
+        """
+        for candidate in self.activeFormattingElements[::-1]:
+            # Test for the Marker before touching .name: a Marker doesn't
+            # have a name attribute.
+            if candidate == Marker:
+                return False
+            if candidate.name == name:
+                return candidate
+        return False
+
+    def insertRoot(self, token):
+        """Create the root element from ``token``, push it onto the stack of
+        open elements and append it to the document."""
+        root = self.createElement(token)
+        self.openElements.append(root)
+        self.document.appendChild(root)
+
+    def insertDoctype(self, token):
+        """Build a doctype node from the token's "name", "publicId" and
+        "systemId" values and append it to the document."""
+        doctype = self.doctypeClass(token["name"],
+                                    token["publicId"],
+                                    token["systemId"])
+        self.document.appendChild(doctype)
+
+    def insertComment(self, token, parent=None):
+        """Append a comment node built from ``token["data"]``.
+
+        :arg parent: node to append to; defaults to the current node (the last
+            entry on the stack of open elements)
+        """
+        target = self.openElements[-1] if parent is None else parent
+        target.appendChild(self.commentClass(token["data"]))
+
+    def createElement(self, token):
+        """Create an element but don't insert it anywhere
+
+        The element takes its name and attributes from ``token["name"]`` and
+        ``token["data"]``. Its namespace is ``token["namespace"]`` when
+        present, otherwise the builder's default namespace.
+        """
+        element = self.elementClass(
+            token["name"],
+            token.get("namespace", self.defaultNamespace))
+        element.attributes = token["data"]
+        return element
+
+    def _getInsertFromTable(self):
+        """Return the flag last stored by the ``insertFromTable`` setter."""
+        return self._insertFromTable
+
+    def _setInsertFromTable(self, value):
+        """Switch the function used to insert an element from the
+        normal one to the misnested table one and back again"""
+        self._insertFromTable = value
+        # A truthy value selects the table-aware insertion method.
+        self.insertElement = (
+            self.insertElementTable if value else self.insertElementNormal)
+
+    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
+
+    def insertElementNormal(self, token):
+        """Create an element from ``token``, append it to the current node and
+        push it onto the stack of open elements.
+
+        :returns: the newly created element
+        """
+        tagName = token["name"]
+        assert isinstance(tagName, text_type), "Element %s not unicode" % tagName
+        newElement = self.elementClass(
+            tagName, token.get("namespace", self.defaultNamespace))
+        newElement.attributes = token["data"]
+        # The current node is the last entry on the stack of open elements.
+        self.openElements[-1].appendChild(newElement)
+        self.openElements.append(newElement)
+        return newElement
+
+    def insertElementTable(self, token):
+        """Create an element and insert it into the tree
+
+        When the current node's name is not in ``tableInsertModeElements`` this
+        simply delegates to ``insertElementNormal``. Otherwise the element is
+        placed at the position reported by ``getTableMisnestedNodePosition``
+        and pushed onto the stack of open elements.
+
+        :arg token: start tag token describing the element
+
+        :returns: the inserted element
+        """
+        if self.openElements[-1].name not in tableInsertModeElements:
+            # insertElementNormal builds its own element, so creating one
+            # here first would only produce a node that is thrown away.
+            return self.insertElementNormal(token)
+
+        # We should be in the InTable mode. This means we want to do
+        # special magic element rearranging
+        element = self.createElement(token)
+        parent, insertBefore = self.getTableMisnestedNodePosition()
+        if insertBefore is None:
+            parent.appendChild(element)
+        else:
+            parent.insertBefore(element, insertBefore)
+        self.openElements.append(element)
+        return element
+
+    def insertText(self, data, parent=None):
+        """Insert text data.
+
+        :arg data: the text to insert
+        :arg parent: node to insert into; defaults to the current node. It is
+            ignored when the text has to be relocated out of a table.
+        """
+        if parent is None:
+            parent = self.openElements[-1]
+
+        relocate = (self.insertFromTable and
+                    self.openElements[-1].name in tableInsertModeElements)
+        if not relocate:
+            parent.insertText(data)
+            return
+
+        # We should be in the InTable mode. This means we want to do
+        # special magic element rearranging
+        parent, insertBefore = self.getTableMisnestedNodePosition()
+        parent.insertText(data, insertBefore)
+
+    def getTableMisnestedNodePosition(self):
+        """Get the foster parent element, and sibling to insert before
+        (or None) when inserting a misnested table node
+
+        :returns: ``(fosterParent, insertBefore)``
+        """
+        # The foster parent element is the one which comes before the most
+        # recently opened table element
+        # XXX - this is really inelegant
+        lastTable = None
+        for candidate in self.openElements[::-1]:
+            if candidate.name == "table":
+                lastTable = candidate
+                break
+
+        if not lastTable:
+            # No table on the stack: fall back to the first open element.
+            return self.openElements[0], None
+
+        # XXX - we should really check that this parent is actually a
+        # node here
+        if lastTable.parent:
+            return lastTable.parent, lastTable
+
+        # Parentless table: use the element just below it on the stack.
+        tableIndex = self.openElements.index(lastTable)
+        return self.openElements[tableIndex - 1], None
+
+    def generateImpliedEndTags(self, exclude=None):
+        """Pop open elements whose end tags are implied.
+
+        Elements are popped from the stack of open elements for as long as the
+        current node's name is one of the implied-end-tag names and is not
+        ``exclude``.
+
+        :arg exclude: element name at which popping must stop, or ``None``
+        """
+        # XXX td, th and tr are not actually needed
+        impliedEndTagNames = frozenset(
+            ("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
+        # XXX This is not entirely what the specification says. We should
+        # investigate it more closely.
+        while True:
+            name = self.openElements[-1].name
+            if not (name in impliedEndTagNames and name != exclude):
+                return
+            self.openElements.pop()
+
+    def getDocument(self):
+        """Return the final tree
+
+        :returns: the object stored in ``self.document``
+        """
+        document = self.document
+        return document
+
+    def getFragment(self):
+        """Return the final fragment
+
+        A new fragment is created and the children of the first open element
+        are reparented into it.
+        """
+        # assert self.innerHTML
+        result = self.fragmentClass()
+        rootElement = self.openElements[0]
+        rootElement.reparentChildren(result)
+        return result
+
+    def testSerializer(self, node):
+        """Serialize the subtree of node in the format required by unit tests
+
+        This base implementation always raises; see the :raises: entry.
+
+        :arg node: the node from which to start serializing
+
+        :raises NotImplementedError: always
+
+        """
+        raise NotImplementedError
--- a/lib/bleach/_vendor/html5lib/treebuilders/dom.py
+++ b/lib/bleach/_vendor/html5lib/treebuilders/dom.py
@ -0,0 +1,239 @@
+from __future__ import absolute_import, division, unicode_literals
+
+
+try:
+    from collections.abc import MutableMapping
+except ImportError:  # Python 2.7
+    from collections import MutableMapping
+from xml.dom import minidom, Node
+import weakref
+
+from . import base
+from .. import constants
+from ..constants import namespaces
+from .._utils import moduleFactoryFactory
+
+
+def getDomBuilder(DomImplementation):
+    Dom = DomImplementation
+
+    class AttrList(MutableMapping):
+        """Mutable mapping view over the attributes of a wrapped DOM element.
+
+        Only plain names are supported as keys; tuple keys raise
+        ``NotImplementedError`` on get, set and delete.
+        """
+
+        def __init__(self, element):
+            self.element = element
+
+        def __len__(self):
+            return len(self.element.attributes)
+
+        def __iter__(self):
+            return iter(self.element.attributes.keys())
+
+        def __getitem__(self, name):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            return self.element.attributes[name].value
+
+        def __setitem__(self, name, value):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            # Build the attribute node on the element's owner document.
+            attr = self.element.ownerDocument.createAttribute(name)
+            attr.value = value
+            self.element.attributes[name] = attr
+
+        def __delitem__(self, name):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            del self.element.attributes[name]
+
+        def items(self):
+            return list(self.element.attributes.items())
+
+        def values(self):
+            return list(self.element.attributes.values())
+
+    class NodeBuilder(base.Node):
+        """Tree-builder node that wraps a DOM node kept in ``self.element``."""
+
+        def __init__(self, element):
+            base.Node.__init__(self, element.nodeName)
+            self.element = element
+
+        # The wrapped node's namespaceURI, or None when it is missing or falsy.
+        namespace = property(
+            lambda self: (self.element.namespaceURI or None)
+            if hasattr(self.element, "namespaceURI") else None)
+
+        def appendChild(self, node):
+            node.parent = self
+            self.element.appendChild(node.element)
+
+        def insertText(self, data, insertBefore=None):
+            textNode = self.element.ownerDocument.createTextNode(data)
+            if not insertBefore:
+                self.element.appendChild(textNode)
+            else:
+                self.element.insertBefore(textNode, insertBefore.element)
+
+        def insertBefore(self, node, refNode):
+            self.element.insertBefore(node.element, refNode.element)
+            node.parent = self
+
+        def removeChild(self, node):
+            # Only detach the DOM node if it really is a child of ours; the
+            # wrapper's parent pointer is cleared either way.
+            if node.element.parentNode == self.element:
+                self.element.removeChild(node.element)
+            node.parent = None
+
+        def reparentChildren(self, newParent):
+            source = self.element
+            while source.hasChildNodes():
+                moved = source.firstChild
+                source.removeChild(moved)
+                newParent.element.appendChild(moved)
+            self.childNodes = []
+
+        def getAttributes(self):
+            return AttrList(self.element)
+
+        def setAttributes(self, attributes):
+            if not attributes:
+                return
+            for name, value in list(attributes.items()):
+                if not isinstance(name, tuple):
+                    self.element.setAttribute(name, value)
+                    continue
+                # Tuple names are indexed as (prefix, local name, namespace).
+                if name[0] is not None:
+                    qualifiedName = (name[0] + ":" + name[1])
+                else:
+                    qualifiedName = name[1]
+                self.element.setAttributeNS(name[2], qualifiedName, value)
+
+        attributes = property(getAttributes, setAttributes)
+
+        def cloneNode(self):
+            # Shallow clone: children are not copied.
+            return NodeBuilder(self.element.cloneNode(False))
+
+        def hasContent(self):
+            return self.element.hasChildNodes()
+
+        def getNameTuple(self):
+            namespace = self.namespace
+            if namespace is None:
+                namespace = namespaces["html"]
+            return namespace, self.name
+
+        nameTuple = property(getNameTuple)
+
+    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
+        """Tree builder that constructs its tree with the ``Dom`` implementation."""
+
+        implementation = DomImplementation
+        name = None
+
+        def documentClass(self):
+            # The builder stands in for its own document node: the real DOM
+            # document lives in self.dom and a weak proxy to the builder is
+            # handed back as the document.
+            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
+            return weakref.proxy(self)
+
+        def elementClass(self, name, namespace=None):
+            if namespace is None and self.defaultNamespace is None:
+                return NodeBuilder(self.dom.createElement(name))
+            return NodeBuilder(self.dom.createElementNS(namespace, name))
+
+        def commentClass(self, data):
+            return NodeBuilder(self.dom.createComment(data))
+
+        def fragmentClass(self):
+            return NodeBuilder(self.dom.createDocumentFragment())
+
+        def insertDoctype(self, token):
+            doctype = Dom.getDOMImplementation().createDocumentType(
+                token["name"], token["publicId"], token["systemId"])
+            self.document.appendChild(NodeBuilder(doctype))
+            if Dom == minidom:
+                doctype.ownerDocument = self.dom
+
+        def appendChild(self, node):
+            self.dom.appendChild(node.element)
+
+        def insertText(self, data, parent=None):
+            if parent != self:
+                base.TreeBuilder.insertText(self, data, parent)
+                return
+
+            # HACK: allow text nodes as children of the document node
+            # pylint:disable=protected-access
+            if (hasattr(self.dom, '_child_node_types') and
+                    Node.TEXT_NODE not in self.dom._child_node_types):
+                self.dom._child_node_types = list(self.dom._child_node_types)
+                self.dom._child_node_types.append(Node.TEXT_NODE)
+            self.dom.appendChild(self.dom.createTextNode(data))
+
+        def getDocument(self):
+            return self.dom
+
+        def getFragment(self):
+            return base.TreeBuilder.getFragment(self).element
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+    def testSerializer(element):
+        """Serialize a DOM subtree to an indented, line-per-node text dump.
+
+        The node is normalized first, then every node contributes one line
+        (attributes get a line each, sorted by name). Lines are joined with
+        newlines.
+
+        :arg element: DOM node to start from
+
+        :returns: the serialized text
+        """
+        element.normalize()
+        rv = []
+
+        def serializeElement(element, indent=0):
+            # Emit the line(s) for this node, then recurse into its children
+            # two columns deeper.
+            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
+                if element.name:
+                    if element.publicId or element.systemId:
+                        publicId = element.publicId or ""
+                        systemId = element.systemId or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, element.name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif element.nodeType == Node.DOCUMENT_NODE:
+                rv.append("#document")
+            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+                rv.append("#document-fragment")
+            elif element.nodeType == Node.COMMENT_NODE:
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
+            elif element.nodeType == Node.TEXT_NODE:
+                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
+            else:
+                # Any other node type is written as an element; namespaced
+                # names are shown as "<prefix> <nodeName>".
+                if (hasattr(element, "namespaceURI") and
+                        element.namespaceURI is not None):
+                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
+                                      element.nodeName)
+                else:
+                    name = element.nodeName
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.hasAttributes():
+                    attributes = []
+                    for i in range(len(element.attributes)):
+                        attr = element.attributes.item(i)
+                        name = attr.nodeName
+                        value = attr.value
+                        ns = attr.namespaceURI
+                        if ns:
+                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
+                        else:
+                            name = attr.nodeName
+                        attributes.append((name, value))
+
+                    # Sorted so the output does not depend on attribute order.
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+            indent += 2
+            for child in element.childNodes:
+                serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
+    return locals()
+
+
+# The actual means to get a module!
+getDomModule = moduleFactoryFactory(getDomBuilder)
--- a/lib/bleach/_vendor/html5lib/treebuilders/etree.py
+++ b/lib/bleach/_vendor/html5lib/treebuilders/etree.py
@ -0,0 +1,343 @@
+from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
+from six import text_type
+
+import re
+
+from copy import copy
+
+from . import base
+from .. import _ihatexml
+from .. import constants
+from ..constants import namespaces
+from .._utils import moduleFactoryFactory
+
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+
+def getETreeBuilder(ElementTreeImplementation, fullTree=False):
+    ElementTree = ElementTreeImplementation
+    ElementTreeCommentType = ElementTree.Comment("asd").tag
+
+    class Element(base.Node):
+        """Tree-builder node that wraps an ElementTree element.
+
+        The wrapped element lives in ``self._element``. Child wrappers are
+        tracked separately in ``self._childNodes``.
+        """
+
+        def __init__(self, name, namespace=None):
+            self._name = name
+            self._namespace = namespace
+            self._element = ElementTree.Element(self._getETreeTag(name,
+                                                                  namespace))
+            # nameTuple always carries a namespace: HTML when none was given.
+            if namespace is None:
+                self.nameTuple = namespaces["html"], self._name
+            else:
+                self.nameTuple = self._namespace, self._name
+            self.parent = None
+            self._childNodes = []
+            self._flags = []
+
+        def _getETreeTag(self, name, namespace):
+            """Return the ElementTree tag: ``name`` or ``{namespace}name``."""
+            if namespace is None:
+                etree_tag = name
+            else:
+                etree_tag = "{%s}%s" % (namespace, name)
+            return etree_tag
+
+        def _setName(self, name):
+            self._name = name
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
+
+        def _getName(self):
+            return self._name
+
+        name = property(_getName, _setName)
+
+        def _setNamespace(self, namespace):
+            self._namespace = namespace
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
+
+        def _getNamespace(self):
+            return self._namespace
+
+        namespace = property(_getNamespace, _setNamespace)
+
+        def _getAttributes(self):
+            return self._element.attrib
+
+        def _setAttributes(self, attributes):
+            """Replace all attributes of the wrapped element.
+
+            Tuple keys are stored as ``{key[2]}key[1]``; other keys are used
+            unchanged.
+            """
+            el_attrib = self._element.attrib
+            el_attrib.clear()
+            if attributes:
+                # calling .items _always_ allocates, and the above truthy check is cheaper than the
+                # allocation on average
+                for key, value in attributes.items():
+                    if isinstance(key, tuple):
+                        name = "{%s}%s" % (key[2], key[1])
+                    else:
+                        name = key
+                    el_attrib[name] = value
+
+        attributes = property(_getAttributes, _setAttributes)
+
+        def _getChildNodes(self):
+            return self._childNodes
+
+        def _setChildNodes(self, value):
+            # Drop every existing child, then re-insert the new ones.
+            del self._element[:]
+            self._childNodes = []
+            for element in value:
+                self.insertChild(element)
+
+        childNodes = property(_getChildNodes, _setChildNodes)
+
+        def hasContent(self):
+            """Return true if the node has children or text"""
+            return bool(self._element.text or len(self._element))
+
+        def appendChild(self, node):
+            self._childNodes.append(node)
+            self._element.append(node._element)
+            node.parent = self
+
+        def insertBefore(self, node, refNode):
+            # NOTE(review): unlike appendChild, this does not record ``node``
+            # in self._childNodes - confirm that this is intended.
+            index = list(self._element).index(refNode._element)
+            self._element.insert(index, node._element)
+            node.parent = self
+
+        def removeChild(self, node):
+            self._childNodes.remove(node)
+            self._element.remove(node._element)
+            node.parent = None
+
+        def insertText(self, data, insertBefore=None):
+            """Add ``data`` to this element's text or to a child's tail.
+
+            ElementTree has no text nodes, so the text goes on ``.text`` of
+            this element or on ``.tail`` of the child preceding the insertion
+            point.
+            """
+            if not(len(self._element)):
+                if not self._element.text:
+                    self._element.text = ""
+                self._element.text += data
+            elif insertBefore is None:
+                # Insert the text as the tail of the last child element
+                if not self._element[-1].tail:
+                    self._element[-1].tail = ""
+                self._element[-1].tail += data
+            else:
+                # Insert the text before the specified node
+                children = list(self._element)
+                index = children.index(insertBefore._element)
+                if index > 0:
+                    if not self._element[index - 1].tail:
+                        self._element[index - 1].tail = ""
+                    self._element[index - 1].tail += data
+                else:
+                    if not self._element.text:
+                        self._element.text = ""
+                    self._element.text += data
+
+        def cloneNode(self):
+            """Return a childless copy with the same name, namespace and a
+            copy of the attributes."""
+            element = type(self)(self.name, self.namespace)
+            if self._element.attrib:
+                element._element.attrib = copy(self._element.attrib)
+            return element
+
+        def reparentChildren(self, newParent):
+            """Move this node's leading text and its children to ``newParent``.
+
+            The text is appended to the tail of ``newParent``'s last child, or
+            to ``newParent``'s own text when it has no children. This node's
+            text is then reset to ``""`` and the children are handed over by
+            ``base.Node.reparentChildren``.
+            """
+            text = self._element.text
+            if newParent.childNodes:
+                # ``text`` and ``tail`` are None until something is assigned,
+                # so a bare ``tail += text`` raises TypeError; treat a missing
+                # tail as empty and skip the append when there is no text.
+                if text is not None:
+                    lastChild = newParent.childNodes[-1]._element
+                    lastChild.tail = (lastChild.tail or "") + text
+            else:
+                if not newParent._element.text:
+                    newParent._element.text = ""
+                if text is not None:
+                    newParent._element.text += text
+            self._element.text = ""
+            base.Node.reparentChildren(self, newParent)
+
+    class Comment(Element):
+        """Node wrapping an ElementTree comment; ``data`` is the comment text."""
+
+        def __init__(self, data):
+            # Element.__init__ is deliberately not called: the wrapped
+            # object is a Comment, so the wrapper state is set up by hand.
+            self.parent = None
+            self._childNodes = []
+            self._flags = []
+            self._element = ElementTree.Comment(data)
+
+        def _getData(self):
+            return self._element.text
+
+        def _setData(self, value):
+            self._element.text = value
+
+        data = property(fget=_getData, fset=_setData)
+
+    class DocumentType(Element):
+        """Doctype node stored as an element tagged ``<!DOCTYPE>``.
+
+        The doctype name is kept in the element text; the public and system
+        identifiers are kept as the "publicId" and "systemId" attributes.
+        """
+
+        def __init__(self, name, publicId, systemId):
+            Element.__init__(self, name="<!DOCTYPE>")
+            self._element.text = name
+            self.publicId = publicId
+            self.systemId = systemId
+
+        def _getPublicId(self):
+            return self._element.get("publicId", "")
+
+        def _setPublicId(self, value):
+            # None means "no identifier": leave the attribute unset.
+            if value is None:
+                return
+            self._element.set("publicId", value)
+
+        publicId = property(fget=_getPublicId, fset=_setPublicId)
+
+        def _getSystemId(self):
+            return self._element.get("systemId", "")
+
+        def _setSystemId(self, value):
+            # None means "no identifier": leave the attribute unset.
+            if value is None:
+                return
+            self._element.set("systemId", value)
+
+        systemId = property(fget=_getSystemId, fset=_setSystemId)
+
+    class Document(Element):
+        """Document node: an element tagged ``DOCUMENT_ROOT``."""
+
+        def __init__(self):
+            Element.__init__(self, name="DOCUMENT_ROOT")
+
+    class DocumentFragment(Element):
+        """Fragment node: an element tagged ``DOCUMENT_FRAGMENT``."""
+
+        def __init__(self):
+            Element.__init__(self, name="DOCUMENT_FRAGMENT")
+
+    def testSerializer(element):
+        """Serialize an ElementTree subtree to an indented text dump.
+
+        Doctype, document-root, comment and ordinary elements each have their
+        own line format; attributes are written one per line, sorted by name.
+        Lines are joined with newlines.
+
+        :arg element: element (or object with ``getroot()``) to start from
+
+        :returns: the serialized text
+        """
+        rv = []
+
+        def serializeElement(element, indent=0):
+            # Objects without a tag are unwrapped through getroot().
+            if not(hasattr(element, "tag")):
+                element = element.getroot()
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
+                              (element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>" % (element.text,))
+            elif element.tag == "DOCUMENT_ROOT":
+                rv.append("#document")
+                if element.text is not None:
+                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+                if element.tail is not None:
+                    raise TypeError("Document node cannot have tail")
+                if hasattr(element, "attrib") and len(element.attrib):
+                    raise TypeError("Document node cannot have attributes")
+            elif element.tag == ElementTreeCommentType:
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
+            else:
+                assert isinstance(element.tag, text_type), \
+                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
+                # Tags of the form "{namespace}name" are shown as
+                # "<prefix> <name>".
+                nsmatch = tag_regexp.match(element.tag)
+
+                if nsmatch is None:
+                    name = element.tag
+                else:
+                    ns, name = nsmatch.groups()
+                    prefix = constants.prefixes[ns]
+                    name = "%s %s" % (prefix, name)
+                rv.append("|%s<%s>" % (' ' * indent, name))
+
+                if hasattr(element, "attrib"):
+                    attributes = []
+                    for name, value in element.attrib.items():
+                        nsmatch = tag_regexp.match(name)
+                        if nsmatch is not None:
+                            ns, name = nsmatch.groups()
+                            prefix = constants.prefixes[ns]
+                            attr_string = "%s %s" % (prefix, name)
+                        else:
+                            attr_string = name
+                        attributes.append((attr_string, value))
+
+                    # Sorted so the output does not depend on attribute order.
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                if element.text:
+                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+            indent += 2
+            for child in element:
+                serializeElement(child, indent)
+            # indent was already increased for the children, so the tail is
+            # written back at this element's own indentation.
+            if element.tail:
+                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
+    def tostring(element):  # pylint:disable=unused-variable
+        """Serialize an element and its child nodes to a string
+
+        :arg element: an element or an ``ElementTree`` (whose root is used)
+
+        :returns: the concatenated markup
+
+        :raises TypeError: if a document root element has a tail or attributes
+        """
+        rv = []
+        # Not called ``filter`` so the builtin stays usable in this scope.
+        infosetFilter = _ihatexml.InfosetFilter()
+
+        def serializeElement(element):
+            if isinstance(element, ElementTree.ElementTree):
+                element = element.getroot()
+
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
+                              (element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>" % (element.text,))
+            elif element.tag == "DOCUMENT_ROOT":
+                if element.text is not None:
+                    rv.append(element.text)
+                if element.tail is not None:
+                    raise TypeError("Document node cannot have tail")
+                if hasattr(element, "attrib") and len(element.attrib):
+                    raise TypeError("Document node cannot have attributes")
+
+                for child in element:
+                    serializeElement(child)
+
+            elif element.tag == ElementTreeCommentType:
+                rv.append("<!--%s-->" % (element.text,))
+            else:
+                # This is assumed to be an ordinary element
+                # Translate the tag once and reuse it for both the start and
+                # the end tag, so the two always agree whether or not the
+                # element has attributes.
+                # NOTE(review): fromXmlName presumably undoes the infoset
+                # name coercion - confirm against _ihatexml.
+                tagName = infosetFilter.fromXmlName(element.tag)
+                if not element.attrib:
+                    rv.append("<%s>" % (tagName,))
+                else:
+                    attr = " ".join(["%s=\"%s\"" % (
+                        infosetFilter.fromXmlName(name), value)
+                        for name, value in element.attrib.items()])
+                    rv.append("<%s %s>" % (tagName, attr))
+                if element.text:
+                    rv.append(element.text)
+
+                for child in element:
+                    serializeElement(child)
+
+                rv.append("</%s>" % (tagName,))
+
+            if element.tail:
+                rv.append(element.tail)
+
+        serializeElement(element)
+
+        return "".join(rv)
+
+    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
+        """Tree builder backed by the supplied ElementTree implementation."""
+
+        implementation = ElementTreeImplementation
+
+        # Node factories used by the base class.
+        documentClass = Document
+        doctypeClass = DocumentType
+        elementClass = Element
+        commentClass = Comment
+        fragmentClass = DocumentFragment
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+        def getDocument(self):
+            """Return the raw document root when ``fullTree`` is set,
+            otherwise the result of looking up its ``html`` child."""
+            documentRoot = self.document._element
+            if fullTree:
+                return documentRoot
+            if self.defaultNamespace is not None:
+                return documentRoot.find("{%s}html" % self.defaultNamespace)
+            return documentRoot.find("html")
+
+        def getFragment(self):
+            fragment = base.TreeBuilder.getFragment(self)
+            return fragment._element
+
+    return locals()
+
+
+getETreeModule = moduleFactoryFactory(getETreeBuilder)
--- a/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py
+++ b/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py
@ -0,0 +1,392 @@
+"""Module for supporting the lxml.etree library. The idea here is to use as much
+of the native library as possible, without using fragile hacks like custom element
+names that break between releases. The downside of this is that we cannot represent
+all possible trees; specifically the following are known to cause problems:
+
+Text or comments as siblings of the root element
+Doctypes with no name
+
+When any of these things occur, we emit a DataLossWarning
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
+import warnings
+import re
+import sys
+
+try:
+    from collections.abc import MutableMapping
+except ImportError:
+    from collections import MutableMapping
+
+from . import base
+from ..constants import DataLossWarning
+from .. import constants
+from . import etree as etree_builders
+from .. import _ihatexml
+
+import lxml.etree as etree
+from six import PY3, binary_type
+
+
+fullTree = True
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+comment_type = etree.Comment("asd").tag
+
+
+class DocumentType(object):
+    """Plain record holding a doctype's name and public/system identifiers."""
+
+    def __init__(self, name, publicId, systemId):
+        self.name, self.publicId, self.systemId = name, publicId, systemId
+
+
+class Document(object):
+    """Document node wrapping an element tree that is attached later.
+
+    ``_elementTree`` starts out as ``None`` and must be assigned before
+    ``appendChild`` is used; ``_childNodes`` is the list exposed read-only
+    through ``childNodes``.
+    """
+
+    def __init__(self):
+        self._childNodes = []
+        self._elementTree = None
+
+    def appendChild(self, element):
+        """Attach ``element._element`` after the last sibling of the root."""
+        root = self._elementTree.getroot()
+        # Walk to the final following sibling; stay on the root if there is none.
+        anchor = root
+        for sibling in root.itersiblings():
+            anchor = sibling
+
+        anchor.addnext(element._element)
+
+    def _getChildNodes(self):
+        """Return the list of child nodes recorded for this document."""
+        return self._childNodes
+
+    childNodes = property(_getChildNodes)
+
+
+def testSerializer(element):
+    # Render ``element`` as an indented, ``|``-prefixed outline, one node per
+    # line, and return the lines joined with newlines. Handles three inputs:
+    # objects exposing ``getroot`` (full trees), plain strings inside a
+    # fragment, and any other tag-less iterable (treated as a fragment).
+    rv = []
+    infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
+
+    def serializeElement(element, indent=0):
+        if not hasattr(element, "tag"):
+            if hasattr(element, "getroot"):
+                # Full tree case
+                rv.append("#document")
+                if element.docinfo.internalDTD:
+                    # Emit the short form unless a public or system id is set.
+                    if not (element.docinfo.public_id or
+                            element.docinfo.system_url):
+                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
+                    else:
+                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
+                            element.docinfo.root_name,
+                            element.docinfo.public_id,
+                            element.docinfo.system_url)
+                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
+                # Rewind to the first node preceding the root, then serialize
+                # every top-level sibling in document order.
+                next_element = element.getroot()
+                while next_element.getprevious() is not None:
+                    next_element = next_element.getprevious()
+                while next_element is not None:
+                    serializeElement(next_element, indent + 2)
+                    next_element = next_element.getnext()
+            elif isinstance(element, str) or isinstance(element, bytes):
+                # Text in a fragment
+                assert isinstance(element, str) or sys.version_info[0] == 2
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                # Fragment case
+                rv.append("#document-fragment")
+                for next_element in element:
+                    serializeElement(next_element, indent + 2)
+        elif element.tag == comment_type:
+            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
+            if hasattr(element, "tail") and element.tail:
+                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
+        else:
+            assert isinstance(element, etree._Element)
+            # Split a ``{namespace}local`` tag; namespaced tags are printed as
+            # "<prefix local>" using the prefix table in ``constants``.
+            nsmatch = etree_builders.tag_regexp.match(element.tag)
+            if nsmatch is not None:
+                ns = nsmatch.group(1)
+                tag = nsmatch.group(2)
+                prefix = constants.prefixes[ns]
+                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
+                                          infosetFilter.fromXmlName(tag)))
+            else:
+                rv.append("|%s<%s>" % (' ' * indent,
+                                       infosetFilter.fromXmlName(element.tag)))
+
+            if hasattr(element, "attrib"):
+                # Collect (display name, value) pairs first so they can be
+                # emitted in sorted order below.
+                attributes = []
+                for name, value in element.attrib.items():
+                    nsmatch = tag_regexp.match(name)
+                    if nsmatch is not None:
+                        ns, name = nsmatch.groups()
+                        name = infosetFilter.fromXmlName(name)
+                        prefix = constants.prefixes[ns]
+                        attr_string = "%s %s" % (prefix, name)
+                    else:
+                        attr_string = infosetFilter.fromXmlName(name)
+                    attributes.append((attr_string, value))
+
+                for name, value in sorted(attributes):
+                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+
+            if element.text:
+                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+            indent += 2
+            for child in element:
+                serializeElement(child, indent)
+            # Tail text belongs to the parent level, hence ``indent - 2``.
+            if hasattr(element, "tail") and element.tail:
+                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
+    serializeElement(element, 0)
+
+    return "\n".join(rv)
+
+
+def tostring(element):
+    """Return ``element`` and everything beneath it as one markup string.
+
+    Objects without a ``tag`` attribute are treated as whole trees: their
+    doctype (if an internal DTD is present) is written first, followed by the
+    root element.
+    """
+    pieces = []
+
+    def emit(node):
+        if not hasattr(node, "tag"):
+            # Tree object: optional doctype, then descend into the root.
+            info = node.docinfo
+            if info.internalDTD:
+                pieces.append(info.doctype or "<!DOCTYPE %s>" % info.root_name)
+            emit(node.getroot())
+
+        elif node.tag == comment_type:
+            pieces.append("<!--%s-->" % (node.text,))
+
+        else:
+            # Anything else is treated as an ordinary element.
+            if node.attrib:
+                rendered = " ".join(["%s=\"%s\"" % (key, val)
+                                     for key, val in node.attrib.items()])
+                pieces.append("<%s %s>" % (node.tag, rendered))
+            else:
+                pieces.append("<%s>" % (node.tag,))
+            if node.text:
+                pieces.append(node.text)
+
+            for kid in node:
+                emit(kid)
+
+            pieces.append("</%s>" % (node.tag,))
+
+        # Text that follows the node inside its parent.
+        if hasattr(node, "tail") and node.tail:
+            pieces.append(node.tail)
+
+    emit(element)
+
+    return "".join(pieces)
+
+
+# lxml-backed tree builder. Element and comment classes are created per
+# instance inside ``__init__`` because they close over that instance's
+# ``InfosetFilter``; the class-level ``None`` values are placeholders.
+class TreeBuilder(base.TreeBuilder):
+    documentClass = Document
+    doctypeClass = DocumentType
+    elementClass = None
+    commentClass = None
+    fragmentClass = Document
+    implementation = etree
+
+    def __init__(self, namespaceHTMLElements, fullTree=False):
+        # ``fullTree`` is forwarded to the generic etree builder module only;
+        # it is not stored on the instance.
+        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
+        infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
+        self.namespaceHTMLElements = namespaceHTMLElements
+
+        # Mapping view over the wrapped lxml element's ``attrib`` that coerces
+        # keys through the infoset filter before every lookup/assignment.
+        class Attributes(MutableMapping):
+            def __init__(self, element):
+                self._element = element
+
+            def _coerceKey(self, key):
+                # Tuple keys are rendered as "{key[2]}coerced(key[1])";
+                # plain keys are just coerced.
+                if isinstance(key, tuple):
+                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
+                else:
+                    name = infosetFilter.coerceAttribute(key)
+                return name
+
+            def __getitem__(self, key):
+                value = self._element._element.attrib[self._coerceKey(key)]
+                # On Python 2 a byte string value is decoded as ASCII.
+                if not PY3 and isinstance(value, binary_type):
+                    value = value.decode("ascii")
+                return value
+
+            def __setitem__(self, key, value):
+                self._element._element.attrib[self._coerceKey(key)] = value
+
+            def __delitem__(self, key):
+                del self._element._element.attrib[self._coerceKey(key)]
+
+            def __iter__(self):
+                return iter(self._element._element.attrib)
+
+            def __len__(self):
+                return len(self._element._element.attrib)
+
+            def clear(self):
+                return self._element._element.attrib.clear()
+
+        # Element wrapper that coerces names and character data on the way in
+        # and maps names back with ``fromXmlName`` on the way out.
+        class Element(builder.Element):
+            def __init__(self, name, namespace):
+                name = infosetFilter.coerceElement(name)
+                builder.Element.__init__(self, name, namespace=namespace)
+                self._attributes = Attributes(self)
+
+            def _setName(self, name):
+                self._name = infosetFilter.coerceElement(name)
+                self._element.tag = self._getETreeTag(
+                    self._name, self._namespace)
+
+            def _getName(self):
+                return infosetFilter.fromXmlName(self._name)
+
+            name = property(_getName, _setName)
+
+            def _getAttributes(self):
+                return self._attributes
+
+            def _setAttributes(self, value):
+                # Replace the contents in place so the existing ``Attributes``
+                # view stays bound to this element.
+                attributes = self.attributes
+                attributes.clear()
+                attributes.update(value)
+
+            attributes = property(_getAttributes, _setAttributes)
+
+            def insertText(self, data, insertBefore=None):
+                data = infosetFilter.coerceCharacters(data)
+                builder.Element.insertText(self, data, insertBefore)
+
+            def cloneNode(self):
+                # Copy name, namespace and attributes only (no children).
+                element = type(self)(self.name, self.namespace)
+                if self._element.attrib:
+                    element._element.attrib.update(self._element.attrib)
+                return element
+
+        # Comment wrapper whose data is coerced by the infoset filter.
+        class Comment(builder.Comment):
+            def __init__(self, data):
+                data = infosetFilter.coerceComment(data)
+                builder.Comment.__init__(self, data)
+
+            def _setData(self, data):
+                data = infosetFilter.coerceComment(data)
+                self._element.text = data
+
+            def _getData(self):
+                return self._element.text
+
+            data = property(_getData, _setData)
+
+        self.elementClass = Element
+        self.commentClass = Comment
+        # self.fragmentClass = builder.DocumentFragment
+        base.TreeBuilder.__init__(self, namespaceHTMLElements)
+
+    def reset(self):
+        # Until ``insertRoot`` runs, comments are only buffered (see
+        # ``insertCommentInitial``) and no doctype has been recorded.
+        base.TreeBuilder.reset(self)
+        self.insertComment = self.insertCommentInitial
+        self.initial_comments = []
+        self.doctype = None
+
+    def testSerializer(self, element):
+        return testSerializer(element)
+
+    def getDocument(self):
+        # NOTE(review): ``fullTree`` here resolves to the module-level flag,
+        # not to the ``fullTree`` argument accepted by ``__init__``.
+        if fullTree:
+            return self.document._elementTree
+        else:
+            return self.document._elementTree.getroot()
+
+    def getFragment(self):
+        # Flatten the first open element into a list: its leading text, its
+        # child elements, then its tail text (text parts only when non-empty).
+        fragment = []
+        element = self.openElements[0]._element
+        if element.text:
+            fragment.append(element.text)
+        fragment.extend(list(element))
+        if element.tail:
+            fragment.append(element.tail)
+        return fragment
+
+    def insertDoctype(self, token):
+        # Only records the doctype on ``self.doctype``; ``insertRoot`` is what
+        # later writes it into the document string.
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        if not name:
+            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
+            self.doctype = None
+        else:
+            coercedName = self.infosetFilter.coerceElement(name)
+            if coercedName != name:
+                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
+
+            doctype = self.doctypeClass(coercedName, publicId, systemId)
+            self.doctype = doctype
+
+    def insertCommentInitial(self, data, parent=None):
+        # Comment handler used before the root exists: just queue the token.
+        assert parent is None or parent is self.document
+        assert self.document._elementTree is None
+        self.initial_comments.append(data)
+
+    def insertCommentMain(self, data, parent=None):
+        # Comment handler used once the root exists. Warns when the comment
+        # targets the document and the root element's last child is itself a
+        # comment, then defers to the base class implementation.
+        if (parent == self.document and
+                self.document._elementTree.getroot()[-1].tag == comment_type):
+            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+        super(TreeBuilder, self).insertComment(data, parent)
+
+    def insertRoot(self, token):
+        # Because of the way libxml2 works, it doesn't seem to be possible to
+        # alter information like the doctype after the tree has been parsed.
+        # Therefore we need to use the built-in parser to create our initial
+        # tree, after which we can add elements like normal
+        docStr = ""
+        if self.doctype:
+            assert self.doctype.name
+            docStr += "<!DOCTYPE %s" % self.doctype.name
+            if (self.doctype.publicId is not None or
+                    self.doctype.systemId is not None):
+                docStr += (' PUBLIC "%s" ' %
+                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
+                if self.doctype.systemId:
+                    sysid = self.doctype.systemId
+                    # When both quote kinds occur, single quotes are replaced
+                    # with the literal text ``U00027`` so the value can be
+                    # wrapped in single quotes below.
+                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
+                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
+                        sysid = sysid.replace("'", 'U00027')
+                    # Pick whichever quote character the value does not contain.
+                    if sysid.find("'") >= 0:
+                        docStr += '"%s"' % sysid
+                    else:
+                        docStr += "'%s'" % sysid
+                else:
+                    docStr += "''"
+            docStr += ">"
+            if self.doctype.name != token["name"]:
+                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
+        # Placeholder root element; its tag is overwritten further down.
+        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
+        root = etree.fromstring(docStr)
+
+        # Append the initial comments:
+        for comment_token in self.initial_comments:
+            comment = self.commentClass(comment_token["data"])
+            root.addprevious(comment._element)
+
+        # Create the root document and add the ElementTree to it
+        self.document = self.documentClass()
+        self.document._elementTree = root.getroottree()
+
+        # Give the root element the right name
+        name = token["name"]
+        namespace = token.get("namespace", self.defaultNamespace)
+        if namespace is None:
+            etree_tag = name
+        else:
+            etree_tag = "{%s}%s" % (namespace, name)
+        root.tag = etree_tag
+
+        # Add the root element to the internal child/open data structures
+        root_element = self.elementClass(name, namespace)
+        root_element._element = root
+        self.document._childNodes.append(root_element)
+        self.openElements.append(root_element)
+
+        # Reset to the default insert comment function
+        self.insertComment = self.insertCommentMain
--- a/lib/bleach/_vendor/html5lib/treewalkers/init.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/init.py
@ -0,0 +1,154 @@
+"""A collection of modules for iterating through different kinds of
+tree, generating tokens identical to those produced by the tokenizer
+module.
+
+To create a tree walker for a new type of tree, you need to
+implement a tree walker object (called TreeWalker by convention) that
+implements a 'serialize' method which takes a tree as sole argument and
+returns an iterator which generates tokens.
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .. import constants
+from .._utils import default_etree
+
+__all__ = ["getTreeWalker", "pprint"]
+
+treeWalkerCache = {}
+
+
+def getTreeWalker(treeType, implementation=None, **kwargs):
+    """Look up the TreeWalker class for one of the built-in tree flavours
+
+    :arg str treeType: name of the tree flavour, matched case-insensitively.
+        Recognised names:
+
+        * "dom": trees built with xml.dom.minidom
+        * "etree": any tree exposing an elementtree-like interface (known to
+          work with ElementTree, cElementTree and lxml.etree)
+        * "lxml": walker specialised for lxml.etree
+        * "genshi": a Genshi stream
+
+    :arg implementation: module providing the tree type, e.g.
+        xml.etree.ElementTree or cElementTree. Only consulted for "etree";
+        when omitted the package default is used.
+
+    :arg kwargs: extra keyword arguments handed to the etree walker factory;
+        ignored for every other walker
+
+    :returns: a TreeWalker class, or ``None`` when the name is not recognised
+
+    """
+
+    key = treeType.lower()
+    if key in treeWalkerCache:
+        return treeWalkerCache[key]
+
+    if key == "etree":
+        from . import etree
+        chosen = default_etree if implementation is None else implementation
+        # XXX: NEVER cache here, caching is done in the etree submodule
+        return etree.getETreeModule(chosen, **kwargs).TreeWalker
+
+    # The remaining walkers are imported lazily and memoised by name.
+    if key == "dom":
+        from . import dom
+        treeWalkerCache[key] = dom.TreeWalker
+    elif key == "genshi":
+        from . import genshi
+        treeWalkerCache[key] = genshi.TreeWalker
+    elif key == "lxml":
+        from . import etree_lxml
+        treeWalkerCache[key] = etree_lxml.TreeWalker
+    return treeWalkerCache.get(key)
+
+
+def concatenateCharacterTokens(tokens):
+    """Yield ``tokens`` with adjacent character tokens merged.
+
+    Every run of consecutive "Characters"/"SpaceCharacters" tokens is replaced
+    by a single "Characters" token carrying the concatenated data; all other
+    tokens pass through untouched and in order.
+    """
+    buffered = []
+    for tok in tokens:
+        if tok["type"] in ("Characters", "SpaceCharacters"):
+            buffered.append(tok["data"])
+            continue
+        # A non-text token ends the current run: flush it first.
+        if buffered:
+            yield {"type": "Characters", "data": "".join(buffered)}
+            buffered = []
+        yield tok
+    if buffered:
+        yield {"type": "Characters", "data": "".join(buffered)}
+
+
+def pprint(walker):
+    """Render a tree walker's token stream as an indented outline
+
+    Character runs are merged first. Each start/empty tag is printed on its
+    own line followed by its attributes (sorted, one per line); start tags
+    deepen the indentation and end tags undo it.
+
+    :arg walker: a TreeWalker instance
+
+    :returns: the outline as one newline-joined string
+
+    """
+    def qualify(namespace, local):
+        # Show the registered short prefix when there is one, else the raw
+        # namespace value.
+        return "%s %s" % (constants.prefixes.get(namespace, namespace), local)
+
+    lines = []
+    depth = 0
+    for token in concatenateCharacterTokens(walker):
+        kind = token["type"]
+        pad = " " * depth
+
+        if kind in ("StartTag", "EmptyTag"):
+            # tag name (the HTML namespace is left implicit)
+            tag_ns = token["namespace"]
+            if tag_ns and tag_ns != constants.namespaces["html"]:
+                label = qualify(tag_ns, token["name"])
+            else:
+                label = token["name"]
+            lines.append("%s<%s>" % (pad, label))
+
+            # attributes, one level deeper, sorted for consistent ordering
+            attr_pad = " " * (depth + 2)
+            for (attr_ns, attr_local), value in sorted(token["data"].items()):
+                attr_label = qualify(attr_ns, attr_local) if attr_ns else attr_local
+                lines.append("%s%s=\"%s\"" % (attr_pad, attr_label, value))
+
+            # Empty tags get no EndTag, so only real start tags open a level.
+            if kind != "EmptyTag":
+                depth += 2
+
+        elif kind == "EndTag":
+            depth -= 2
+
+        elif kind == "Comment":
+            lines.append("%s<!-- %s -->" % (pad, token["data"]))
+
+        elif kind == "Doctype":
+            if not token["name"]:
+                lines.append("%s<!DOCTYPE >" % (pad,))
+            elif token["publicId"]:
+                lines.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                             (pad,
+                              token["name"],
+                              token["publicId"],
+                              token["systemId"] or ""))
+            elif token["systemId"]:
+                lines.append("""%s<!DOCTYPE %s "" "%s">""" %
+                             (pad,
+                              token["name"],
+                              token["systemId"]))
+            else:
+                lines.append("%s<!DOCTYPE %s>" % (pad, token["name"]))
+
+        elif kind == "Characters":
+            lines.append("%s\"%s\"" % (pad, token["data"]))
+
+        elif kind == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % kind)
+
+    return "\n".join(lines)
--- a/lib/bleach/_vendor/html5lib/treewalkers/base.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/base.py
@ -0,0 +1,252 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.dom import Node
+from ..constants import namespaces, voidElements, spaceCharacters
+
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+           "TreeWalker", "NonRecursiveTreeWalker"]
+
+DOCUMENT = Node.DOCUMENT_NODE
+DOCTYPE = Node.DOCUMENT_TYPE_NODE
+TEXT = Node.TEXT_NODE
+ELEMENT = Node.ELEMENT_NODE
+COMMENT = Node.COMMENT_NODE
+ENTITY = Node.ENTITY_NODE
+UNKNOWN = "<#UNKNOWN#>"
+
+spaceCharacters = "".join(spaceCharacters)
+
+
+class TreeWalker(object):
+    """Walks a tree yielding tokens
+
+    Tokens are dicts that all have a ``type`` field specifying the type of the
+    token. Subclasses implement ``__iter__``; the helper methods here only
+    build the token dicts.
+
+    """
+    def __init__(self, tree):
+        """Creates a TreeWalker
+
+        :arg tree: the tree to walk
+
+        """
+        self.tree = tree
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def error(self, msg):
+        """Generates an error token with the given message
+
+        :arg msg: the error message
+
+        :returns: SerializeError token
+
+        """
+        return {"type": "SerializeError", "data": msg}
+
+    def emptyTag(self, namespace, name, attrs, hasChildren=False):
+        """Generates an EmptyTag token
+
+        This is a generator: iterate over the result rather than using it as
+        a single token.
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :arg attrs: the attributes of the element as a dict
+
+        :arg hasChildren: whether or not to yield a SerializationError because
+            this tag shouldn't have children
+
+        :returns: iterator yielding the EmptyTag token, followed by a
+            SerializeError token when ``hasChildren`` is true
+
+        """
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
+               "data": attrs}
+        if hasChildren:
+            yield self.error("Void element has children")
+
+    def startTag(self, namespace, name, attrs):
+        """Generates a StartTag token
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :arg attrs: the attributes of the element as a dict
+
+        :returns: StartTag token
+
+        """
+        return {"type": "StartTag",
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}
+
+    def endTag(self, namespace, name):
+        """Generates an EndTag token
+
+        :arg namespace: the namespace of the token--can be ``None``
+
+        :arg name: the name of the element
+
+        :returns: EndTag token
+
+        """
+        return {"type": "EndTag",
+                "name": name,
+                "namespace": namespace}
+
+    def text(self, data):
+        """Generates SpaceCharacters and Characters tokens
+
+        Depending on what's in the data, this generates one or more
+        ``SpaceCharacters`` and ``Characters`` tokens.
+
+        For example:
+
+            >>> from html5lib.treewalkers.base import TreeWalker
+            >>> # Give it an empty tree just so it instantiates
+            >>> walker = TreeWalker([])
+            >>> list(walker.text(''))
+            []
+            >>> list(walker.text('  '))
+            [{u'data': '  ', u'type': u'SpaceCharacters'}]
+            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
+            [{u'data': ' ', u'type': u'SpaceCharacters'},
+            {u'data': u'abc', u'type': u'Characters'},
+            {u'data': u' ', u'type': u'SpaceCharacters'}]
+
+        :arg data: the text data
+
+        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
+
+        """
+        # Peel off leading whitespace first ...
+        middle = data.lstrip(spaceCharacters)
+        left = data[:len(data) - len(middle)]
+        if left:
+            yield {"type": "SpaceCharacters", "data": left}
+        # ... then split what remains into content and trailing whitespace.
+        data = middle
+        middle = data.rstrip(spaceCharacters)
+        right = data[len(middle):]
+        if middle:
+            yield {"type": "Characters", "data": middle}
+        if right:
+            yield {"type": "SpaceCharacters", "data": right}
+
+    def comment(self, data):
+        """Generates a Comment token
+
+        :arg data: the comment
+
+        :returns: Comment token
+
+        """
+        return {"type": "Comment", "data": data}
+
+    def doctype(self, name, publicId=None, systemId=None):
+        """Generates a Doctype token
+
+        :arg name: the doctype name
+
+        :arg publicId: the public identifier, if any
+
+        :arg systemId: the system identifier, if any
+
+        :returns: the Doctype token
+
+        """
+        return {"type": "Doctype",
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId}
+
+    def entity(self, name):
+        """Generates an Entity token
+
+        :arg name: the entity name
+
+        :returns: an Entity token
+
+        """
+        return {"type": "Entity", "name": name}
+
+    def unknown(self, nodeType):
+        """Handles unknown node types
+
+        :arg nodeType: identifier of the unrecognised node type; may be a
+            string or any other value (for example an integer node-type code)
+
+        :returns: SerializeError token naming the node type
+
+        """
+        # Use %-formatting rather than ``+`` so that non-string identifiers
+        # (such as integer DOM node types) do not raise TypeError.
+        return self.error("Unknown node type: %s" % (nodeType,))
+
+
+# Iterative (stack-free) depth-first walker. Subclasses supply the four
+# navigation hooks below; ``__iter__`` turns them into a token stream.
+class NonRecursiveTreeWalker(TreeWalker):
+    def getNodeDetails(self, node):
+        # Must return a tuple whose first item is one of the node-type
+        # constants and whose remaining items are that type's details
+        # (see how ``__iter__`` unpacks it).
+        raise NotImplementedError
+
+    def getFirstChild(self, node):
+        raise NotImplementedError
+
+    def getNextSibling(self, node):
+        raise NotImplementedError
+
+    def getParentNode(self, node):
+        raise NotImplementedError
+
+    def __iter__(self):
+        currentNode = self.tree
+        while currentNode is not None:
+            # --- Descend: emit the opening token(s) for the current node.
+            details = self.getNodeDetails(currentNode)
+            type, details = details[0], details[1:]
+            hasChildren = False
+
+            if type == DOCTYPE:
+                yield self.doctype(*details)
+
+            elif type == TEXT:
+                for token in self.text(*details):
+                    yield token
+
+            elif type == ELEMENT:
+                namespace, name, attributes, hasChildren = details
+                # Void elements (HTML or un-namespaced) become EmptyTag tokens
+                # and their children, if any, are not visited.
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    for token in self.emptyTag(namespace, name, attributes,
+                                               hasChildren):
+                        yield token
+                    hasChildren = False
+                else:
+                    yield self.startTag(namespace, name, attributes)
+
+            elif type == COMMENT:
+                yield self.comment(details[0])
+
+            elif type == ENTITY:
+                yield self.entity(details[0])
+
+            elif type == DOCUMENT:
+                hasChildren = True
+
+            else:
+                yield self.unknown(details[0])
+
+            if hasChildren:
+                firstChild = self.getFirstChild(currentNode)
+            else:
+                firstChild = None
+
+            if firstChild is not None:
+                currentNode = firstChild
+            else:
+                # --- Ascend: close elements while climbing until a next
+                # sibling is found or the starting node is reached again.
+                while currentNode is not None:
+                    details = self.getNodeDetails(currentNode)
+                    type, details = details[0], details[1:]
+                    if type == ELEMENT:
+                        namespace, name, attributes, hasChildren = details
+                        # Mirror of the void-element test above: only
+                        # elements that got a StartTag get an EndTag.
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
+                            yield self.endTag(namespace, name)
+                    # Never walk past the node the traversal started from.
+                    if self.tree is currentNode:
+                        currentNode = None
+                        break
+                    nextSibling = self.getNextSibling(currentNode)
+                    if nextSibling is not None:
+                        currentNode = nextSibling
+                        break
+                    else:
+                        currentNode = self.getParentNode(currentNode)
--- a/lib/bleach/_vendor/html5lib/treewalkers/dom.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/dom.py
@ -0,0 +1,43 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.dom import Node
+
+from . import base
+
+
+class TreeWalker(base.NonRecursiveTreeWalker):
+    """Tree walker over ``xml.dom``-style nodes."""
+
+    def getNodeDetails(self, node):
+        """Classify ``node`` and return its details tuple for the base walker."""
+        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
+            return base.DOCTYPE, node.name, node.publicId, node.systemId
+
+        if node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
+            return base.TEXT, node.nodeValue
+
+        if node.nodeType == Node.ELEMENT_NODE:
+            # Attribute keys are (namespace, name) pairs; un-namespaced
+            # attributes use ``None`` and their full name.
+            collected = {}
+            for attr_name in list(node.attributes.keys()):
+                attr_node = node.getAttributeNode(attr_name)
+                if attr_node.namespaceURI:
+                    key = (attr_node.namespaceURI, attr_node.localName)
+                else:
+                    key = (None, attr_node.name)
+                collected[key] = attr_node.value
+            return (base.ELEMENT, node.namespaceURI, node.nodeName,
+                    collected, node.hasChildNodes())
+
+        if node.nodeType == Node.COMMENT_NODE:
+            return base.COMMENT, node.nodeValue
+
+        if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
+            return (base.DOCUMENT,)
+
+        return base.UNKNOWN, node.nodeType
+
+    def getFirstChild(self, node):
+        """Return the node's first child (``node.firstChild``)."""
+        return node.firstChild
+
+    def getNextSibling(self, node):
+        """Return the node's following sibling (``node.nextSibling``)."""
+        return node.nextSibling
+
+    def getParentNode(self, node):
+        """Return the node's parent (``node.parentNode``)."""
+        return node.parentNode
--- a/lib/bleach/_vendor/html5lib/treewalkers/etree.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/etree.py
@ -0,0 +1,131 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from collections import OrderedDict
+import re
+
+from six import string_types
+
+from . import base
+from .._utils import moduleFactoryFactory
+
+# Splits an ElementTree-style qualified name "{namespace}name" into its
+# (namespace, name) parts; names without a leading "{...}" do not match.
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+
+def getETreeBuilder(ElementTreeImplementation):
+    """Build a ``TreeWalker`` class for the given ElementTree implementation.
+
+    :arg ElementTreeImplementation: module-like object providing ``Comment``
+
+    :returns: ``locals()`` of this function, i.e. a dict that contains the
+        ``TreeWalker`` class alongside the other local names defined here
+
+    """
+    ElementTree = ElementTreeImplementation
+    # Create a throwaway comment to learn which ``tag`` value this
+    # implementation assigns to comment nodes.
+    ElementTreeCommentType = ElementTree.Comment("asd").tag
+
+    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
+        """Given the particular ElementTree representation, this implementation,
+        to avoid using recursion, returns "nodes" as tuples with the following
+        content:
+
+        1. The current element
+
+        2. The index of the element relative to its parent
+
+        3. A stack of ancestor elements
+
+        4. A flag "text", "tail" or None to indicate if the current node is a
+           text node; either the text or tail of the current element (1)
+        """
+        def getNodeDetails(self, node):
+            """Classify ``node`` and return a tuple led by a ``base`` type constant.
+
+            ``node`` is either a 4-tuple as described on the class or a bare
+            object (the root).
+            """
+            if isinstance(node, tuple):  # It might be the root Element
+                elt, _, _, flag = node
+                if flag in ("text", "tail"):
+                    # the flag doubles as the attribute name holding the text
+                    return base.TEXT, getattr(elt, flag)
+                else:
+                    node = elt
+
+            # Objects without a ``tag`` are unwrapped via getroot()
+            # (presumably an ElementTree wrapper -- confirm against callers)
+            if not(hasattr(node, "tag")):
+                node = node.getroot()
+
+            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
+                return (base.DOCUMENT,)
+
+            elif node.tag == "<!DOCTYPE>":
+                return (base.DOCTYPE, node.text,
+                        node.get("publicId"), node.get("systemId"))
+
+            elif node.tag == ElementTreeCommentType:
+                return base.COMMENT, node.text
+
+            else:
+                assert isinstance(node.tag, string_types), type(node.tag)
+                # This is assumed to be an ordinary element
+                match = tag_regexp.match(node.tag)
+                if match:
+                    namespace, tag = match.groups()
+                else:
+                    namespace = None
+                    tag = node.tag
+                # Attributes are keyed by (namespace, name), in attrib order
+                attrs = OrderedDict()
+                for name, value in list(node.attrib.items()):
+                    match = tag_regexp.match(name)
+                    if match:
+                        attrs[(match.group(1), match.group(2))] = value
+                    else:
+                        attrs[(None, name)] = value
+                # Last item is truthy when the element has child elements or text
+                return (base.ELEMENT, namespace, tag,
+                        attrs, len(node) or node.text)
+
+        def getFirstChild(self, node):
+            """Return the first child "node" tuple of ``node`` or None.
+
+            The element's text, when non-empty, comes before its first child
+            element. Descending into a child element pushes the current
+            element onto the shared ``parents`` list in place.
+            """
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                # a bare object is the root: no index, empty ancestor stack
+                element, key, parents, flag = node, None, [], None
+
+            if flag in ("text", "tail"):
+                # text nodes have no children
+                return None
+            else:
+                if element.text:
+                    return element, key, parents, "text"
+                elif len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+
+        def getNextSibling(self, node):
+            """Return the "node" tuple following ``node`` or None.
+
+            A bare (non-tuple) node is the root and has no sibling.
+            """
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                return None
+
+            if flag == "text":
+                # after an element's text comes its first child element
+                if len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+            else:
+                # after an element comes its tail text (once), then the next
+                # element under the same parent, located by index
+                if element.tail and flag != "tail":
+                    return element, key, parents, "tail"
+                elif key < len(parents[-1]) - 1:
+                    return parents[-1][key + 1], key + 1, parents, None
+                else:
+                    return None
+
+        def getParentNode(self, node):
+            """Return the parent of ``node``; None for a bare (root) node.
+
+            For element and tail nodes this pops the top of the shared
+            ``parents`` list in place.
+            """
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                return None
+
+            if flag == "text":
+                # the parent of a text node is the element that owns the text
+                if not parents:
+                    return element
+                else:
+                    return element, key, parents, None
+            else:
+                parent = parents.pop()
+                if not parents:
+                    # no ancestors left: the parent is the root, returned bare
+                    return parent
+                else:
+                    # recover the parent's index within its own parent
+                    assert list(parents[-1]).count(parent) == 1
+                    return parent, list(parents[-1]).index(parent), parents, None
+
+    return locals()
+
+
+# Public factory derived from getETreeBuilder via _utils.moduleFactoryFactory
+# (presumably returns a module-like object per ElementTree implementation --
+# confirm against _utils).
+getETreeModule = moduleFactoryFactory(getETreeBuilder)
--- a/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py
@ -0,0 +1,215 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from collections import OrderedDict
+
+from lxml import etree
+from ..treebuilders.etree import tag_regexp
+
+from . import base
+
+from .. import _ihatexml
+
+
+def ensure_str(s):
+    """Return ``s`` as text.
+
+    ``None`` is passed through, ``text_type`` values are returned unchanged,
+    and anything else is decoded as strict ASCII (so non-ASCII bytes raise).
+    """
+    if s is None:
+        return None
+    elif isinstance(s, text_type):
+        return s
+    else:
+        return s.decode("ascii", "strict")
+
+
+class Root(object):
+    """Synthetic document root wrapping an lxml tree or element.
+
+    ``children`` holds an optional ``Doctype`` followed by every top-level
+    node: the root element together with its preceding and following
+    siblings.
+    """
+    def __init__(self, et):
+        self.elementtree = et
+        self.children = []
+
+        # Objects without ``docinfo`` simply get no doctype child
+        try:
+            if et.docinfo.internalDTD:
+                self.children.append(Doctype(self,
+                                             ensure_str(et.docinfo.root_name),
+                                             ensure_str(et.docinfo.public_id),
+                                             ensure_str(et.docinfo.system_url)))
+        except AttributeError:
+            pass
+
+        # Objects without ``getroot`` are used directly as the starting node
+        try:
+            node = et.getroot()
+        except AttributeError:
+            node = et
+
+        # Rewind to the first top-level sibling, then collect all of them in order
+        while node.getprevious() is not None:
+            node = node.getprevious()
+        while node is not None:
+            self.children.append(node)
+            node = node.getnext()
+
+        self.text = None
+        self.tail = None
+
+    def __getitem__(self, key):
+        return self.children[key]
+
+    def getnext(self):
+        # the root has no sibling
+        return None
+
+    def __len__(self):
+        # NOTE(review): always reports 1, independent of len(self.children)
+        return 1
+
+
+class Doctype(object):
+    """Synthetic doctype node attached to a ``Root``.
+
+    Carries the doctype ``name``, ``public_id`` and ``system_id``; ``text``
+    and ``tail`` are always None.
+    """
+    def __init__(self, root_node, name, public_id, system_id):
+        self.root_node = root_node
+        self.name = name
+        self.public_id = public_id
+        self.system_id = system_id
+
+        self.text = None
+        self.tail = None
+
+    def getnext(self):
+        # Root.__init__ appends the doctype before any other child, so the
+        # node following it is the root's second child.
+        return self.root_node.children[1]
+
+
+class FragmentRoot(Root):
+    """Synthetic root for a fragment given as a list of children.
+
+    Does not call ``Root.__init__``; every child is wrapped in a
+    ``FragmentWrapper`` that points back at this root.
+    """
+    def __init__(self, children):
+        self.children = [FragmentWrapper(self, child) for child in children]
+        self.text = self.tail = None
+
+    def getnext(self):
+        # the fragment root has no sibling
+        return None
+
+
+class FragmentWrapper(object):
+    """Wrapper around one top-level item of a fragment.
+
+    Gives the wrapped object sibling navigation within its ``FragmentRoot``
+    and delegates everything else to the wrapped object.
+    """
+    def __init__(self, fragment_root, obj):
+        self.root_node = fragment_root
+        self.obj = obj
+        # Wrapped objects lacking ``text``/``tail`` get None for them
+        if hasattr(self.obj, 'text'):
+            self.text = ensure_str(self.obj.text)
+        else:
+            self.text = None
+        if hasattr(self.obj, 'tail'):
+            self.tail = ensure_str(self.obj.tail)
+        else:
+            self.tail = None
+
+    def __getattr__(self, name):
+        # only consulted for attributes not found on the wrapper itself
+        return getattr(self.obj, name)
+
+    def getnext(self):
+        # siblings are the other wrappers held by the fragment root
+        siblings = self.root_node.children
+        idx = siblings.index(self)
+        if idx < len(siblings) - 1:
+            return siblings[idx + 1]
+        else:
+            return None
+
+    def __getitem__(self, key):
+        return self.obj[key]
+
+    def __bool__(self):
+        return bool(self.obj)
+
+    def getparent(self):
+        # top-level fragment items report no parent
+        return None
+
+    def __str__(self):
+        return str(self.obj)
+
+    def __unicode__(self):
+        return str(self.obj)
+
+    def __len__(self):
+        return len(self.obj)
+
+
+class TreeWalker(base.NonRecursiveTreeWalker):
+    """Tree walker over lxml trees and fragments.
+
+    Text is represented as ``(node, "text")`` or ``(node, "tail")`` tuples;
+    every other node is the lxml object or one of the wrapper classes
+    defined in this module.
+    """
+    def __init__(self, tree):
+        """Wrap ``tree`` in a synthetic root and set up the walker.
+
+        :arg tree: a list of fragment children, or an object accepted by
+            ``Root``
+
+        """
+        # pylint:disable=redefined-variable-type
+        if isinstance(tree, list):
+            # remember the original fragment items; getParentNode() reports
+            # no parent for them
+            self.fragmentChildren = set(tree)
+            tree = FragmentRoot(tree)
+        else:
+            self.fragmentChildren = set()
+            tree = Root(tree)
+        base.NonRecursiveTreeWalker.__init__(self, tree)
+        # used to map element names back via fromXmlName()
+        self.filter = _ihatexml.InfosetFilter()
+
+    def getNodeDetails(self, node):
+        """Classify ``node`` and return a tuple led by a ``base`` type constant."""
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            return base.TEXT, ensure_str(getattr(node, key))
+
+        elif isinstance(node, Root):
+            return (base.DOCUMENT,)
+
+        elif isinstance(node, Doctype):
+            return base.DOCTYPE, node.name, node.public_id, node.system_id
+
+        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
+            # a wrapped object without a tag is treated as bare text
+            return base.TEXT, ensure_str(node.obj)
+
+        elif node.tag == etree.Comment:
+            return base.COMMENT, ensure_str(node.text)
+
+        elif node.tag == etree.Entity:
+            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;
+
+        else:
+            # This is assumed to be an ordinary element
+            match = tag_regexp.match(ensure_str(node.tag))
+            if match:
+                namespace, tag = match.groups()
+            else:
+                namespace = None
+                tag = ensure_str(node.tag)
+            # Attributes are keyed by (namespace, name), in attrib order
+            attrs = OrderedDict()
+            for name, value in list(node.attrib.items()):
+                name = ensure_str(name)
+                value = ensure_str(value)
+                match = tag_regexp.match(name)
+                if match:
+                    attrs[(match.group(1), match.group(2))] = value
+                else:
+                    attrs[(None, name)] = value
+            # Last item is truthy when the element has child nodes or text
+            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
+                    attrs, len(node) > 0 or node.text)
+
+    def getFirstChild(self, node):
+        """Return the first child of ``node``: its text if any, else ``node[0]``."""
+        assert not isinstance(node, tuple), "Text nodes have no children"
+
+        assert len(node) or node.text, "Node has no children"
+        if node.text:
+            return (node, "text")
+        else:
+            return node[0]
+
+    def getNextSibling(self, node):
+        """Return the node following ``node`` at the same level, or None."""
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            if key == "text":
+                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
+                # because node[0] might evaluate to False if it has no child element
+                if len(node):
+                    return node[0]
+                else:
+                    return None
+            else:  # tail
+                return node.getnext()
+
+        # an element is followed by its tail text when it has one
+        return (node, "tail") if node.tail else node.getnext()
+
+    def getParentNode(self, node):
+        """Return the parent of ``node``; None for original fragment items."""
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            if key == "text":
+                # the parent of an element's text is the element itself
+                return node
+            # else: fallback to "normal" processing
+        elif node in self.fragmentChildren:
+            return None
+
+        return node.getparent()
--- a/lib/bleach/_vendor/html5lib/treewalkers/genshi.py
+++ b/lib/bleach/_vendor/html5lib/treewalkers/genshi.py
@ -0,0 +1,69 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from genshi.core import QName
+from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
+from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
+
+from . import base
+
+from ..constants import voidElements, namespaces
+
+
+class TreeWalker(base.TreeWalker):
+    """Tree walker that converts a genshi event stream into tokens."""
+    def __iter__(self):
+        """Yield the tokens for every event in ``self.tree``.
+
+        Each event is converted together with the event that follows it,
+        so conversion always runs one event behind the stream.
+        """
+        # Buffer the events so we can pass in the following one
+        previous = None
+        for event in self.tree:
+            if previous is not None:
+                for token in self.tokens(previous, event):
+                    yield token
+            previous = event
+
+        # Don't forget the final event!
+        if previous is not None:
+            for token in self.tokens(previous, None):
+                yield token
+
+    def tokens(self, event, next):
+        """Yield the token(s) for ``event``.
+
+        :arg event: ``(kind, data, position)`` tuple; the position is ignored
+        :arg next: the event following ``event``, or None at the end of the
+            stream (the name shadows the ``next`` builtin inside this method)
+
+        """
+        kind, data, _ = event
+        if kind == START:
+            tag, attribs = data
+            name = tag.localname
+            namespace = tag.namespace
+            # Attributes are keyed by (namespace, name); plain (non-QName)
+            # keys get None as the namespace.
+            converted_attribs = {}
+            for k, v in attribs:
+                if isinstance(k, QName):
+                    converted_attribs[(k.namespace, k.localname)] = v
+                else:
+                    converted_attribs[(None, k)] = v
+
+            if namespace == namespaces["html"] and name in voidElements:
+                # The last argument is truthy unless the very next event is
+                # this tag's END (presumably the "has children" flag --
+                # confirm against base.TreeWalker.emptyTag).
+                for token in self.emptyTag(namespace, name, converted_attribs,
+                                           not next or next[0] != END or
+                                           next[1] != tag):
+                    yield token
+            else:
+                yield self.startTag(namespace, name, converted_attribs)
+
+        elif kind == END:
+            name = data.localname
+            namespace = data.namespace
+            # HTML void elements were already emitted as empty tags on START
+            if namespace != namespaces["html"] or name not in voidElements:
+                yield self.endTag(namespace, name)
+
+        elif kind == COMMENT:
+            yield self.comment(data)
+
+        elif kind == TEXT:
+            for token in self.text(data):
+                yield token
+
+        elif kind == DOCTYPE:
+            yield self.doctype(*data)
+
+        # These event kinds produce no tokens. DOCTYPE is already handled by
+        # the branch above, so its entry in this tuple is never reached.
+        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
+                      START_CDATA, END_CDATA, PI):
+            pass
+
+        else:
+            yield self.unknown(kind)
--- a/lib/bleach/_vendor/parse.py
+++ b/lib/bleach/_vendor/parse.py
--- a/lib/bleach/_vendor/vendor.txt
+++ b/lib/bleach/_vendor/vendor.txt
@ -0,0 +1,3 @@
+html5lib==1.1 \
+    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
+    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
--- a/lib/bleach/_vendor/vendor_install.sh
+++ b/lib/bleach/_vendor/vendor_install.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+# Re-create the vendored dependencies:
+#   * installs the hash-pinned requirements from ${BLEACH_VENDOR_DIR}/vendor.txt
+#     into ${DEST}
+#   * downloads CPython 3.6.14's urllib/parse.py into ${DEST} and records its
+#     SHA-256 next to it
+#
+# Both BLEACH_VENDOR_DIR and DEST default to the current directory.
+
+# Abort on command failure, on use of an unset variable, and on a failure
+# anywhere in a pipeline.
+set -e
+set -u
+set -o pipefail
+
+BLEACH_VENDOR_DIR=${BLEACH_VENDOR_DIR:-"."}
+DEST=${DEST:-"."}
+
+# ":all:" is pip's selector for "every package"; a bare "all" would only
+# disable binaries for a package literally named "all".
+pip install --no-binary :all: --no-compile --no-deps -r "${BLEACH_VENDOR_DIR}/vendor.txt" --target "${DEST}"
+
+# install Python 3.6.14 urllib.parse for #536
+# --fail makes curl exit non-zero on an HTTP error instead of saving the
+# error page as parse.py (which would then be checksummed below).
+curl --fail --proto '=https' --tlsv1.2 -o "${DEST}/parse.py" https://raw.githubusercontent.com/python/cpython/v3.6.14/Lib/urllib/parse.py
+(cd "${DEST}" && sha256sum parse.py > parse.py.SHA256SUM)
--- a/lib/bleach/callbacks.py
+++ b/lib/bleach/callbacks.py
@ -1,20 +1,32 @@
 """A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals


 def nofollow(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    """Callback that adds ``nofollow`` to a link's ``rel`` attribute.
+
+    :arg attrs: dict of link attributes keyed by ``(namespace, name)``
+        tuples; modified in place
+    :arg new: not used by this callback
+
+    :returns: ``attrs``; left unchanged when there is no ``href`` or the
+        ``href`` starts with ``mailto:``
+
+    """
+    href_key = (None, "href")
+
+    if href_key not in attrs:
        return attrs
-    rel = [x for x in attrs.get('rel', '').split(' ') if x]
-    if 'nofollow' not in [x.lower() for x in rel]:
-        rel.append('nofollow')
-    attrs['rel'] = ' '.join(rel)
+
+    if attrs[href_key].startswith("mailto:"):
+        return attrs
+
+    rel_key = (None, "rel")
+    # keep existing rel values; the check for "nofollow" is case-insensitive
+    rel_values = [val for val in attrs.get(rel_key, "").split(" ") if val]
+    if "nofollow" not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append("nofollow")
+    attrs[rel_key] = " ".join(rel_values)

    return attrs


 def target_blank(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    """Callback that sets ``target="_blank"`` on a link.
+
+    :arg attrs: dict of link attributes keyed by ``(namespace, name)``
+        tuples; modified in place
+    :arg new: not used by this callback
+
+    :returns: ``attrs``; left unchanged when there is no ``href`` or the
+        ``href`` starts with ``mailto:``
+
+    """
+    href_key = (None, "href")
+
+    if href_key not in attrs:
        return attrs
-    attrs['target'] = '_blank'
+
+    if attrs[href_key].startswith("mailto:"):
+        return attrs
+
+    attrs[(None, "target")] = "_blank"
    return attrs
--- a/lib/bleach/encoding.py
+++ b/lib/bleach/encoding.py
@ -1,62 +0,0 @@
-import datetime
-from decimal import Decimal
-import types
-import six
-
-
-def is_protected_type(obj):
-    """Determine if the object instance is of a protected type.
-
-    Objects of protected types are preserved as-is when passed to
-    force_unicode(strings_only=True).
-    """
-    return isinstance(obj, (
-        six.integer_types +
-        (types.NoneType,
-         datetime.datetime, datetime.date, datetime.time,
-         float, Decimal))
-    )
-
-
-def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
-    """
-    Similar to smart_text, except that lazy instances are resolved to
-    strings, rather than kept as lazy objects.
-
-    If strings_only is True, don't convert (some) non-string-like objects.
-    """
-    # Handle the common case first, saves 30-40% when s is an instance of
-    # six.text_type. This function gets called often in that setting.
-    if isinstance(s, six.text_type):
-        return s
-    if strings_only and is_protected_type(s):
-        return s
-    try:
-        if not isinstance(s, six.string_types):
-            if hasattr(s, '__unicode__'):
-                s = s.__unicode__()
-            else:
-                if six.PY3:
-                    if isinstance(s, bytes):
-                        s = six.text_type(s, encoding, errors)
-                    else:
-                        s = six.text_type(s)
-                else:
-                    s = six.text_type(bytes(s), encoding, errors)
-        else:
-            # Note: We use .decode() here, instead of six.text_type(s,
-            # encoding, errors), so that if s is a SafeBytes, it ends up being
-            # a SafeText at the end.
-            s = s.decode(encoding, errors)
-    except UnicodeDecodeError as e:
-        if not isinstance(s, Exception):
-            raise UnicodeDecodeError(*e.args)
-        else:
-            # If we get to here, the caller has passed in an Exception
-            # subclass populated with non-ASCII bytestring data without a
-            # working unicode method. Try to handle this without raising a
-            # further exception by individually forcing the exception args
-            # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                          errors) for arg in s])
-    return s
--- a/lib/bleach/html5lib_shim.py
+++ b/lib/bleach/html5lib_shim.py
@ -0,0 +1,665 @@
+# flake8: noqa
+"""
+Shim module between Bleach and html5lib. This makes it easier to upgrade the
+html5lib library without having to change a lot of code.
+"""
+
+import re
+import string
+import warnings
+
+# ignore html5lib deprecation warnings to use bleach; we are bleach
+# apply before we import submodules that import html5lib
+warnings.filterwarnings(
+    "ignore",
+    message="html5lib's sanitizer is deprecated",
+    category=DeprecationWarning,
+    module="bleach._vendor.html5lib",
+)
+
+from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
+    HTMLParser,
+    getTreeWalker,
+)
+from bleach._vendor.html5lib import (
+    constants,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
+    namespaces,
+    prefixes,
+)
+from bleach._vendor.html5lib.constants import (
+    _ReparseException as ReparseException,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.base import (
+    Filter,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.sanitizer import (
+    allowed_protocols,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.filters.sanitizer import (
+    Filter as SanitizerFilter,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._inputstream import (
+    HTMLInputStream,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib.serializer import (
+    escape,
+    HTMLSerializer,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._tokenizer import (
+    attributeMap,
+    HTMLTokenizer,
+)  # noqa: E402 module level import not at top of file
+from bleach._vendor.html5lib._trie import (
+    Trie,
+)  # noqa: E402 module level import not at top of file
+
+
+#: Map of entity name to expanded entity
+ENTITIES = constants.entities
+
+#: Trie of html entity string -> character representation
+ENTITIES_TRIE = Trie(ENTITIES)
+
+#: Token type constants--these never change
+#: Set of the token types that represent tags (start, end and empty tags)
+TAG_TOKEN_TYPES = {
+    constants.tokenTypes["StartTag"],
+    constants.tokenTypes["EndTag"],
+    constants.tokenTypes["EmptyTag"],
+}
+#: Token type of character-data tokens
+CHARACTERS_TYPE = constants.tokenTypes["Characters"]
+#: Token type of parse-error tokens
+PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
+
+
+#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
+#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
+#:
+#: Entries are lowercase and kept in alphabetical order.
+#: NOTE(review): "main" does not appear in this list -- confirm whether the
+#: omission is intentional.
+HTML_TAGS = [
+    "a",
+    "abbr",
+    "address",
+    "area",
+    "article",
+    "aside",
+    "audio",
+    "b",
+    "base",
+    "bdi",
+    "bdo",
+    "blockquote",
+    "body",
+    "br",
+    "button",
+    "canvas",
+    "caption",
+    "cite",
+    "code",
+    "col",
+    "colgroup",
+    "data",
+    "datalist",
+    "dd",
+    "del",
+    "details",
+    "dfn",
+    "dialog",
+    "div",
+    "dl",
+    "dt",
+    "em",
+    "embed",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "head",
+    "header",
+    "hgroup",
+    "hr",
+    "html",
+    "i",
+    "iframe",
+    "img",
+    "input",
+    "ins",
+    "kbd",
+    "keygen",
+    "label",
+    "legend",
+    "li",
+    "link",
+    "map",
+    "mark",
+    "menu",
+    "meta",
+    "meter",
+    "nav",
+    "noscript",
+    "object",
+    "ol",
+    "optgroup",
+    "option",
+    "output",
+    "p",
+    "param",
+    "picture",
+    "pre",
+    "progress",
+    "q",
+    "rp",
+    "rt",
+    "ruby",
+    "s",
+    "samp",
+    "script",
+    "section",
+    "select",
+    "slot",
+    "small",
+    "source",
+    "span",
+    "strong",
+    "style",
+    "sub",
+    "summary",
+    "sup",
+    "table",
+    "tbody",
+    "td",
+    "template",
+    "textarea",
+    "tfoot",
+    "th",
+    "thead",
+    "time",
+    "title",
+    "tr",
+    "track",
+    "u",
+    "ul",
+    "var",
+    "video",
+    "wbr",
+]
+
+
+class InputStreamWithMemory:
+    """Wraps an HTMLInputStream to remember characters since last <
+
+    This wraps existing HTMLInputStream classes to keep track of the stream
+    since the last < which marked an open tag state.
+
+    """
+
+    def __init__(self, inner_stream):
+        """
+        :arg inner_stream: the stream object to wrap; reads and ungets are
+            forwarded to it
+
+        """
+        self._inner_stream = inner_stream
+        # expose the inner stream's bound methods directly under the same names
+        self.reset = self._inner_stream.reset
+        self.position = self._inner_stream.position
+        # characters seen since the last call to start_tag()
+        self._buffer = []
+
+    @property
+    def errors(self):
+        """The inner stream's ``errors`` attribute."""
+        return self._inner_stream.errors
+
+    @property
+    def charEncoding(self):
+        """The inner stream's ``charEncoding`` attribute."""
+        return self._inner_stream.charEncoding
+
+    @property
+    def changeEncoding(self):
+        """The inner stream's ``changeEncoding`` attribute."""
+        return self._inner_stream.changeEncoding
+
+    def char(self):
+        """Read one character from the inner stream and remember it."""
+        c = self._inner_stream.char()
+        # char() can return None if EOF, so ignore that
+        if c:
+            self._buffer.append(c)
+        return c
+
+    def charsUntil(self, characters, opposite=False):
+        """Forward to the inner stream's ``charsUntil`` and remember the result.
+
+        The returned characters are appended to the history one by one.
+        """
+        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
+        self._buffer.extend(list(chars))
+        return chars
+
+    def unget(self, char):
+        """Push ``char`` back onto the inner stream.
+
+        The most recently remembered character (if any) is dropped from the
+        history.
+        """
+        if self._buffer:
+            self._buffer.pop(-1)
+        return self._inner_stream.unget(char)
+
+    def get_tag(self):
+        """Returns the stream history since last '<'
+
+        Since the buffer starts at the last '<' as seen by tagOpenState(),
+        we know that everything from that point to when this method is called
+        is the "tag" that is being tokenized.
+
+        """
+        return "".join(self._buffer)
+
+    def start_tag(self):
+        """Resets stream history to just '<'
+
+        This gets called by tagOpenState() which marks a '<' that denotes an
+        open tag. Any time we see that, we reset the buffer.
+
+        """
+        self._buffer = ["<"]
+
+
+class BleachHTMLTokenizer(HTMLTokenizer):
+    """Tokenizer that doesn't consume character entities"""
+
+    def __init__(self, consume_entities=False, **kwargs):
+        """
+        :arg consume_entities: whether ``consumeEntity`` defers to the
+            html5lib implementation (True) or leaves entities untouched
+            (False)
+        :arg kwargs: forwarded unchanged to ``HTMLTokenizer.__init__``
+
+        """
+        super(BleachHTMLTokenizer, self).__init__(**kwargs)
+
+        self.consume_entities = consume_entities
+
+        # Wrap the stream with one that remembers the history
+        self.stream = InputStreamWithMemory(self.stream)
+
+    def __iter__(self):
+        """Yield the superclass's tokens, rewriting some of them.
+
+        Every ParseError token is held back until the token after it is
+        seen, so that the pair can be fixed up together; see the branches
+        below for the individual cases.
+        """
+        last_error_token = None
+
+        for token in super(BleachHTMLTokenizer, self).__iter__():
+            if last_error_token is not None:
+                if (
+                    last_error_token["data"] == "invalid-character-in-attribute-name"
+                    and token["type"] in TAG_TOKEN_TYPES
+                    and token.get("data")
+                ):
+                    # token["data"] is an html5lib attributeMap
+                    # (OrderedDict 3.7+ and dict otherwise)
+                    # of attr name to attr value
+                    #
+                    # Remove attribute names that have ', " or < in them
+                    # because those characters are invalid for attribute names.
+                    token["data"] = attributeMap(
+                        (attr_name, attr_value)
+                        for attr_name, attr_value in token["data"].items()
+                        if (
+                            '"' not in attr_name
+                            and "'" not in attr_name
+                            and "<" not in attr_name
+                        )
+                    )
+                    # the held-back error token is dropped, not yielded
+                    last_error_token = None
+                    yield token
+
+                elif (
+                    last_error_token["data"] == "expected-closing-tag-but-got-char"
+                    and self.parser.tags is not None
+                    and token["data"].lower().strip() not in self.parser.tags
+                ):
+                    # We've got either a malformed tag or a pseudo-tag or
+                    # something that html5lib wants to turn into a malformed
+                    # comment which Bleach clean() will drop so we interfere
+                    # with the token stream to handle it more correctly.
+                    #
+                    # If this is an allowed tag, it's malformed and we just let
+                    # the html5lib parser deal with it--we don't enter into this
+                    # block.
+                    #
+                    # If this is not an allowed tag, then we convert it to
+                    # characters and it'll get escaped in the sanitizer.
+                    token["data"] = self.stream.get_tag()
+                    token["type"] = CHARACTERS_TYPE
+
+                    # the held-back error token is dropped, not yielded
+                    last_error_token = None
+                    yield token
+
+                elif token["type"] == PARSEERROR_TYPE:
+                    # If the token is a parse error, then let the last_error_token
+                    # go, and make token the new last_error_token
+                    yield last_error_token
+                    last_error_token = token
+
+                else:
+                    # nothing to fix: emit the error and the token in order
+                    yield last_error_token
+                    yield token
+                    last_error_token = None
+
+                continue
+
+            # If the token is a ParseError, we hold on to it so we can get the
+            # next token and potentially fix it.
+            if token["type"] == PARSEERROR_TYPE:
+                last_error_token = token
+                continue
+
+            yield token
+
+        # flush an error token that was the last token of the stream
+        if last_error_token:
+            yield last_error_token
+
+    def consumeEntity(self, allowedChar=None, fromAttribute=False):
+        """Consume an entity, or put the ``&`` back when not consuming.
+
+        :arg allowedChar: passed through to the superclass when
+            ``self.consume_entities`` is set
+        :arg fromAttribute: whether the ``&`` was found inside an attribute
+            value
+
+        """
+        # If this tokenizer is set to consume entities, then we can let the
+        # superclass do its thing.
+        if self.consume_entities:
+            return super(BleachHTMLTokenizer, self).consumeEntity(
+                allowedChar, fromAttribute
+            )
+
+        # If this tokenizer is set to not consume entities, then we don't want
+        # to consume and convert them, so this overrides the html5lib tokenizer's
+        # consumeEntity so that it's now a no-op.
+        #
+        # However, when that gets called, it's consumed an &, so we put that back in
+        # the stream.
+        if fromAttribute:
+            # appended to item [1] of the last entry in the token's data
+            # (presumably the value of the attribute being built -- confirm
+            # against the html5lib tokenizer)
+            self.currentToken["data"][-1][1] += "&"
+
+        else:
+            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
+
+    def tagOpenState(self):
+        """Reset the stream history, then run the superclass's state."""
+        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
+        # or ParseError. In all cases, we want to drop any stream history
+        # we've collected so far and we do that by calling start_tag() on
+        # the input stream wrapper.
+        self.stream.start_tag()
+        return super(BleachHTMLTokenizer, self).tagOpenState()
+
+    def emitCurrentToken(self):
+        """Emit the current token, replacing disallowed tags with characters.
+
+        Tag tokens whose lowercased name is not in ``self.parser.tags`` are
+        queued as a Characters token instead; everything else is handled by
+        the superclass.
+        """
+        token = self.currentToken
+
+        if (
+            self.parser.tags is not None
+            and token["type"] in TAG_TOKEN_TYPES
+            and token["name"].lower() not in self.parser.tags
+        ):
+            # If this is a start/end/empty tag for a tag that's not in our
+            # allowed list, then it gets stripped or escaped. In both of these
+            # cases it gets converted to a Characters token.
+            if self.parser.strip:
+                # If we're stripping the token, we just throw in an empty
+                # string token.
+                new_data = ""
+
+            else:
+                # If we're escaping the token, we want to escape the exact
+                # original string. Since tokenizing also normalizes data
+                # and this is a tag-like thing, we've lost some information.
+                # So we go back through the stream to get the original
+                # string and use that.
+                new_data = self.stream.get_tag()
+
+            new_token = {"type": CHARACTERS_TYPE, "data": new_data}
+
+            # queue the replacement and continue tokenizing in the data state
+            self.currentToken = new_token
+            self.tokenQueue.append(new_token)
+            self.state = self.dataState
+            return
+
+        super(BleachHTMLTokenizer, self).emitCurrentToken()
+
+
+class BleachHTMLParser(HTMLParser):
+    """Parser that uses BleachHTMLTokenizer"""
+
+    def __init__(self, tags, strip, consume_entities, **kwargs):
+        """
+        :arg tags: list of allowed tags--everything else is either stripped or
+            escaped; if None, then this doesn't look at tags at all
+        :arg strip: whether to strip disallowed tags (True) or escape them (False);
+            if tags=None, then this doesn't have any effect
+        :arg consume_entities: whether to consume entities (default behavior) or
+            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
+        :arg kwargs: forwarded unchanged to ``HTMLParser.__init__``
+
+        """
+        # tag names are stored lowercased; the tokenizer lowercases names
+        # before checking them against this list
+        self.tags = [tag.lower() for tag in tags] if tags is not None else None
+        self.strip = strip
+        self.consume_entities = consume_entities
+        super(BleachHTMLParser, self).__init__(**kwargs)
+
+    def _parse(
+        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
+    ):
+        """Parse ``stream`` using a ``BleachHTMLTokenizer``.
+
+        :arg stream: the input handed to the tokenizer
+        :arg innerHTML: stored as ``self.innerHTMLMode``
+        :arg container: stored as ``self.container``
+        :arg scripting: stored as ``self.scripting``
+        :arg kwargs: forwarded unchanged to ``BleachHTMLTokenizer``
+
+        """
+        # set scripting=True to parse <noscript> as though JS is enabled to
+        # match the expected context in browsers
+        #
+        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
+        #
+        # Override HTMLParser so we can swap out the tokenizer for our own.
+        self.innerHTMLMode = innerHTML
+        self.container = container
+        self.scripting = scripting
+        self.tokenizer = BleachHTMLTokenizer(
+            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
+        )
+        self.reset()
+
+        # if a reparse is requested, reset and run the main loop once more
+        try:
+            self.mainLoop()
+        except ReparseException:
+            self.reset()
+            self.mainLoop()
+
+
+def convert_entity(value):
+    """Convert an entity (minus the & and ; part) into what it represents
+
+    This handles numeric, hex, and text entities.
+
+    Numeric entities are only converted when everything after the ``#`` (or
+    ``#x`` / ``#X``) consists of ASCII decimal (or hexadecimal) digits and the
+    resulting code point is greater than 0 and less than 0x110000. Malformed
+    input such as ``""``, ``"#"``, ``"#x"``, ``"#xZZ"`` or ``"#+65"`` yields
+    ``None`` rather than raising.
+
+    :arg value: the string (minus the ``&`` and ``;`` part) to convert
+
+    :returns: unicode character or None if it's an ambiguous ampersand that
+        doesn't match a character entity
+
+    """
+    if not value:
+        # Nothing between the "&" and the ";"
+        return None
+
+    if value[0] != "#":
+        # Text entity--look it up by name
+        return ENTITIES.get(value, None)
+
+    if len(value) < 2:
+        return None
+
+    if value[1] in ("x", "X"):
+        # hex-encoded code point
+        int_as_string, base = value[2:], 16
+        allowed = "0123456789abcdefABCDEF"
+    else:
+        # decimal code point
+        int_as_string, base = value[1:], 10
+        allowed = "0123456789"
+
+    # int() accepts signs, underscores, surrounding whitespace and non-ASCII
+    # digits and raises ValueError on anything else, so validate the digits
+    # ourselves before converting.
+    if int_as_string == "" or any(c not in allowed for c in int_as_string):
+        return None
+
+    code_point = int(int_as_string, base)
+    if 0 < code_point < 0x110000:
+        return chr(code_point)
+    return None
+
+
+def convert_entities(text):
+    """Converts all found entities in the text
+
+    The text is split at every ``&``. A part is only rewritten when
+    ``match_entity`` finds an entity at its start and ``convert_entity`` can
+    resolve it; every other part (including ambiguous ampersands) is kept as
+    is.
+
+    :arg text: the text to convert entities in
+
+    :returns: unicode text with converted entities
+
+    """
+    if "&" not in text:
+        return text
+
+    new_text = []
+    for part in next_possible_entity(text):
+        if not part:
+            continue
+
+        if part.startswith("&"):
+            entity = match_entity(part)
+            if entity is not None:
+                converted = convert_entity(entity)
+
+                # If it's not an ambiguous ampersand, then replace with the
+                # unicode character. Otherwise, we leave the entity in.
+                if converted is not None:
+                    new_text.append(converted)
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_text.append(remainder)
+                    continue
+
+        new_text.append(part)
+
+    return "".join(new_text)
+
+
+def match_entity(stream):
+    """Returns first entity in stream or None if no entity exists
+
+    Note: For Bleach purposes, entities must start with a "&" and end with
+    a ";". This ignores ambiguous character entities that have no ";" at the
+    end.
+
+    The stream is scanned in place by index and no characters are skipped:
+    if a character that cannot be part of the entity shows up before the
+    terminating ";" (for example the ``g`` in ``&#12g;``), there is no match
+    and ``None`` is returned.
+
+    :arg stream: the character stream
+
+    :returns: ``None`` or the entity string without "&" or ";"
+
+    :raises ValueError: if the stream does not begin with "&"
+
+    """
+    # Nix the & at the beginning
+    if stream[0] != "&":
+        raise ValueError('Stream should begin with "&"')
+
+    end_characters = "<&=;" + string.whitespace
+    length = len(stream)
+    pos = 1
+
+    # Handle number entities
+    if pos < length and stream[pos] == "#":
+        possible_entity = "#"
+        pos += 1
+
+        if pos < length and stream[pos] in ("x", "X"):
+            allowed = "0123456789abcdefABCDEF"
+            possible_entity += stream[pos]
+            pos += 1
+        else:
+            allowed = "0123456789"
+
+        # FIXME(willkg): Do we want to make sure these are valid number
+        # entities? This doesn't do that currently--"#" and "#x" with no
+        # digits are still returned when followed by ";".
+        while pos < length and stream[pos] not in end_characters:
+            if stream[pos] not in allowed:
+                # A non-digit before the ";" means this isn't a number
+                # entity. Bail out instead of dropping the character.
+                return None
+            possible_entity += stream[pos]
+            pos += 1
+
+        if pos < length and stream[pos] == ";":
+            return possible_entity
+        return None
+
+    # Handle character entities
+    possible_entity = ""
+    while pos < length and stream[pos] not in end_characters:
+        possible_entity += stream[pos]
+        pos += 1
+        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
+            # If it's not a prefix of any known entity, then it's not an
+            # entity and we're out
+            return None
+
+    if possible_entity and pos < length and stream[pos] == ";":
+        return possible_entity
+
+    return None
+
+
+AMP_SPLIT_RE = re.compile("(&)")
+
+
+def next_possible_entity(text):
+    """Splits a text at every "&" and generates the pieces
+
+    :arg text: the text to look at
+
+    :returns: generator where each part (except the first) starts with an
+        "&"
+
+    """
+    # Because the pattern has a capturing group, split() returns
+    # [before, "&", after, "&", after, ...]: the text pieces sit at the even
+    # indexes and the captured "&" separators at the odd ones.
+    pieces = AMP_SPLIT_RE.split(text)
+
+    yield pieces[0]
+    for piece in pieces[2::2]:
+        yield "&" + piece
+
+
+class BleachHTMLSerializer(HTMLSerializer):
+    """HTMLSerializer that undoes & -> &amp; in attributes and sets
+    escape_rcdata to True
+    """
+
+    # per the HTMLSerializer.__init__ docstring:
+    #
+    # Whether to escape characters that need to be
+    # escaped within normal elements within rcdata elements such as
+    # style.
+    #
+    escape_rcdata = True
+
+    def escape_base_amp(self, stoken):
+        """Escapes just bare & in HTML attribute values
+
+        This is a generator: it yields the pieces of the rewritten value.
+
+        """
+        # html5lib's HTMLSerializer expected the tokenizer to consume all the
+        # character entities and convert them to their respective characters,
+        # but the BleachHTMLTokenizer doesn't do that. So first undo the
+        # escaping of &, which turns &amp;entity; back into &entity; .
+        stoken = stoken.replace("&amp;", "&")
+
+        # Then walk the pieces and carefully re-escape every bare & that is
+        # not marking a character entity.
+        for chunk in next_possible_entity(stoken):
+            if not chunk:
+                continue
+
+            if chunk.startswith("&"):
+                name = match_entity(chunk)
+                # Only leave entities in that are not ambiguous. If they're
+                # ambiguous, then we escape the ampersand.
+                if name is not None and convert_entity(name) is not None:
+                    yield "&" + name + ";"
+
+                    # Skip the entity plus 2 characters--the & at the
+                    # beginning and the ; at the end
+                    rest = chunk[len(name) + 2 :]
+                    if rest:
+                        yield rest
+                    continue
+
+            yield chunk.replace("&", "&amp;")
+
+    def serialize(self, treewalker, encoding=None):
+        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values
+
+        Note that this converts & to &amp; in attribute values where the & isn't
+        already part of an unambiguous character entity.
+
+        """
+        in_tag = False
+        after_equals = False
+
+        for stoken in super().serialize(treewalker, encoding):
+            if not in_tag:
+                # Outside of a tag everything passes through; anything that
+                # starts with "<" puts us inside a tag.
+                if stoken.startswith("<"):
+                    in_tag = True
+                yield stoken
+                continue
+
+            if stoken == ">":
+                in_tag = False
+
+            elif after_equals:
+                # The '"' right after "=" passes through; the first other
+                # token is treated as the attribute value and gets its
+                # ampersands fixed up.
+                if stoken != '"':
+                    yield from self.escape_base_amp(stoken)
+
+                    after_equals = False
+                    continue
+
+            elif stoken == "=":
+                after_equals = True
+
+            yield stoken
--- a/lib/bleach/linkifier.py
+++ b/lib/bleach/linkifier.py
@ -0,0 +1,574 @@
+import re
+
+from bleach import callbacks as linkify_callbacks
+from bleach import html5lib_shim
+from bleach.utils import alphabetize_attributes
+
+
+#: List of default callbacks
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+# Top-level domains recognized by the default url and email regexes, written
+# in alphabetical order and split into a list of strings.
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+       xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first: reversing the
+# alphabetical list puts a longer TLD ahead of any shorter TLD that is a
+# prefix of it.
+TLDS.reverse()
+
+
+def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
+    """Builds the url regex used by linkifier
+
+    Both ``tlds`` and ``protocols`` are sorted and joined into regex
+    alternations. If you want a different set of tlds or allowed protocols,
+    pass those in and use the result in place of the default ``url_re``::
+
+        from bleach import linkifier
+
+        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+        linker = LinkifyFilter(url_re=my_url_re)
+
+    :returns: compiled regex (case-insensitive, verbose, unicode)
+
+    """
+    protocol_alternation = "|".join(sorted(protocols))
+    tld_alternation = "|".join(sorted(tlds))
+
+    # braces that belong to the regex are doubled below for the format string
+    pattern = r"""\(*  # Match any opening parentheses.
+        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+            # /path/zz (excluding "unsafe" chars from RFC 1738,
+            # except for # and ~, which happen in practice)
+        """.format(
+        protocol_alternation, tld_alternation
+    )
+    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE
+
+    return re.compile(pattern, flags)
+
+
+URL_RE = build_url_re()
+
+
+PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
+
+
+def build_email_re(tlds=TLDS):
+    """Builds the email regex used by linkifier
+
+    The ``tlds`` are joined, in the order given, into the regex alternation
+    that ends the domain part. If you want a different set of tlds, pass
+    those in and use the result in place of the default ``email_re``::
+
+        from bleach import linkifier
+
+        my_email_re = linkifier.build_email_re(my_tlds_list)
+
+        linker = LinkifyFilter(email_re=my_email_re)
+
+    :returns: compiled regex (case-insensitive, multiline, verbose)
+
+    """
+    tld_alternation = "|".join(tlds)
+
+    # open and closing braces doubled below for format string
+    pattern = r"""(?<!//)
+        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
+            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
+        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
+        """.format(
+        tld_alternation
+    )
+    flags = re.IGNORECASE | re.MULTILINE | re.VERBOSE
+
+    return re.compile(pattern, flags)
+
+
+EMAIL_RE = build_email_re()
+
+
+class Linker:
+    """Convert URL-like strings in an HTML fragment to links
+
+    A Linker converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    """
+
+    def __init__(
+        self,
+        callbacks=DEFAULT_CALLBACKS,
+        skip_tags=None,
+        parse_email=False,
+        url_re=URL_RE,
+        email_re=EMAIL_RE,
+        recognized_tags=html5lib_shim.HTML_TAGS,
+    ):
+        """Creates a Linker instance
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg list skip_tags: list of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``['pre']`` to skip
+            linkifying contents of ``pre`` tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg re url_re: url matching regex
+
+        :arg re email_re: email matching regex
+
+        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
+            everything else gets escaped
+
+        """
+        # Settings that get handed to a LinkifyFilter on every linkify() call
+        self.callbacks = callbacks
+        self.skip_tags = skip_tags
+        self.parse_email = parse_email
+        self.url_re = url_re
+        self.email_re = email_re
+
+        # The parser/tokenizer allows all the recognized tags and, because
+        # strip is False, escapes anything not in that list.
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=recognized_tags,
+            strip=False,
+            consume_entities=True,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            # linkify does not sanitize
+            sanitize=False,
+            # linkify alphabetizes the attributes itself, so the serializer
+            # is told not to
+            alphabetical_attributes=False,
+        )
+
+    def linkify(self, text):
+        """Linkify specified text
+
+        :arg str text: the text to add links to
+
+        :returns: linkified text as unicode; an empty ``text`` yields ``""``
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, str):
+            raise TypeError("argument must be of text type")
+
+        if not text:
+            return ""
+
+        # Parse the fragment, walk the resulting tree through the linkify
+        # filter and serialize the filtered token stream back to text
+        tree = self.parser.parseFragment(text)
+        token_stream = LinkifyFilter(
+            source=self.walker(tree),
+            callbacks=self.callbacks,
+            skip_tags=self.skip_tags,
+            parse_email=self.parse_email,
+            url_re=self.url_re,
+            email_re=self.email_re,
+        )
+        return self.serializer.render(token_stream)
+
+
+class LinkifyFilter(html5lib_shim.Filter):
+    """html5lib filter that linkifies text
+
+    This will do the following:
+
+    * convert email addresses into links
+    * convert urls into links
+    * edit existing links by running them through callbacks--the default is to
+      add a ``rel="nofollow"``
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+
+    def __init__(
+        self,
+        source,
+        callbacks=DEFAULT_CALLBACKS,
+        skip_tags=None,
+        parse_email=False,
+        url_re=URL_RE,
+        email_re=EMAIL_RE,
+    ):
+        """Creates a LinkifyFilter instance
+
+        :arg TreeWalker source: stream
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``; a falsy value
+            is stored as an empty list
+
+        :arg list skip_tags: list of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``['pre']`` to skip
+            linkifying contents of ``pre`` tags; a falsy value is stored as an
+            empty list
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg re url_re: url matching regex
+
+        :arg re email_re: email matching regex
+
+        """
+        super().__init__(source)
+
+        self.callbacks = callbacks or []
+        self.skip_tags = skip_tags or []
+        self.parse_email = parse_email
+
+        self.url_re = url_re
+        self.email_re = email_re
+
+    def apply_callbacks(self, attrs, is_new):
+        """Given an attrs dict and an is_new bool, runs through callbacks
+
+        Callbacks can return an adjusted attrs dict or ``None``. In the case of
+        ``None``, we stop going through callbacks and return that and the link
+        gets dropped.
+
+        :arg dict attrs: map of ``(namespace, name)`` -> ``value``
+
+        :arg bool is_new: whether or not this link was added by linkify
+
+        :returns: adjusted attrs dict or ``None``
+
+        """
+        for callback in self.callbacks:
+            attrs = callback(attrs, is_new)
+            if attrs is None:
+                # A callback vetoed the link--skip the remaining callbacks
+                return None
+        return attrs
+
+    def extract_character_data(self, token_list):
+        """Extracts and squashes character sequences in a token stream"""
+        # FIXME(willkg): This is a terrible idea. What it does is drop all the
+        # tags from the token list and merge the Characters and SpaceCharacters
+        # tokens into a single text.
+        #
+        # So something like this::
+        #
+        #     "<span>" "<b>" "some text" "</b>" "</span>"
+        #
+        # gets converted to "some text".
+        #
+        # This gets used to figure out the ``_text`` fauxttribute value for
+        # linkify callables.
+        #
+        # I'm not really sure how else to support that ``_text`` fauxttribute and
+        # maintain some modicum of backwards compatibility with previous versions
+        # of Bleach.
+        return "".join(
+            token["data"]
+            for token in token_list
+            if token["type"] in ["Characters", "SpaceCharacters"]
+        )
+
+    def handle_email_addresses(self, src_iter):
+        """Handle email addresses in character tokens"""
+        for token in src_iter:
+            # Only character tokens can contain email addresses
+            if token["type"] != "Characters":
+                yield token
+                continue
+
+            text = token["data"]
+            new_tokens = []
+            end = 0
+
+            # For each email address we find in the text
+            for match in self.email_re.finditer(text):
+                address = match.group(0)
+
+                # Keep whatever text sits between the previous match and
+                # this one
+                if match.start() > end:
+                    new_tokens.append(
+                        {"type": "Characters", "data": text[end : match.start()]}
+                    )
+
+                # Run attributes through the callbacks to see what we
+                # should do with this match
+                attrs = self.apply_callbacks(
+                    {
+                        (None, "href"): "mailto:%s" % address,
+                        "_text": address,
+                    },
+                    True,
+                )
+
+                if attrs is None:
+                    # Just add the text--but not as a link
+                    new_tokens.append({"type": "Characters", "data": address})
+
+                else:
+                    # Add an "a" tag for the new link
+                    _text = attrs.pop("_text", "")
+                    attrs = alphabetize_attributes(attrs)
+                    new_tokens.extend(
+                        [
+                            {"type": "StartTag", "name": "a", "data": attrs},
+                            {"type": "Characters", "data": str(_text)},
+                            {"type": "EndTag", "name": "a"},
+                        ]
+                    )
+                end = match.end()
+
+            if not new_tokens:
+                # Nothing matched, so pass the token through untouched
+                yield token
+                continue
+
+            # Yield the adjusted set of tokens, including any text left
+            # over after the last match
+            if end < len(text):
+                new_tokens.append({"type": "Characters", "data": text[end:]})
+
+            yield from new_tokens
+
+    def strip_non_url_bits(self, fragment):
+        """Strips non-url bits from the url
+
+        This accounts for over-eager matching by the regex.
+
+        :returns: ``(fragment, prefix, suffix)`` where prefix and suffix hold
+            the characters stripped from the beginning and the end
+
+        """
+        prefix = ""
+        suffix = ""
+
+        while fragment:
+            # Try removing ( from the beginning and, if it's balanced, from the
+            # end, too
+            if fragment.startswith("("):
+                prefix += "("
+                fragment = fragment[1:]
+
+                if fragment.endswith(")"):
+                    suffix = ")" + suffix
+                    fragment = fragment[:-1]
+                continue
+
+            last = fragment[-1]
+
+            # Now try extraneous things from the end. For example, sometimes we
+            # pick up ) at the end of a url, but the url is in a parenthesized
+            # phrase like:
+            #
+            #     "i looked at the site (at http://example.com)"
+            if last == ")" and "(" not in fragment:
+                fragment = fragment[:-1]
+                suffix = ")" + suffix
+                continue
+
+            # Handle commas and periods
+            if last in (",", "."):
+                fragment = fragment[:-1]
+                suffix = last + suffix
+                continue
+
+            # Nothing matched, so we're done
+            break
+
+        return fragment, prefix, suffix
+
+    def handle_links(self, src_iter):
+        """Handle links in character tokens"""
+        in_a = False  # happens, if parse_email=True and if a mail was found
+        for token in src_iter:
+            if in_a:
+                # Pass everything inside an "a" tag through untouched
+                if token["type"] == "EndTag" and token["name"] == "a":
+                    in_a = False
+                yield token
+                continue
+
+            if token["type"] == "StartTag" and token["name"] == "a":
+                in_a = True
+                yield token
+                continue
+
+            if token["type"] != "Characters":
+                yield token
+                continue
+
+            text = token["data"]
+            new_tokens = []
+            end = 0
+
+            for match in self.url_re.finditer(text):
+                # Keep whatever text sits between the previous match and
+                # this one
+                if match.start() > end:
+                    new_tokens.append(
+                        {"type": "Characters", "data": text[end : match.start()]}
+                    )
+
+                # Sometimes we pick up too much in the url match, so look for
+                # bits we should drop and remove them from the match
+                url, prefix, suffix = self.strip_non_url_bits(match.group(0))
+
+                # If there's no protocol, add one
+                href = url if PROTO_RE.search(url) else "http://%s" % url
+
+                attrs = self.apply_callbacks({(None, "href"): href, "_text": url}, True)
+
+                if attrs is None:
+                    # Just add the text
+                    new_tokens.append(
+                        {"type": "Characters", "data": prefix + url + suffix}
+                    )
+
+                else:
+                    # Add the "a" tag!
+                    if prefix:
+                        new_tokens.append({"type": "Characters", "data": prefix})
+
+                    _text = attrs.pop("_text", "")
+                    attrs = alphabetize_attributes(attrs)
+
+                    new_tokens.extend(
+                        [
+                            {"type": "StartTag", "name": "a", "data": attrs},
+                            {"type": "Characters", "data": str(_text)},
+                            {"type": "EndTag", "name": "a"},
+                        ]
+                    )
+
+                    if suffix:
+                        new_tokens.append({"type": "Characters", "data": suffix})
+
+                end = match.end()
+
+            if not new_tokens:
+                # Nothing matched, so pass the token through untouched
+                yield token
+                continue
+
+            # Yield the adjusted set of tokens, including any text left
+            # over after the last match
+            if end < len(text):
+                new_tokens.append({"type": "Characters", "data": text[end:]})
+
+            yield from new_tokens
+
+    def handle_a_tag(self, token_buffer):
+        """Handle the "a" tag
+
+        This could adjust the link or drop it altogether depending on what the
+        callbacks return.
+
+        This yields the new set of tokens.
+
+        :arg list token_buffer: the start "a" token, everything inside it and
+            the end "a" token
+
+        """
+        a_token = token_buffer[0]
+        attrs = a_token["data"] or {}
+
+        text = self.extract_character_data(token_buffer)
+        attrs["_text"] = text
+
+        attrs = self.apply_callbacks(attrs, False)
+
+        if attrs is None:
+            # We're dropping the "a" tag and everything else and replacing
+            # it with character data. So emit that token.
+            yield {"type": "Characters", "data": text}
+            return
+
+        new_text = attrs.pop("_text", "")
+        a_token["data"] = alphabetize_attributes(attrs)
+
+        # Either way the (possibly adjusted) start "a" token comes first
+        yield a_token
+
+        if text == new_text:
+            # The callbacks didn't change the text, so we yield whatever else
+            # was there, including the end "a" token
+            yield from token_buffer[1:]
+
+        else:
+            # If the callbacks changed the text, then we're going to drop
+            # all the tokens between the start and end "a" tags and replace
+            # it with the new text
+            yield {"type": "Characters", "data": str(new_text)}
+            yield token_buffer[-1]
+
+    def __iter__(self):
+        in_a = False
+        in_skip_tag = None
+
+        token_buffer = []
+
+        for token in super().__iter__():
+            if in_a:
+                # Handle the case where we're in an "a" tag--we want to buffer tokens
+                # until we hit an end "a" tag.
+                token_buffer.append(token)
+
+                if token["type"] == "EndTag" and token["name"] == "a":
+                    # The end tag is in the buffer now, so handle the whole
+                    # buffer and yield anything returned
+                    yield from self.handle_a_tag(token_buffer)
+
+                    # Clear "a" related state since we've yielded all the
+                    # tokens we're going to yield
+                    in_a = False
+                    token_buffer = []
+                continue
+
+            if token["type"] in ("StartTag", "EmptyTag"):
+                if token["name"] in self.skip_tags:
+                    # Skip tags start a "special mode" where we don't linkify
+                    # anything until the end tag.
+                    in_skip_tag = token["name"]
+
+                elif token["name"] == "a":
+                    # The "a" tag is special--we switch to a slurp mode and
+                    # slurp all the tokens until the end "a" tag and then
+                    # figure out what to do with them there.
+                    in_a = True
+                    token_buffer.append(token)
+
+                    # We buffer the start tag, so we don't want to yield it,
+                    # yet
+                    continue
+
+            elif in_skip_tag and self.skip_tags:
+                # NOTE(willkg): We put this clause here since in_a and
+                # switching in and out of in_a takes precedence.
+                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
+                    in_skip_tag = None
+
+            elif not in_skip_tag and token["type"] == "Characters":
+                # in_a is always False at this point because the in_a case
+                # above ends with a continue
+                new_stream = iter([token])
+                if self.parse_email:
+                    new_stream = self.handle_email_addresses(new_stream)
+
+                new_stream = self.handle_links(new_stream)
+
+                yield from new_stream
+
+                # We've already yielded this token, so continue
+                continue
+
+            yield token
--- a/lib/bleach/sanitizer.py
+++ b/lib/bleach/sanitizer.py
@ -1,148 +1,645 @@
-from __future__ import unicode_literals
+from itertools import chain
 import re
-from xml.sax.saxutils import escape, unescape
+import warnings

-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
+from bleach._vendor.parse import urlparse
+from xml.sax.saxutils import unescape
+
+from bleach import html5lib_shim
+from bleach.utils import alphabetize_attributes


-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
+#: List of allowed tags
+ALLOWED_TAGS = [
+    "a",
+    "abbr",
+    "acronym",
+    "b",
+    "blockquote",
+    "code",
+    "em",
+    "i",
+    "li",
+    "ol",
+    "strong",
+    "ul",
+]


-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
+#: Map of allowed attributes by tag
+ALLOWED_ATTRIBUTES = {
+    "a": ["href", "title"],
+    "abbr": ["title"],
+    "acronym": ["title"],
+}

-    allowed_svg_properties = []
+#: List of allowed styles
+ALLOWED_STYLES = []
+
+#: List of allowed protocols
+ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+
+#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
+INVISIBLE_CHARACTERS = "".join(
+    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
+)
+
+#: Regexp for characters that are invisible
+INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
+
+#: String to replace invisible characters with. This can be a character, a
+#: string, or even a function that takes a Python re matchobj
+INVISIBLE_REPLACEMENT_CHAR = "?"
+
+
+class Cleaner:
+    """Cleaner for cleaning HTML fragments of malicious content
+
+    This cleaner is a security-focused function whose sole purpose is to remove
+    malicious content from a string such that it can be displayed as content in
+    a web page.
+
+    To use::
+
+        from bleach.sanitizer import Cleaner
+
+        cleaner = Cleaner()
+
+        for text in all_the_yucky_things:
+            sanitized = cleaner.clean(text)
+
+    .. Note::
+
+       This cleaner is not designed to be used to transform content for use in
+       non-web-page contexts.
+
+    .. Warning::
+
+       This cleaner is not thread-safe--the html parser has internal state.
+       Create a separate cleaner per thread!
+
+
+    """
+
+    def __init__(
+        self,
+        tags=ALLOWED_TAGS,
+        attributes=ALLOWED_ATTRIBUTES,
+        styles=ALLOWED_STYLES,
+        protocols=ALLOWED_PROTOCOLS,
+        strip=False,
+        strip_comments=True,
+        filters=None,
+    ):
+        """Initializes a Cleaner
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip: whether or not to strip disallowed elements
+
+        :arg bool strip_comments: whether or not to strip HTML comments
+
+        :arg list filters: list of html5lib Filter classes to pass streamed content through
+
+            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+            .. Warning::
+
+               Using filters changes the output of ``bleach.Cleaner.clean``.
+               Make sure the way the filters change the output is secure.
+
+        """
+        self.tags = tags
+        self.attributes = attributes
+        self.styles = styles
+        self.protocols = protocols
+        self.strip = strip
+        self.strip_comments = strip_comments
+        self.filters = filters or []
+
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=self.tags,
+            strip=self.strip,
+            consume_entities=False,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            escape_lt_in_attrs=True,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
+            # Bleach has its own sanitizer, so don't use the html5lib one
+            sanitize=False,
+            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            alphabetical_attributes=False,
+        )
+
+    def clean(self, text):
+        """Cleans text and returns sanitized result as unicode
+
+        :arg str text: text to be cleaned
+
+        :returns: sanitized text as unicode
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, str):
+            message = (
+                "argument cannot be of '{name}' type, must be of text type".format(
+                    name=text.__class__.__name__
+                )
+            )
+            raise TypeError(message)
+
+        if not text:
+            return ""
+
+        dom = self.parser.parseFragment(text)
+        filtered = BleachSanitizerFilter(
+            source=self.walker(dom),
+            # Bleach-sanitizer-specific things
+            attributes=self.attributes,
+            strip_disallowed_elements=self.strip,
+            strip_html_comments=self.strip_comments,
+            # html5lib-sanitizer things
+            allowed_elements=self.tags,
+            allowed_css_properties=self.styles,
+            allowed_protocols=self.protocols,
+            allowed_svg_properties=[],
+        )
+
+        # Apply any filters after the BleachSanitizerFilter
+        for filter_class in self.filters:
+            filtered = filter_class(source=filtered)
+
+        return self.serializer.render(filtered)
+
+
+def attribute_filter_factory(attributes):
+    """Generates attribute filter function for the given attributes value
+
+    The attributes value can take one of several shapes. This returns a filter
+    function appropriate to the attributes value. One nice thing about this is
+    that there's less if/then shenanigans in the ``allow_token`` method.
+
+    """
+    if callable(attributes):
+        return attributes
+
+    if isinstance(attributes, dict):
+
+        def _attr_filter(tag, attr, value):
+            if tag in attributes:
+                attr_val = attributes[tag]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                if attr in attr_val:
+                    return True
+
+            if "*" in attributes:
+                attr_val = attributes["*"]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                return attr in attr_val
+
+            return False
+
+        return _attr_filter
+
+    if isinstance(attributes, list):
+
+        def _attr_filter(tag, attr, value):
+            return attr in attributes
+
+        return _attr_filter
+
+    raise ValueError("attributes needs to be a callable, a list or a dict")
+
+
+class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
+    """html5lib Filter that sanitizes text
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+
+    def __init__(
+        self,
+        source,
+        attributes=ALLOWED_ATTRIBUTES,
+        strip_disallowed_elements=False,
+        strip_html_comments=True,
+        **kwargs,
+    ):
+        """Creates a BleachSanitizerFilter instance
+
+        :arg Treewalker source: stream
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip_disallowed_elements: whether or not to strip disallowed
+            elements
+
+        :arg bool strip_html_comments: whether or not to strip HTML comments
+
+        """
+        self.attr_filter = attribute_filter_factory(attributes)
+        self.strip_disallowed_elements = strip_disallowed_elements
+        self.strip_html_comments = strip_html_comments
+
+        # Ignore html5lib's "sanitizer is deprecated" DeprecationWarning before calling the parent init
+        warnings.filterwarnings(
+            "ignore",
+            message="html5lib's sanitizer is deprecated",
+            category=DeprecationWarning,
+            module="bleach._vendor.html5lib",
+        )
+        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
+
+    def sanitize_stream(self, token_iterator):
+        for token in token_iterator:
+            ret = self.sanitize_token(token)
+
+            if not ret:
+                continue
+
+            if isinstance(ret, list):
+                for subtoken in ret:
+                    yield subtoken
+            else:
+                yield ret
+
+    def merge_characters(self, token_iterator):
+        """Merge consecutive Characters tokens in a stream"""
+        characters_buffer = []
+
+        for token in token_iterator:
+            if characters_buffer:
+                if token["type"] == "Characters":
+                    characters_buffer.append(token)
+                    continue
+                else:
+                    # Merge all the characters tokens together into one and then
+                    # operate on it.
+                    new_token = {
+                        "data": "".join(
+                            [char_token["data"] for char_token in characters_buffer]
+                        ),
+                        "type": "Characters",
+                    }
+                    characters_buffer = []
+                    yield new_token
+
+            elif token["type"] == "Characters":
+                characters_buffer.append(token)
+                continue
+
+            yield token
+
+        new_token = {
+            "data": "".join([char_token["data"] for char_token in characters_buffer]),
+            "type": "Characters",
+        }
+        yield new_token
+
+    def __iter__(self):
+        return self.merge_characters(
+            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
+        )

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

-        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
-        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
+        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
+        ['attribute', 'pairs'], 'tag': callable}.

-        Here callable is a function with two arguments of attribute name
-        and value. It should return true of false.
+        Here callable is a function with three arguments of tag name, attribute
+        name and value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

-        """
-        if (getattr(self, 'wildcard_attributes', None) is None and
-                isinstance(self.allowed_attributes, dict)):
-            self.wildcard_attributes = self.allowed_attributes.get('*', [])
+        :arg dict token: token to sanitize
+
+        :returns: token or list of tokens
+
+        """
+        token_type = token["type"]
+        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                return self.allow_token(token)

-        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
-                             tokenTypes['EmptyTag']):
-            if token['name'] in self.allowed_elements:
-                if 'data' in token:
-                    if isinstance(self.allowed_attributes, dict):
-                        allowed_attributes = self.allowed_attributes.get(
-                            token['name'], [])
-                        #print callable(allowed_attributes)
-                        if not callable(allowed_attributes):
-                            allowed_attributes += self.wildcard_attributes
-                    else:
-                        allowed_attributes = self.allowed_attributes
-                    attrs = dict([(name, val) for name, val in
-                                  token['data'][::-1]
-                                  if (allowed_attributes(name, val)
-                                      if callable(allowed_attributes)
-                                      else name in allowed_attributes)])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        # Remove replacement characters from unescaped
-                        # characters.
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
-                            and (val_unescaped.split(':')[0] not in
-                                 self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token['name'] in self.svg_allow_local_href and
-                            'xlink:href' in attrs and
-                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token['data'] = [(name, val) for name, val in
-                                     attrs.items()]
-                return token
            elif self.strip_disallowed_elements:
-                pass
+                return None
+
            else:
-                if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
-                elif token['data']:
-                    attr = ' {0!s}="{1!s}"'
-                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
-                                    token['data']])
-                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
-                else:
-                    token['data'] = '<{0!s}>'.format(token['name'])
-                if token['selfClosing']:
-                    token['data'] = token['data'][:-1] + '/>'
-                token['type'] = tokenTypes['Characters']
-                del token["name"]
-                return token
-        elif token['type'] == tokenTypes['Comment']:
+                if "data" in token:
+                    # Alphabetize the attributes before calling .disallowed_token()
+                    # so that the resulting string is stable
+                    token["data"] = alphabetize_attributes(token["data"])
+                return self.disallowed_token(token)
+
+        elif token_type == "Comment":
            if not self.strip_html_comments:
+                # call xml.sax.saxutils to escape &, <, and > in addition to " and '
+                token["data"] = html5lib_shim.escape(
+                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
+                )
                return token
            else:
+                return None
+
+        elif token_type == "Characters":
+            return self.sanitize_characters(token)
+
+        else:
+            return token
+
+    def sanitize_characters(self, token):
+        """Handles Characters tokens
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: a token or a list of tokens
+
+        """
+        data = token.get("data", "")
+
+        if not data:
+            return token
+
+        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
+        token["data"] = data
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            return token
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerFilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        return new_tokens
+
+    def sanitize_uri_value(self, value, allowed_protocols):
+        """Checks a uri value to see if it's allowed
+
+        :arg value: the uri value to sanitize
+        :arg allowed_protocols: list of allowed protocols
+
+        :returns: allowed value or None
+
+        """
+        # NOTE(willkg): This transforms the value into one that's easier to
+        # match and verify, but shouldn't get returned since it's vastly
+        # different than the original value.
+
+        # Convert all character entities in the value
+        new_value = html5lib_shim.convert_entities(value)
+
+        # Nix backtick, space characters, and control characters
+        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
+
+        # Remove REPLACEMENT characters
+        new_value = new_value.replace("\ufffd", "")
+
+        # Lowercase it--this breaks the value, but makes it easier to match
+        # against
+        new_value = new_value.lower()
+
+        try:
+            # Drop attributes with uri values that have protocols that aren't
+            # allowed
+            parsed = urlparse(new_value)
+        except ValueError:
+            # URI is impossible to parse, therefore it's not allowed
+            return None
+
+        if parsed.scheme:
+            # If urlparse found a scheme, check that
+            if parsed.scheme in allowed_protocols:
+                return value
+
+        else:
+            # Allow uris that are just an anchor
+            if new_value.startswith("#"):
+                return value
+
+            # Handle protocols that urlparse doesn't recognize like "myprotocol"
+            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
+                return value
+
+            # If there's no protocol/scheme specified, then assume it's "http"
+            # and see if that's allowed
+            if "http" in allowed_protocols:
+                return value
+
+        return None
+
+    def allow_token(self, token):
+        """Handles the case where we're allowing the tag"""
+        if "data" in token:
+            # Loop through all the attributes and drop the ones that are not
+            # allowed, are unsafe or break other rules. Additionally, fix
+            # attribute values that need fixing.
+            #
+            # At the end of this loop, we have the final set of attributes
+            # we're keeping.
+            attrs = {}
+            for namespaced_name, val in token["data"].items():
+                namespace, name = namespaced_name
+
+                # Drop attributes that are not explicitly allowed
+                #
+                # NOTE(willkg): We pass in the attribute name--not a namespaced
+                # name.
+                if not self.attr_filter(token["name"], name, val):
+                    continue
+
+                # Drop attributes with uri values that use a disallowed protocol
+                # Sanitize attributes with uri values
+                if namespaced_name in self.attr_val_is_uri:
+                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
+                    if new_value is None:
+                        continue
+                    val = new_value
+
+                # Drop values in svg attrs with non-local IRIs
+                if namespaced_name in self.svg_attr_val_allows_ref:
+                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
+                    new_val = new_val.strip()
+                    if not new_val:
+                        continue
+
+                    else:
+                        # Replace the val with the unescaped version because
+                        # it's an IRI
+                        val = new_val
+
+                # Drop href and xlink:href attr for svg elements with non-local IRIs
+                if (None, token["name"]) in self.svg_allow_local_href:
+                    if namespaced_name in [
+                        (None, "href"),
+                        (html5lib_shim.namespaces["xlink"], "href"),
+                    ]:
+                        if re.search(r"^\s*[^#\s]", val):
+                            continue
+
+                # If it's a style attribute, sanitize it
+                if namespaced_name == (None, "style"):
+                    val = self.sanitize_css(val)
+
+                # At this point, we want to keep the attribute, so add it in
+                attrs[namespaced_name] = val
+
+            token["data"] = alphabetize_attributes(attrs)
+
+        return token
+
+    def disallowed_token(self, token):
+        token_type = token["type"]
+        if token_type == "EndTag":
+            token["data"] = "</%s>" % token["name"]
+
+        elif token["data"]:
+            assert token_type in ("StartTag", "EmptyTag")
+            attrs = []
+            for (ns, name), v in token["data"].items():
+                # If we end up with a namespace, but no name, switch them so we
+                # have a valid name to use.
+                if ns and not name:
+                    ns, name = name, ns
+
+                # Figure out namespaced name if the namespace is appropriate
+                # and exists; if the ns isn't in prefixes, then drop it.
+                if ns is None or ns not in html5lib_shim.prefixes:
+                    namespaced_name = name
+                else:
+                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)
+
+                attrs.append(
+                    ' %s="%s"'
+                    % (
+                        namespaced_name,
+                        # NOTE(willkg): HTMLSerializer escapes attribute values
+                        # already, so if we do it here (like HTMLSerializer does),
+                        # then we end up double-escaping.
+                        v,
+                    )
+                )
+            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))
+
+        else:
+            token["data"] = "<%s>" % token["name"]
+
+        if token.get("selfClosing"):
+            token["data"] = token["data"][:-1] + "/>"
+
+        token["type"] = "Characters"
+
+        del token["name"]
        return token

    def sanitize_css(self, style):
-        """HTMLSanitizerMixin.sanitize_css replacement.
+        """Sanitizes css in style attributes"""
+        # Convert entities in the style so that it can be parsed as CSS
+        style = html5lib_shim.convert_entities(style)

-        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
-        border-*, margin-*, and padding-*. We only whitelist what's in
-        the whitelist.
+        # Drop any url values before we do anything else
+        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

-        """
-        # disallow urls
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+        # The gauntlet of sanitization
+
+        # Validate the css in the style attribute and if it's not valid, then drop
+        # the whole thing.
+        parts = style.split(";")
+        gauntlet = re.compile(
+            r"""^(  # consider a style attribute value as composed of:
+[/:,#%!.\s\w]    # a non-newline character
+|\w-\w           # 3 characters in the form \w-\w
+|'[\s\w]+'\s*    # a single quoted string of [\s\w]+ with trailing space
+|"[\s\w]+"       # a double quoted string of [\s\w]+
+|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
+)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
+            flags=re.U | re.VERBOSE,
+        )

-        # gauntlet
-        # TODO: Make sure this does what it's meant to - I *think* it wants to
-        # validate style attribute contents.
-        parts = style.split(';')
-        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
-                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
-                return ''
+                return ""

-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ""

        clean = []
-        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
+
            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
+                clean.append(prop + ": " + value + ";")
+
            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
+                clean.append(prop + ": " + value + ";")

-        return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName,
-                               **kwargs)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
+        return " ".join(clean)
--- a/lib/bleach/utils.py
+++ b/lib/bleach/utils.py
@ -0,0 +1,21 @@
+from collections import OrderedDict
+
+
+def _attr_key(attr):
+    """Returns appropriate key for sorting attribute names
+
+    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
+    ``None`` or a string. These can't be compared in Python 3, so we convert the
+    ``None`` to an empty string.
+
+    """
+    key = (attr[0][0] or ""), attr[0][1]
+    return key
+
+
+def alphabetize_attributes(attrs):
+    """Takes a dict of attributes (or None) and returns them alphabetized"""
+    if not attrs:
+        return attrs
+
+    return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])