mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-07-06 13:11:15 -07:00)

Update bleach-4.1.0

commit a4130d6c56 (parent 4086529906)
51 changed files with 17071 additions and 568 deletions
lib/bleach/__init__.py
@@ -1,401 +1,131 @@
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-
-
-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b  # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
-        |\\[\001-011\013\014\016-\177])*"  # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
-    """,
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-
-NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
-
-ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
-# a simple routine that returns the tag name with the namespace prefix
-# as returned by etree's Element.tag attribute
-
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
-
-
-def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
-          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
-          strip_comments=True):
-    """Clean an HTML fragment and return it
-
-    :arg text: the text to clean
-    :arg tags: whitelist of allowed tags; defaults to
-        ``bleach.ALLOWED_TAGS``
-    :arg attributes: whitelist of allowed attributes; defaults to
-        ``bleach.ALLOWED_ATTRIBUTES``
-    :arg styles: whitelist of allowed css; defaults to
-        ``bleach.ALLOWED_STYLES``
-    :arg protocols: whitelist of allowed protocols for links; defaults
-        to ``bleach.ALLOWED_PROTOCOLS``
-    :arg strip: whether or not to strip disallowed elements
-    :arg strip_comments: whether or not to strip HTML comments
-
-    """
-    if not text:
-        return ''
-
-    text = force_unicode(text)
-
-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
-
-    parser = html5lib.HTMLParser(tokenizer=s)
-
-    return _render(parser.parseFragment(text))
-
-
-def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
-    """Convert URL-like strings in an HTML fragment to links.
-
-    linkify() converts strings that look like URLs or domain names in a
-    blob of text that may be an HTML fragment to links, while preserving
-    (a) links already in the string, (b) urls found in attributes, and
-    (c) email addresses.
-    """
-    text = force_unicode(text)
-
-    if not text:
-        return ''
-
-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
-
-    forest = parser.parseFragment(text)
-    _seen = set([])
-
-    def replace_nodes(tree, new_frag, node, index=0):
-        """
-        Doesn't really replace nodes, but inserts the nodes contained in
-        new_frag into the treee at position index and returns the number
-        of nodes inserted.
-        If node is passed in, it is removed from the tree
-        """
-        count = 0
-        new_tree = parser.parseFragment(new_frag)
-        # capture any non-tag text at the start of the fragment
-        if new_tree.text:
-            if index == 0:
-                tree.text = tree.text or ''
-                tree.text += new_tree.text
-            else:
-                tree[index - 1].tail = tree[index - 1].tail or ''
-                tree[index - 1].tail += new_tree.text
-        # the put in the tagged elements into the old tree
-        for n in new_tree:
-            if n.tag == ETREE_TAG('a'):
-                _seen.add(n)
-            tree.insert(index + count, n)
-            count += 1
-        # if we got a node to remove...
-        if node is not None:
-            tree.remove(node)
-        return count
-
-    def strip_wrapping_parentheses(fragment):
-        """Strips wrapping parentheses.
-
-        Returns a tuple of the following format::
-
-            (string stripped from wrapping parentheses,
-             count of stripped opening parentheses,
-             count of stripped closing parentheses)
-        """
-        opening_parentheses = closing_parentheses = 0
-        # Count consecutive opening parentheses
-        # at the beginning of the fragment (string).
-        for char in fragment:
-            if char == '(':
-                opening_parentheses += 1
-            else:
-                break
-
-        if opening_parentheses:
-            newer_frag = ''
-            # Cut the consecutive opening brackets from the fragment.
-            fragment = fragment[opening_parentheses:]
-            # Reverse the fragment for easier detection of parentheses
-            # inside the URL.
-            reverse_fragment = fragment[::-1]
-            skip = False
-            for char in reverse_fragment:
-                # Remove the closing parentheses if it has a matching
-                # opening parentheses (they are balanced).
-                if (char == ')' and
-                        closing_parentheses < opening_parentheses and
-                        not skip):
-                    closing_parentheses += 1
-                    continue
-                # Do not remove ')' from the URL itself.
-                elif char != ')':
-                    skip = True
-                newer_frag += char
-            fragment = newer_frag[::-1]
-
-        return fragment, opening_parentheses, closing_parentheses
-
-    def apply_callbacks(attrs, new):
-        for cb in callbacks:
-            attrs = cb(attrs, new)
-            if attrs is None:
-                return None
-        return attrs
-
-    def _render_inner(node):
-        out = ['' if node.text is None else node.text]
-        for subnode in node:
-            out.append(_render(subnode))
-            if subnode.tail:
-                out.append(subnode.tail)
-        return ''.join(out)
-
-    def linkify_nodes(tree, parse_text=True):
-        children = len(tree)
-        current_child = -1
-        # start at -1 to process the parent first
-        while current_child < len(tree):
-            if current_child < 0:
-                node = tree
-                if parse_text and node.text:
-                    new_txt = old_txt = node.text
-                    if parse_email:
-                        new_txt = re.sub(email_re, email_repl, node.text)
-                        if new_txt and new_txt != node.text:
-                            node.text = ''
-                            adj = replace_nodes(tree, new_txt, None, 0)
-                            children += adj
-                            current_child += adj
-                            linkify_nodes(tree, True)
-                            continue
-
-                    new_txt = re.sub(url_re, link_repl, new_txt)
-                    if new_txt != old_txt:
-                        node.text = ''
-                        adj = replace_nodes(tree, new_txt, None, 0)
-                        children += adj
-                        current_child += adj
-                        continue
-            else:
-                node = tree[current_child]
-
-            if parse_text and node.tail:
-                new_tail = old_tail = node.tail
-                if parse_email:
-                    new_tail = re.sub(email_re, email_repl, new_tail)
-                    if new_tail != node.tail:
-                        node.tail = ''
-                        adj = replace_nodes(tree, new_tail, None,
-                                            current_child + 1)
-                        # Insert the new nodes made from my tail into
-                        # the tree right after me. current_child+1
-                        children += adj
-                        continue
-
-                new_tail = re.sub(url_re, link_repl, new_tail)
-                if new_tail != old_tail:
-                    node.tail = ''
-                    adj = replace_nodes(tree, new_tail, None,
-                                        current_child + 1)
-                    children += adj
-
-            if node.tag == ETREE_TAG('a') and not (node in _seen):
-                if not node.get('href', None) is None:
-                    attrs = dict(node.items())
-
-                    _text = attrs['_text'] = _render_inner(node)
-
-                    attrs = apply_callbacks(attrs, False)
-
-                    if attrs is None:
-                        # <a> tag replaced by the text within it
-                        adj = replace_nodes(tree, _text, node,
-                                            current_child)
-                        current_child -= 1
-                        # pull back current_child by 1 to scan the
-                        # new nodes again.
-                    else:
-                        text = force_unicode(attrs.pop('_text'))
-                        for attr_key, attr_val in attrs.items():
-                            node.set(attr_key, attr_val)
-
-                        for n in reversed(list(node)):
-                            node.remove(n)
-                        text = parser.parseFragment(text)
-                        node.text = text.text
-                        for n in text:
-                            node.append(n)
-                        _seen.add(node)
-
-            elif current_child >= 0:
-                if node.tag == ETREE_TAG('pre') and skip_pre:
-                    linkify_nodes(node, False)
-                elif not (node in _seen):
-                    linkify_nodes(node, True)
-
-            current_child += 1
-
-    def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
-        link = {
-            '_text': addr,
-            'href': 'mailto:{0!s}'.format(addr),
-        }
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return addr
-
-        _href = link.pop('href')
-        _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-        return repl.format(_href, attribs, _text)
-
-    def link_repl(match):
-        url = match.group(0)
-        open_brackets = close_brackets = 0
-        if url.startswith('('):
-            _wrapping = strip_wrapping_parentheses(url)
-            url, open_brackets, close_brackets = _wrapping
-        end = ''
-        m = re.search(punct_re, url)
-        if m:
-            end = m.group(0)
-            url = url[0:m.start()]
-        if re.search(proto_re, url):
-            href = url
-        else:
-            href = ''.join(['http://', url])
-
-        link = {
-            '_text': url,
-            'href': href,
-        }
-
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return '(' * open_brackets + url + ')' * close_brackets
-
-        _text = link.pop('_text')
-        _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
-        return repl.format('(' * open_brackets,
-                           _href, attribs, _text, end,
-                           ')' * close_brackets)
-
-    try:
-        linkify_nodes(forest)
-    except RuntimeError as e:
-        # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: {0!r}'.format(e))
-
-    return _render(forest)
-
-
-def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('etree')
-    stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
-                                alphabetical_attributes=True,
-                                omit_optional_tags=False)
-    return serializer.render(stream)
+import packaging.version
+
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    Cleaner,
+)
+
+
+# yyyymmdd
+__releasedate__ = "20210825"
+
+# x.y.z or x.y.z.dev0 -- semver
+__version__ = "4.1.0"
+VERSION = packaging.version.Version(__version__)
+
+
+__all__ = ["clean", "linkify"]
+
+
+def clean(
+    text,
+    tags=ALLOWED_TAGS,
+    attributes=ALLOWED_ATTRIBUTES,
+    styles=ALLOWED_STYLES,
+    protocols=ALLOWED_PROTOCOLS,
+    strip=False,
+    strip_comments=True,
+):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to be used to transform content for use in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode
+
+    """
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode
+
+    """
+    linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email)
+    return linker.linkify(text)
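Taken together, this hunk strips out bleach's hand-rolled html5lib tokenizer glue: ``clean()`` is now a thin wrapper over ``bleach.sanitizer.Cleaner`` and ``linkify()`` over ``bleach.linkifier.Linker``. A minimal usage sketch of the 4.1.0 API, based only on the signatures and docstrings above (the sample strings are illustrative)::

    import bleach
    from bleach.linkifier import Linker
    from bleach.sanitizer import Cleaner

    # One-off calls go through the module-level wrappers; disallowed tags
    # are escaped by default and removed when strip=True is passed.
    print(bleach.clean('<script>evil()</script><b>bold</b>'))
    print(bleach.linkify('the docs live at https://example.com'))

    # For repeated use with the same settings, the new docstrings recommend
    # building the underlying objects once instead of calling the wrappers.
    cleaner = Cleaner(tags=['b', 'i'], strip=True)
    linker = Linker(skip_tags=['pre'])
    print(cleaner.clean('<b>kept</b> <u>stripped</u>'))
    print(linker.linkify('see example.com for details'))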
lib/bleach/_vendor/README.rst (new file, 61 lines)
@@ -0,0 +1,61 @@
=======================
Vendored library policy
=======================

To simplify Bleach development, we're now vendoring certain libraries that
we use.

Vendored libraries must follow these rules:

1. Vendored libraries must be pure Python--no compiling.
2. Source code for the library is included in this directory.
3. License must be included in this repo and in the Bleach distribution.
4. Requirements of the library become requirements of Bleach.
5. No modifications to the library may be made.


Adding/Updating a vendored library
==================================

Way to vendor a library or update a version:

1. Update ``vendor.txt`` with the library, version, and hash. You can use
   `hashin <https://pypi.org/project/hashin/>`_.
2. Remove all old files and directories of the old version.
3. Run ``pip_install_vendor.sh`` and check in everything it produced,
   including the ``.dist-info`` directory and contents.
4. Update the bleach minor version in the next release.


Reviewing a change involving a vendored library
===============================================

Way to verify a vendored library addition/update:

1. Pull down the branch.
2. Delete all the old files and directories of the old version.
3. Run ``pip_install_vendor.sh``.
4. Run ``git diff`` and verify there are no changes.

NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be
necessary to reproduce the dist-info.


Removing/Unvendoring a vendored library
=======================================

A vendored library might be removed for any of the following reasons:

* it violates the vendoring policy (e.g. an incompatible license change)
* a suitable replacement is found
* bleach has the resources to test and QA new bleach releases against
  multiple versions of the previously vendored library

To unvendor a library:

1. Remove the library and its hashes from ``vendor.txt``.
2. Remove library files and directories from this directory.
3. Run ``install_vendor.sh`` and check that the previously vendored library,
   including the ``.dist-info`` directory and contents, is not installed.
4. Update the bleach minor version in the next release.
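A note on how the vendored tree is reached at runtime (an inference from the package layout this commit adds, not something the README states): code in bleach imports the pinned html5lib through the ``bleach._vendor`` package rather than a top-level ``import html5lib``, so the vendored 1.1 copy is used even when a different html5lib is installed system-wide. A minimal sketch::

    # Sketch only: import the vendored parser through bleach._vendor,
    # using names the vendored html5lib/__init__.py re-exports.
    from bleach._vendor.html5lib import HTMLParser, serialize

    parser = HTMLParser()
    fragment = parser.parseFragment("<p>parsed by the vendored html5lib")
    print(serialize(fragment, tree="etree"))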
lib/bleach/_vendor/__init__.py (new empty file)
lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst (new file, 66 lines)
@@ -0,0 +1,66 @@
Credits
=======

``html5lib`` is written and maintained by:

- James Graham
- Sam Sneddon
- Łukasz Langa
- Will Kahn-Greene


Patches and suggestions
-----------------------
(In chronological order, by first commit:)

- Anne van Kesteren
- Lachlan Hunt
- lantis63
- Sam Ruby
- Thomas Broyer
- Tim Fletcher
- Mark Pilgrim
- Ryan King
- Philip Taylor
- Edward Z. Yang
- fantasai
- Philip Jägenstedt
- Ms2ger
- Mohammad Taha Jahangir
- Andy Wingo
- Andreas Madsack
- Karim Valiev
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Simon Sapin
- Michael[tm] Smith
- Ritwik Gupta
- Marc Abramowitz
- Tony Lopes
- lilbludevil
- Kevin
- Drew Hubl
- Austin Kumbera
- Jim Baker
- Jon Dufresne
- Donald Stufft
- Alex Gaynor
- Nik Nyby
- Jakub Wilk
- Sigmund Cherem
- Gabi Davar
- Florian Mounier
- neumond
- Vitalik Verhovodov
- Kovid Goyal
- Adam Chainz
- John Vandenberg
- Eric Amorde
- Benedikt Morbach
- Jonathan Vanasco
- Tom Most
- Ville Skyttä
- Hugo van Kemenade
- Mark Vasilkov
lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER (new file, 1 line)
@@ -0,0 +1 @@
pip
lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA (new file, 552 lines)
@@ -0,0 +1,552 @@
Metadata-Version: 2.1
Name: html5lib
Version: 1.1
Summary: HTML parser based on the WHATWG HTML specification
Home-page: https://github.com/html5lib/html5lib-python
Maintainer: James Graham
Maintainer-email: james@hoppipolla.co.uk
License: MIT License
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
Requires-Dist: six (>=1.9)
Requires-Dist: webencodings
Provides-Extra: all
Requires-Dist: genshi ; extra == 'all'
Requires-Dist: chardet (>=2.2) ; extra == 'all'
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all'
Provides-Extra: chardet
Requires-Dist: chardet (>=2.2) ; extra == 'chardet'
Provides-Extra: genshi
Requires-Dist: genshi ; extra == 'genshi'
Provides-Extra: lxml
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml'

html5lib
========

.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master
    :target: https://travis-ci.org/html5lib/html5lib-python

html5lib is a pure-python library for parsing HTML. It is designed to
conform to the WHATWG HTML specification, as is implemented by all major
web browsers.


Usage
-----

Simple usage follows this pattern:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      document = html5lib.parse(f)

or:

.. code-block:: python

  import html5lib
  document = html5lib.parse("<p>Hello World!")

By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).

Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
a treebuilder:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

When using with ``urllib2`` (Python 2), the charset from HTTP should be
passed into html5lib as follows:

.. code-block:: python

  from contextlib import closing
  from urllib2 import urlopen
  import html5lib

  with closing(urlopen("http://example.com/")) as f:
      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

When using with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:

.. code-block:: python

  from urllib.request import urlopen
  import html5lib

  with urlopen("http://example.com/") as f:
      document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())

To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      parser = html5lib.HTMLParser(strict=True)
      document = parser.parse(f)

When you're instantiating parser objects explicitly, pass a treebuilder
class as the ``tree`` keyword argument to use an alternative document
format:

.. code-block:: python

  import html5lib
  parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
  minidom_document = parser.parse("<p>Hello World!")

More documentation is available at https://html5lib.readthedocs.io/.


Installation
------------

html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:

.. code-block:: bash

  $ pip install html5lib

The goal is to support a (non-strict) superset of the versions that `pip
supports
<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.


Optional Dependencies
---------------------

The following third-party libraries may be used for additional
functionality:

- ``lxml`` is supported as a tree format (for both building and
  walking) under CPython (but *not* PyPy where it is known to cause
  segfaults);

- ``genshi`` has a treewalker (but not builder); and

- ``chardet`` can be used as a fallback when character encoding cannot
  be determined.


Bugs
----

Please report any bugs on the `issue tracker
<https://github.com/html5lib/html5lib-python/issues>`_.


Tests
-----

Unit tests require the ``pytest`` and ``mock`` libraries and can be
run using the ``py.test`` command in the root directory.

Test data are contained in a separate `html5lib-tests
<https://github.com/html5lib/html5lib-tests>`_ repository and included
as a submodule, thus for git checkouts they must be initialized::

  $ git submodule init
  $ git submodule update

If you have all compatible Python implementations available on your
system, you can run tests on all of them using the ``tox`` utility,
which can be found on PyPI.


Questions?
----------

There's a mailing list available for support on Google Groups,
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
though you may get a quicker response asking on IRC in `#whatwg on
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.


Change Log
----------

1.1
~~~

UNRELEASED

Breaking changes:

* Drop support for Python 3.3. (#358)
* Drop support for Python 3.4. (#421)

Deprecations:

* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
  ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
  <https://github.com/mozilla/bleach>`_. Please let us know if Bleach doesn't
  suffice for your use. (#443)

Other changes:

* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
  ``html5lib`` keeps working in future Python versions. (#403)
* Drop optional ``datrie`` dependency. (#442)


1.0.1
~~~~~

Released on December 7, 2017

Breaking changes:

* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)

Features:

* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
  Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage. (#323, #334) (Thank you, Jon
  Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.

Bug fixes:

* Add support for setuptools < 18.5 to support environment markers. (Thank you,
  John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
  you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
  Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
  Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
  Komal Dembla, Hugo!)


1.0
~~~

Released and unreleased on December 7, 2017. Badly packaged release.


0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~

Released on July 15, 2016

* Fix attribute order going to the tree builder to be document order
  instead of reverse document order(!).


0.99999999/1.0b9
~~~~~~~~~~~~~~~~

Released on July 14, 2016

* **Added ordereddict as a mandatory dependency on Python 2.6.**

* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
  extras that will do the right thing based on the specific
  interpreter implementation.

* Now requires the ``mock`` package for the testsuite.

* Cease supporting DATrie under PyPy.

* **Remove PullDOM support, as this hasn't ever been properly
  tested, doesn't entirely work, and as far as I can tell is
  completely unused by anyone.**

* Move testsuite to ``py.test``.

* **Fix #124: move to webencodings for decoding the input byte stream;
  this makes html5lib compliant with the Encoding Standard, and
  introduces a required dependency on webencodings.**

* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**

* **Fix comments containing double-dash with lxml 3.5 and above.**

* **Use scripting disabled by default (as we don't implement
  scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by serializer
  allowing attribute values to be escaped out of in old browser versions,
  changing the quote_attr_values option on serializer to take one of
  three values, "always" (the old True value), "legacy" (the new option,
  and the new default), and "spec" (the old False value, and the old
  default).**

* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
  (instead of the tokenizer); as such, this will require amending all
  callers of it to use it via the treewalker API.**

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
  with a set of keyword arguments: override_encoding, transport_encoding,
  same_origin_parent_encoding, likely_encoding, and default_encoding.**

* **Move filters._base, treebuilder._base, and treewalkers._base to .base
  to clarify their status as public.**

* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
  sanitizer.htmlsanitizer module and move that to sanitizer. This means
  anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
  code changes.**

* **Rename treewalkers.lxmletree to .etree_lxml and
  treewalkers.genshistream to .genshi to have a consistent API.**

* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
  utils) to be underscore prefixed to clarify their status as private.


0.9999999/1.0b8
~~~~~~~~~~~~~~~

Released on September 10, 2015

* Fix #195: fix the sanitizer to drop broken URLs (it threw an
  exception between 0.9999 and 0.999999).


0.999999/1.0b7
~~~~~~~~~~~~~~

Released on July 7, 2015

* Fix #189: fix the sanitizer to allow relative URLs again (as it did
  prior to 0.9999/1.0b5).


0.99999/1.0b6
~~~~~~~~~~~~~

Released on April 30, 2015

* Fix #188: fix the sanitizer to not throw an exception when sanitizing
  bogus data URLs.


0.9999/1.0b5
~~~~~~~~~~~~

Released on April 29, 2015

* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
  this sounds, this has no known security implications. No known version
  of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
  Chrome (1 to current), or Opera (12 to current) will run any script
  provided in these attributes.

* Pass error message to the ParseError exception in strict parsing mode.

* Allow data URIs in the sanitizer, with a whitelist of content-types.

* Add support for Python implementations that don't support lone
  surrogates (read: Jython). Fixes #2.

* Remove localization of error messages. This functionality was totally
  unused (and untested that everything was localizable), so we may as
  well follow numerous browsers in not supporting translating technical
  strings.

* Expose treewalkers.pprint as a public API.

* Add a documentEncoding property to HTML5Parser, fix #121.


0.999
~~~~~

Released on December 23, 2013

* Fix #127: add work-around for CPython issue #20007: .read(0) on
  http.client.HTTPResponse drops the rest of the content.

* Fix #115: lxml treewalker can now deal with fragments containing, at
  their root level, text nodes with non-ASCII characters on Python 2.


0.99
~~~~

Released on September 10, 2013

* No library changes from 1.0b3; released as 0.99 as pip has changed
  behaviour from 1.4 to avoid installing pre-release versions per
  PEP 440.


1.0b3
~~~~~

Released on July 24, 2013

* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
  implementation using it should be moved to
  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
  for years.

* Fix #67 so that ``BufferedStream`` correctly returns a bytes
  object, thereby fixing any case where html5lib is passed a
  non-seekable RawIOBase-like object.


1.0b2
~~~~~

Released on June 27, 2013

* Removed reordering of attributes within the serializer. There is now
  an ``alphabetical_attributes`` option which preserves the previous
  behaviour through a new filter. This allows attribute order to be
  preserved through html5lib if the tree builder preserves order.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
  ``treeadapters.sax.to_sax`` which is generic and supports any
  treewalker; it also resolves all known bugs with ``dom2sax``.

* Fix treewalker assertions on hitting bytes strings on
  Python 2. Previous to 1.0b1, treewalkers coped with mixed
  bytes/unicode data on Python 2; this reintroduces this prior
  behaviour on Python 2. Behaviour is unchanged on Python 3.


1.0b1
~~~~~

Released on May 17, 2013

* Implementation updated to implement the `HTML specification
  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).

* Python 3.2+ supported in a single codebase using the ``six`` library.

* Removed support for Python 2.5 and older.

* Removed the deprecated Beautiful Soup 3 treebuilder.
  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
  since it doesn't support namespaces, foreign content like SVG and
  MathML is parsed incorrectly.

* Removed ``simpletree`` from the package. The default tree builder is
  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
  available, and ``xml.etree.ElementTree`` otherwise).

* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
  output was well-formed XML, and hence provided little of use.

* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
  return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Optional heuristic character encoding detection now based on
  ``charade`` for Python 2.6 - 3.3 compatibility.

* Optional ``Genshi`` treewalker support fixed.

* Many bugfixes, including:

  * #33: null in attribute value breaks XML AttValue;

  * #4: nested, indirect descendant, <button> causes infinite loop;

  * `Google Code 215
    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
    detect seekable streams;

  * `Google Code 206
    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
    support for <video preload=...>, <audio preload=...>;

  * `Google Code 205
    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
    support for <video poster=...>;

  * `Google Code 202
    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
    file breaks InputStream.

* Source code is now mostly PEP 8 compliant.

* Test harness has been improved and now depends on ``nose``.

* Documentation updated and moved to https://html5lib.readthedocs.io/.


0.95
~~~~

Released on February 11, 2012


0.90
~~~~

Released on January 17, 2010


0.11.1
~~~~~~

Released on June 12, 2008


0.11
~~~~

Released on June 10, 2008


0.10
~~~~

Released on October 7, 2007


0.9
~~~

Released on March 11, 2007


0.2
~~~

Released on January 8, 2007
lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD (new file, 41 lines)
@@ -0,0 +1,41 @@
html5lib-1.1.dist-info/AUTHORS.rst,sha256=DrNAMifoDpuQyJn-KW-H6K8Tt2a5rKnV2UF4-DRrGUI,983
html5lib-1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
html5lib-1.1.dist-info/LICENSE,sha256=FqOZkWGekvGGgJMtoqkZn999ld8-yu3FLqBiGKq6_W8,1084
html5lib-1.1.dist-info/METADATA,sha256=Y3w-nd_22HQnQRy3yypVsV_ke2FF94uUD4-vGpc2DnI,16076
html5lib-1.1.dist-info/RECORD,,
html5lib-1.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib-1.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
html5lib-1.1.dist-info/top_level.txt,sha256=XEX6CHpskSmvjJB4tP6m4Q5NYXhIf_0ceMc0PNbzJPQ,9
html5lib/__init__.py,sha256=pWnYcfZ69wNLrdQL7bpr49FUi8O8w0KhKCOHsyRgYGQ,1143
html5lib/_ihatexml.py,sha256=ifOwF7pXqmyThIXc3boWc96s4MDezqRrRVp7FwDYUFs,16728
html5lib/_inputstream.py,sha256=IKuMiY8rzb7pqIGCpbvTqsxysLEpgEHWYvYEFu4LUAI,32300
html5lib/_tokenizer.py,sha256=WvJQa2Mli4NtTmhLXkX8Jy5FcWttqCaiDTiKyaw8D-k,77028
html5lib/_trie/__init__.py,sha256=nqfgO910329BEVJ5T4psVwQtjd2iJyEXQ2-X8c1YxwU,109
html5lib/_trie/_base.py,sha256=CaybYyMro8uERQYjby2tTeSUatnWDfWroUN9N7ety5w,1013
html5lib/_trie/py.py,sha256=zg7RZSHxJ8mLmuI_7VEIV8AomISrgkvqCP477AgXaG0,1763
html5lib/_utils.py,sha256=AxAJSG15eyarCgKMnlUwzs1X6jFHXqEvhlYEOxAFmis,4919
html5lib/constants.py,sha256=Ll-yzLU_jcjyAI_h57zkqZ7aQWE5t5xA4y_jQgoUUhw,83464
html5lib/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib/filters/alphabeticalattributes.py,sha256=lViZc2JMCclXi_5gduvmdzrRxtO5Xo9ONnbHBVCsykU,919
html5lib/filters/base.py,sha256=z-IU9ZAYjpsVsqmVt7kuWC63jR11hDMr6CVrvuao8W0,286
html5lib/filters/inject_meta_charset.py,sha256=egDXUEHXmAG9504xz0K6ALDgYkvUrC2q15YUVeNlVQg,2945
html5lib/filters/lint.py,sha256=upXATs6By7cot7o0bnNqR15sPq2Fn6Vnjvoy3gyO_rY,3631
html5lib/filters/optionaltags.py,sha256=8lWT75J0aBOHmPgfmqTHSfPpPMp01T84NKu0CRedxcE,10588
html5lib/filters/sanitizer.py,sha256=XGNSdzIqDTaHot1V-rRj1V_XOolApJ7n95tHP9JcgNU,26885
html5lib/filters/whitespace.py,sha256=8eWqZxd4UC4zlFGW6iyY6f-2uuT8pOCSALc3IZt7_t4,1214
html5lib/html5parser.py,sha256=w5hZJh0cvD3g4CS196DiTmuGpSKCMYe1GS46-yf_WZQ,117174
html5lib/serializer.py,sha256=K2kfoLyMPMFPfdusfR30SrxNkf0mJB92-P5_RntyaaI,15747
html5lib/treeadapters/__init__.py,sha256=18hyI-at2aBsdKzpwRwa5lGF1ipgctaTYXoU9En2ZQg,650
html5lib/treeadapters/genshi.py,sha256=CH27pAsDKmu4ZGkAUrwty7u0KauGLCZRLPMzaO3M5vo,1715
html5lib/treeadapters/sax.py,sha256=BKS8woQTnKiqeffHsxChUqL4q2ZR_wb5fc9MJ3zQC8s,1776
html5lib/treebuilders/__init__.py,sha256=AysSJyvPfikCMMsTVvaxwkgDieELD5dfR8FJIAuq7hY,3592
html5lib/treebuilders/base.py,sha256=oeZNGEB-kt90YJGVH05gb5a8E7ids2AbYwGRsVCieWk,14553
html5lib/treebuilders/dom.py,sha256=22whb0C71zXIsai5mamg6qzBEiigcBIvaDy4Asw3at0,8925
html5lib/treebuilders/etree.py,sha256=EbmHx-wQ-11MVucTPtF7Ul92-mQGN3Udu_KfDn-Ifhk,12824
html5lib/treebuilders/etree_lxml.py,sha256=OazDHZGO_q4FnVs4Dhs4hzzn2JwGAOs-rfV8LAlUGW4,14754
html5lib/treewalkers/__init__.py,sha256=OBPtc1TU5mGyy18QDMxKEyYEz0wxFUUNj5v0-XgmYhY,5719
html5lib/treewalkers/base.py,sha256=ouiOsuSzvI0KgzdWP8PlxIaSNs9falhbiinAEc_UIJY,7476
html5lib/treewalkers/dom.py,sha256=EHyFR8D8lYNnyDU9lx_IKigVJRyecUGua0mOi7HBukc,1413
html5lib/treewalkers/etree.py,sha256=gkD4tfEfRWPsEGvgHHJxZmKZXUvBzVVGz3v5C_MIiOE,4539
html5lib/treewalkers/etree_lxml.py,sha256=eLedbn6nPjlpebibsWVijey7WEpzDwxU3ubwUoudBuA,6345
html5lib/treewalkers/genshi.py,sha256=4D2PECZ5n3ZN3qu3jMl9yY7B81jnQApBQSVlfaIuYbA,2309
lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL (new file, 6 lines)
@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.34.2)
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any
lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt (new file, 1 line)
@@ -0,0 +1 @@
html5lib
lib/bleach/_vendor/html5lib/__init__.py (new file, 35 lines)
@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage::

    import html5lib
    with open("my_document.html", "rb") as f:
        tree = html5lib.parse(f)

For convenience, this module re-exports the following names:

* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""

from __future__ import absolute_import, division, unicode_literals

from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]

# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
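As a quick illustration of the re-exported names above, a fragment can be parsed and serialized back in two calls (a sketch; the exact output depends on serializer options)::

    import html5lib

    # Parse into the default xml.etree-based tree, then serialize it back.
    fragment = html5lib.parseFragment("<p>Hello <b>world</b>")
    print(html5lib.serialize(fragment, tree="etree", omit_optional_tags=False))
    # Expected output resembles: <p>Hello <b>world</b></p>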
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
|
@ -0,0 +1,289 @@
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from .constants import DataLossWarning
|
||||||
|
|
||||||
|
baseChar = """
|
||||||
|
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
||||||
|
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
||||||
|
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
||||||
|
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
||||||
|
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
||||||
|
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
||||||
|
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
||||||
|
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
||||||
|
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
||||||
|
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""

ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""

combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""

digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""

extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""


letter = " | ".join([baseChar, ideographic])

# All name characters: letters and digits plus ".", "-", "_", combining
# characters and extenders; nameFirst below applies the stricter
# first-character rule (letters and "_" only).
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
nameFirst = " | ".join([letter, "_"])

reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")


def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
        if not foundMatch:
            assert len(item) == 1
            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv


def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
        assert item[1] >= item[0]
    rv = []
    i = 0
    while i < len(charList):
        j = 1
        rv.append(charList[i])
        while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
            rv[-1][1] = charList[i + j][1]
            j += 1
        i += j
    return rv


# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    rv = []
    # Guard the gap before the first range; charList items are [start, end]
    # pairs, so compare the first range's start (the original compared the
    # whole pair against 0, which is always unequal).
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv


def listToRegexpStr(charList):
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(escapeRegexp(chr(item[0])))
        else:
            rv.append(escapeRegexp(chr(item[0])) + "-" +
                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)


def hexToInt(hex_str):
    return int(hex_str, 16)


def escapeRegexp(string):
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)

    return string
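
The helpers above turn these W3C-style character-class strings into inverted regular expressions: charStringToList parses the "#xNNNN" notation into merged [start, end] codepoint ranges, missingRanges inverts them over the BMP, and listToRegexpStr renders a character class. A minimal sketch of that pipeline, using a hypothetical one-line class string rather than the real multi-line ones:

# Illustrative only: `sample` is a made-up class in the same "#xNNNN" notation.
sample = "[#x0041-#x005A] | [#x0061-#x007A] | #x005F"
ranges = charStringToList(sample)       # [[0x41, 0x5A], [0x5F, 0x5F], [0x61, 0x7A]]
gaps = missingRanges(ranges)            # every BMP codepoint outside the class
non_word = re.compile(listToRegexpStr(gaps))
assert non_word.match("A") is None      # inside the class
assert non_word.match("!") is not None  # outside the class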

# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa

# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        self.preventSingleQuotePubid = preventSingleQuotePubid

        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name):
        return self.toXmlName(name)

    def coerceComment(self, data):
        if self.preventDoubleDashComments:
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
            if data.endswith("-"):
                warnings.warn("Comments cannot end in a dash", DataLossWarning)
                data += " "
        return data

    def coerceCharacters(self, data):
        if self.replaceFormFeedCharacters:
            for _ in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data

    def coercePubid(self, data):
        dataOutput = data
        for char in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            dataOutput = dataOutput.replace(char, replacement)
        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
        return dataOutput

    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst

        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput

    def getReplacementCharacter(self, char):
        if char in self.replaceCache:
            replacement = self.replaceCache[char]
        else:
            replacement = self.escapeChar(char)
        return replacement

    def fromXmlName(self, name):
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        return chr(int(charcode[1:], 16))
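
A quick sketch of the coercion round-trip InfosetFilter implements, assuming the surrounding module's names (warnings, DataLossWarning, the regexps above) are in scope; "1" is not a legal first character of an XML name, so it gets escaped with the U%05X scheme and recovered with fromXmlName:

import warnings

f = InfosetFilter()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")        # suppress the DataLossWarning
    coerced = f.coerceElement("1bad")
assert coerced == "U00031bad"              # U+0031 ("1") escaped as "U00031"
assert f.fromXmlName(coerced) == "1bad"    # the escaping is reversible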
918
lib/bleach/_vendor/html5lib/_inputstream.py
Normal file
@ -0,0 +1,918 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type
from six.moves import http_client, urllib

import codecs
import re
from io import BytesIO, StringIO

import webencodings

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                              0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                              0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                              0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                              0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                              0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                              0x10FFFE, 0x10FFFF}

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)

    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)


def HTMLInputStream(source, **kwargs):
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source, http_client.HTTPResponse) or
        # Also check for addinfourl wrapping HTTPResponse
        (isinstance(source, urllib.response.addbase) and
         isinstance(source.fp, http_client.HTTPResponse))):
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        encodings = [x for x in kwargs if x.endswith("_encoding")]
        if encodings:
            raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

        return HTMLUnicodeInputStream(source, **kwargs)
    else:
        return HTMLBinaryInputStream(source, **kwargs)
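
HTMLInputStream itself is only a dispatcher over the two classes defined below: decoded text bypasses encoding detection entirely, while bytes go through it. A minimal sketch (runnable once both classes are defined):

# Text input is already decoded; encoding keyword arguments are rejected.
assert isinstance(HTMLInputStream("<p>hi</p>"), HTMLUnicodeInputStream)

# Byte input runs the detection machinery; with no BOM, <meta> or override,
# the result is only a tentative guess.
stream = HTMLInputStream(b"<p>hi</p>")
assert isinstance(stream, HTMLBinaryInputStream)
assert stream.charEncoding[1] == "tentative"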


class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """

        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
                # If the whole remainder of the chunk matched,
                # use it all and read the next chunk
                rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not EOF:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
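
A minimal sketch of the chunked character API above, fed from a plain string; note the newline normalisation and the 1-based line tracking:

stream = HTMLUnicodeInputStream("<p>\r\nhi</p>")
assert stream.char() == "<"
assert stream.charsUntil(">") == "p"   # stop just before the next ">"
assert stream.char() == ">"
assert stream.char() == "\n"           # "\r\n" was normalised to "\n"
assert stream.position() == (2, 0)     # (line, column) tracking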
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when detecting encoding with chardet
        self.numBytesChardet = 100
        # Things from args
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Determine encoding
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            stream = BufferedStream(stream)

        return stream

    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
            return lookupEncoding(encoding)
        else:
            self.rawStream.seek(0)
            return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")

        return encoding
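
determineEncoding tries its sources in a fixed precedence: BOM, override, transport, <meta> pre-scan, same-origin parent, likely guess, chardet, default. A small sketch of the two ends of the confidence scale:

import codecs

# A UTF-8 BOM wins outright and is marked "certain".
bom_stream = HTMLBinaryInputStream(codecs.BOM_UTF8 + b"<p>hi</p>")
assert bom_stream.charEncoding == (lookupEncoding("utf-8"), "certain")

# Without a BOM, a <meta charset> declaration is only "tentative".
meta_stream = HTMLBinaryInputStream(b'<meta charset="utf-8"><p>hi</p>')
assert meta_stream.charEncoding == (lookupEncoding("utf-8"), "tentative")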
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        rv = self.startswith(bytes, self.position)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        try:
            self._position = self.index(bytes, self.position) + len(bytes) - 1
        except ValueError:
            raise StopIteration
        return True
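
EncodingBytes is the lowercasing byte cursor the pre-parser walks; a minimal sketch of its cursor operations:

eb = EncodingBytes(b"<META Charset=UTF-8>")  # note: lowercased on creation
assert bytes(eb) == b"<meta charset=utf-8>"
eb.jumpTo(b"<")                              # position the cursor on "<"
assert eb.matchBytes(b"<meta")               # consume "<meta"...
assert eb.skip() == b"c"                     # ...and skip spaces to "charset"
eb.jumpTo(b"=")
assert eb.currentByte == b"="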
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        if b"<meta" not in self.data:
            return None

        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            try:
                self.data.jumpTo(b"<")
            except StopIteration:
                break
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # <meta is not followed by a space character, so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
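
Putting the pre-parser together: detectEncodingMeta feeds the first numBytesMeta bytes to EncodingParser. A minimal sketch on a raw byte prefix; note the WHATWG label "iso-8859-1" canonicalises to the windows-1252 codec:

prefix = (b'<!-- comment --><meta http-equiv="Content-Type" '
          b'content="text/html; charset=ISO-8859-1">')
parser = EncodingParser(prefix)
encoding = parser.getEncoding()
assert encoding.name == "windows-1252"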
class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is not None:
        try:
            return webencodings.lookup(encoding)
        except AttributeError:
            return None
    else:
        return None
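
lookupEncoding resolves WHATWG encoding labels via webencodings, so spelling variants collapse onto one canonical codec; a small sketch:

assert lookupEncoding("UTF8").name == "utf-8"
assert lookupEncoding(b"latin1").name == "windows-1252"  # WHATWG alias
assert lookupEncoding("no-such-encoding") is None        # unknown label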
1735
lib/bleach/_vendor/html5lib/_tokenizer.py
Normal file
File diff suppressed because it is too large
5
lib/bleach/_vendor/html5lib/_trie/__init__.py
Normal file
@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals

from .py import Trie

__all__ = ["Trie"]
40
lib/bleach/_vendor/html5lib/_trie/_base.py
Normal file
@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals

try:
    from collections.abc import Mapping
except ImportError:  # Python 2.7
    from collections import Mapping


class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
        # pylint:disable=arguments-differ
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        return {x for x in keys if x.startswith(prefix)}

    def has_keys_with_prefix(self, prefix):
        for key in self.keys():
            if key.startswith(prefix):
                return True

        return False

    def longest_prefix(self, prefix):
        if prefix in self:
            return prefix

        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
67
lib/bleach/_vendor/html5lib/_trie/py.py
Normal file
@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from bisect import bisect_left

from ._base import Trie as ABCTrie


class Trie(ABCTrie):
    def __init__(self, data):
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        if start == len(self._keys):
            return keys

        while self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        return self._keys[i].startswith(prefix)
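
The tokenizer uses this Trie for named character reference lookup, where the longest matching entity name wins; a minimal sketch with a hypothetical three-entry trie:

t = Trie({u"amp": u"&", u"ampersand": u"&", u"lt": u"<"})
assert t.has_keys_with_prefix(u"am")
assert t.keys(u"amp") == {u"amp", u"ampersand"}
assert t.longest_prefix(u"ampers") == u"amp"   # longest entity match wins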
159
lib/bleach/_vendor/html5lib/_utils.py
Normal file
@ -0,0 +1,159 @@
from __future__ import absolute_import, division, unicode_literals

from types import ModuleType

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

from six import text_type, PY3

if PY3:
    import xml.etree.ElementTree as default_etree
else:
    try:
        import xml.etree.cElementTree as default_etree
    except ImportError:
        import xml.etree.ElementTree as default_etree


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
           "surrogatePairToCodepoint", "moduleFactoryFactory",
           "supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
    _x = eval('"\\uD800"')  # pylint:disable=eval-used
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
        assert isinstance(_x, text_type)
except Exception:
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True


class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        _dictEntries = []
        for name, value in items:
            if isinstance(name, (list, tuple, frozenset, set)):
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        assert len(self) == len(_dictEntries)
        self.default = None

    def __getitem__(self, key):
        return dict.get(self, key, self.default)

    def __get__(self, instance, owner=None):
        return BoundMethodDispatcher(instance, self)


class BoundMethodDispatcher(Mapping):
    """Wraps a MethodDispatcher, binding its return values to `instance`"""
    def __init__(self, instance, dispatcher):
        self.instance = instance
        self.dispatcher = dispatcher

    def __getitem__(self, key):
        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
        # on a function, __get__ is used to bind a function to an instance as a bound method
        return self.dispatcher[key].__get__(self.instance)

    def get(self, key, default):
        if key in self.dispatcher:
            return self[key]
        else:
            return default

    def __iter__(self):
        return iter(self.dispatcher)

    def __len__(self):
        return len(self.dispatcher)

    def __contains__(self, key):
        return key in self.dispatcher
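
MethodDispatcher is how the tokenizer maps many input characters onto one handler, and the __get__ hook means handlers come back already bound. A minimal sketch with a hypothetical Tokenizer class (the names are illustrative, not html5lib's):

class Tokenizer(object):
    def handleSpace(self, char):
        return "space:%r" % char

    def handleOther(self, char):
        return "other:%r" % char

    # Tuple keys fan out to one handler; unknown keys fall back to `default`.
    dispatch = MethodDispatcher([((" ", "\t", "\n"), handleSpace)])
    dispatch.default = handleOther


tok = Tokenizer()
# Attribute access triggers __get__, so looked-up handlers arrive bound.
assert tok.dispatch[" "](" ") == "space:' '"
assert tok.dispatch["x"]("x") == "other:'x'"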
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds

def isSurrogatePair(data):
    return (len(data) == 2 and
            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)


def surrogatePairToCodepoint(data):
    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
                (ord(data[1]) - 0xDC00))
    return char_val


# Module Factory Factory (no, this isn't Java, I know)
|
||||||
|
# Here to stop this being duplicated all over the place.
|
||||||
|
|
||||||
|
|
||||||
|
def moduleFactoryFactory(factory):
|
||||||
|
moduleCache = {}
|
||||||
|
|
||||||
|
def moduleFactory(baseModule, *args, **kwargs):
|
||||||
|
if isinstance(ModuleType.__name__, type("")):
|
||||||
|
name = "_%s_factory" % baseModule.__name__
|
||||||
|
else:
|
||||||
|
name = b"_%s_factory" % baseModule.__name__
|
||||||
|
|
||||||
|
kwargs_tuple = tuple(kwargs.items())
|
||||||
|
|
||||||
|
try:
|
||||||
|
return moduleCache[name][args][kwargs_tuple]
|
||||||
|
except KeyError:
|
||||||
|
mod = ModuleType(name)
|
||||||
|
objs = factory(baseModule, *args, **kwargs)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
if "name" not in moduleCache:
|
||||||
|
moduleCache[name] = {}
|
||||||
|
if "args" not in moduleCache[name]:
|
||||||
|
moduleCache[name][args] = {}
|
||||||
|
if "kwargs" not in moduleCache[name][args]:
|
||||||
|
moduleCache[name][args][kwargs_tuple] = {}
|
||||||
|
moduleCache[name][args][kwargs_tuple] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
|
return moduleFactory
|
||||||
|
|
||||||
|
|
||||||
|
def memoize(func):
|
||||||
|
cache = {}
|
||||||
|
|
||||||
|
def wrapped(*args, **kwargs):
|
||||||
|
key = (tuple(args), tuple(kwargs.items()))
|
||||||
|
if key not in cache:
|
||||||
|
cache[key] = func(*args, **kwargs)
|
||||||
|
return cache[key]
|
||||||
|
|
||||||
|
return wrapped
|
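
The dispatch helper and the surrogate utilities above are easiest to see in action together. A minimal sketch, assuming the vendored package is importable as bleach._vendor.html5lib (matching the paths in this diff); the handler values are hypothetical:

    from bleach._vendor.html5lib._utils import MethodDispatcher, surrogatePairToCodepoint

    # Tuple keys fan out on construction, so one value serves several tag
    # names, and missing keys fall back to the .default attribute (None here).
    handlers = MethodDispatcher([(("b", "strong"), "bold"), ("em", "italic")])
    print(handlers["strong"])   # bold
    print(handlers["video"])    # None

    # 0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDE00 - 0xDC00) == 0x1F600
    print(hex(surrogatePairToCodepoint("\ud83d\ude00")))  # 0x1f600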
lib/bleach/_vendor/html5lib/constants.py (new file, 2946 lines)
File diff suppressed because it is too large
lib/bleach/_vendor/html5lib/filters/__init__.py (new file, 0 lines)
lib/bleach/_vendor/html5lib/filters/alphabeticalattributes.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from __future__ import absolute_import, division, unicode_literals

from . import base

from collections import OrderedDict


def _attr_key(attr):
    """Return an appropriate key for an attribute for sorting

    Attributes have a namespace that can be either ``None`` or a string. We
    can't compare the two because they're different types, so we convert
    ``None`` to an empty string first.

    """
    return (attr[0][0] or ''), attr[0][1]


class Filter(base.Filter):
    """Alphabetizes attributes for elements"""
    def __iter__(self):
        for token in base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                attrs = OrderedDict()
                for name, value in sorted(token["data"].items(),
                                          key=_attr_key):
                    attrs[name] = value
                token["data"] = attrs
            yield token
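
A quick sketch of what this filter does to a start-tag token, assuming the vendored import path from this diff; the token dict is hand-built for illustration:

    from bleach._vendor.html5lib.filters.alphabeticalattributes import Filter

    tokens = [{"type": "StartTag", "name": "a",
               "data": {(None, "title"): "t", (None, "href"): "h"}}]
    # Attributes come back sorted by (namespace or "", name): href before title.
    print(list(Filter(tokens))[0]["data"])
    # OrderedDict([((None, 'href'), 'h'), ((None, 'title'), 't')])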
lib/bleach/_vendor/html5lib/filters/base.py (new file, 12 lines)
@@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals


class Filter(object):
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)

    def __getattr__(self, name):
        return getattr(self.source, name)
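
This base class just proxies iteration and attribute access to its source, so concrete filters subclass it and override __iter__. A minimal sketch of such a subclass (the Upper filter here is hypothetical; vendored import path assumed):

    from bleach._vendor.html5lib.filters import base

    class Upper(base.Filter):
        def __iter__(self):
            for token in base.Filter.__iter__(self):
                if token["type"] == "Characters":
                    # Copy the token rather than mutating the source's dict.
                    token = dict(token, data=token["data"].upper())
                yield token

    print([t["data"] for t in Upper([{"type": "Characters", "data": "hi"}])])  # ['HI']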
lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py (new file, 73 lines)
@@ -0,0 +1,73 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
    def __init__(self, source, encoding):
        """Creates a Filter

        :arg source: the source token stream

        :arg encoding: the encoding to set

        """
        base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
        state = "pre_head"
        meta_found = (self.encoding is None)
        pending = []

        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    for (namespace, name), value in token["data"].items():
                        if namespace is not None:
                            continue
                        elif name.lower() == 'charset':
                            token["data"][(namespace, name)] = self.encoding
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                    else:
                        if has_http_equiv_content_type and (None, "content") in token["data"]:
                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
                            meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": {(None, "charset"): self.encoding}}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": {(None, "charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
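
Fed a head with no meta, the filter synthesizes one before replaying the buffered head contents. A minimal sketch with a hand-built token stream (vendored import path assumed):

    from bleach._vendor.html5lib.filters.inject_meta_charset import Filter

    tokens = [
        {"type": "StartTag", "name": "head", "data": {}},
        {"type": "EndTag", "name": "head"},
    ]
    for tok in Filter(tokens, "utf-8"):
        print(tok["type"], tok["name"])
    # StartTag head
    # EmptyTag meta   (with {(None, "charset"): "utf-8"})
    # EndTag head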
lib/bleach/_vendor/html5lib/filters/lint.py (new file, 93 lines)
@@ -0,0 +1,93 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type

from . import base
from ..constants import namespaces, voidElements

from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)


class Filter(base.Filter):
    """Lints the token stream for errors

    If it finds any errors, it'll raise an ``AssertionError``.

    """
    def __init__(self, source, require_matching_tags=True):
        """Creates a Filter

        :arg source: the source token stream

        :arg require_matching_tags: whether or not to require matching tags

        """
        super(Filter, self).__init__(source)
        self.require_matching_tags = require_matching_tags

    def __iter__(self):
        open_elements = []
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                assert isinstance(token["data"], dict)
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert type == "EmptyTag"
                else:
                    assert type == "StartTag"
                if type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (namespace, name), value in token["data"].items():
                    assert namespace is None or isinstance(namespace, text_type)
                    assert namespace != ""
                    assert isinstance(name, text_type)
                    assert name != ""
                    assert isinstance(value, text_type)

            elif type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
                elif self.require_matching_tags:
                    start = open_elements.pop()
                    assert start == (namespace, name)

            elif type == "Comment":
                data = token["data"]
                assert isinstance(data, text_type)

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data, text_type)
                assert data != ""
                if type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif type == "Doctype":
                name = token["name"]
                assert name is None or isinstance(name, text_type)
                assert token["publicId"] is None or isinstance(name, text_type)
                assert token["systemId"] is None or isinstance(name, text_type)

            elif type == "Entity":
                assert isinstance(token["name"], text_type)

            elif type == "SerializerError":
                assert isinstance(token["data"], text_type)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": type}

            yield token
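
A sketch of the lint filter tripping on an invalid stream, here a void element reported as an end tag (vendored import path assumed):

    from bleach._vendor.html5lib.filters.lint import Filter

    tokens = [{"type": "EndTag", "name": "br", "namespace": None}]
    try:
        list(Filter(tokens))  # consuming the generator runs the assertions
    except AssertionError as err:
        print(err)  # Void element reported as EndTag token: br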
lib/bleach/_vendor/html5lib/filters/optionaltags.py (new file, 207 lines)
@@ -0,0 +1,207 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Removes optional tags from the token stream"""
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                if (token["data"] or
                        not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if type in ("StartTag", "EmptyTag"):
                return True
            elif type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol',
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return type == "EndTag" or type is None
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False
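
The effect is easiest to see on a table row, whose td and tr end tags are omittable; a sketch with hand-built tokens (vendored import path assumed):

    from bleach._vendor.html5lib.filters.optionaltags import Filter

    tokens = [
        {"type": "StartTag", "name": "tr", "data": {}},
        {"type": "StartTag", "name": "td", "data": {}},
        {"type": "Characters", "data": "x"},
        {"type": "EndTag", "name": "td"},
        {"type": "EndTag", "name": "tr"},
    ]
    # Both end tags are dropped: </td> precedes </tr>, and </tr> ends the stream.
    print([t["name"] for t in Filter(tokens) if "name" in t])  # ['tr', 'td']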
lib/bleach/_vendor/html5lib/filters/sanitizer.py (new file, 916 lines)
@@ -0,0 +1,916 @@
"""Deprecated from html5lib 1.1.

See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.

"""
from __future__ import absolute_import, division, unicode_literals

import re
import warnings
from xml.sax.saxutils import escape, unescape

from six.moves import urllib_parse as urlparse

from . import base
from ..constants import namespaces, prefixes

__all__ = ["Filter"]


_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see " +
    "https://github.com/html5lib/html5lib-python/issues/443 and please let " +
    "us know if Bleach is unsuitable for your needs"
)

warnings.warn(_deprecation_msg, DeprecationWarning)

allowed_elements = frozenset((
    (namespaces['html'], 'a'),
    (namespaces['html'], 'abbr'),
    (namespaces['html'], 'acronym'),
    (namespaces['html'], 'address'),
    (namespaces['html'], 'area'),
    (namespaces['html'], 'article'),
    (namespaces['html'], 'aside'),
    (namespaces['html'], 'audio'),
    (namespaces['html'], 'b'),
    (namespaces['html'], 'big'),
    (namespaces['html'], 'blockquote'),
    (namespaces['html'], 'br'),
    (namespaces['html'], 'button'),
    (namespaces['html'], 'canvas'),
    (namespaces['html'], 'caption'),
    (namespaces['html'], 'center'),
    (namespaces['html'], 'cite'),
    (namespaces['html'], 'code'),
    (namespaces['html'], 'col'),
    (namespaces['html'], 'colgroup'),
    (namespaces['html'], 'command'),
    (namespaces['html'], 'datagrid'),
    (namespaces['html'], 'datalist'),
    (namespaces['html'], 'dd'),
    (namespaces['html'], 'del'),
    (namespaces['html'], 'details'),
    (namespaces['html'], 'dfn'),
    (namespaces['html'], 'dialog'),
    (namespaces['html'], 'dir'),
    (namespaces['html'], 'div'),
    (namespaces['html'], 'dl'),
    (namespaces['html'], 'dt'),
    (namespaces['html'], 'em'),
    (namespaces['html'], 'event-source'),
    (namespaces['html'], 'fieldset'),
    (namespaces['html'], 'figcaption'),
    (namespaces['html'], 'figure'),
    (namespaces['html'], 'footer'),
    (namespaces['html'], 'font'),
    (namespaces['html'], 'form'),
    (namespaces['html'], 'header'),
    (namespaces['html'], 'h1'),
    (namespaces['html'], 'h2'),
    (namespaces['html'], 'h3'),
    (namespaces['html'], 'h4'),
    (namespaces['html'], 'h5'),
    (namespaces['html'], 'h6'),
    (namespaces['html'], 'hr'),
    (namespaces['html'], 'i'),
    (namespaces['html'], 'img'),
    (namespaces['html'], 'input'),
    (namespaces['html'], 'ins'),
    (namespaces['html'], 'keygen'),
    (namespaces['html'], 'kbd'),
    (namespaces['html'], 'label'),
    (namespaces['html'], 'legend'),
    (namespaces['html'], 'li'),
    (namespaces['html'], 'm'),
    (namespaces['html'], 'map'),
    (namespaces['html'], 'menu'),
    (namespaces['html'], 'meter'),
    (namespaces['html'], 'multicol'),
    (namespaces['html'], 'nav'),
    (namespaces['html'], 'nextid'),
    (namespaces['html'], 'ol'),
    (namespaces['html'], 'output'),
    (namespaces['html'], 'optgroup'),
    (namespaces['html'], 'option'),
    (namespaces['html'], 'p'),
    (namespaces['html'], 'pre'),
    (namespaces['html'], 'progress'),
    (namespaces['html'], 'q'),
    (namespaces['html'], 's'),
    (namespaces['html'], 'samp'),
    (namespaces['html'], 'section'),
    (namespaces['html'], 'select'),
    (namespaces['html'], 'small'),
    (namespaces['html'], 'sound'),
    (namespaces['html'], 'source'),
    (namespaces['html'], 'spacer'),
    (namespaces['html'], 'span'),
    (namespaces['html'], 'strike'),
    (namespaces['html'], 'strong'),
    (namespaces['html'], 'sub'),
    (namespaces['html'], 'sup'),
    (namespaces['html'], 'table'),
    (namespaces['html'], 'tbody'),
    (namespaces['html'], 'td'),
    (namespaces['html'], 'textarea'),
    (namespaces['html'], 'time'),
    (namespaces['html'], 'tfoot'),
    (namespaces['html'], 'th'),
    (namespaces['html'], 'thead'),
    (namespaces['html'], 'tr'),
    (namespaces['html'], 'tt'),
    (namespaces['html'], 'u'),
    (namespaces['html'], 'ul'),
    (namespaces['html'], 'var'),
    (namespaces['html'], 'video'),
    (namespaces['mathml'], 'maction'),
    (namespaces['mathml'], 'math'),
    (namespaces['mathml'], 'merror'),
    (namespaces['mathml'], 'mfrac'),
    (namespaces['mathml'], 'mi'),
    (namespaces['mathml'], 'mmultiscripts'),
    (namespaces['mathml'], 'mn'),
    (namespaces['mathml'], 'mo'),
    (namespaces['mathml'], 'mover'),
    (namespaces['mathml'], 'mpadded'),
    (namespaces['mathml'], 'mphantom'),
    (namespaces['mathml'], 'mprescripts'),
    (namespaces['mathml'], 'mroot'),
    (namespaces['mathml'], 'mrow'),
    (namespaces['mathml'], 'mspace'),
    (namespaces['mathml'], 'msqrt'),
    (namespaces['mathml'], 'mstyle'),
    (namespaces['mathml'], 'msub'),
    (namespaces['mathml'], 'msubsup'),
    (namespaces['mathml'], 'msup'),
    (namespaces['mathml'], 'mtable'),
    (namespaces['mathml'], 'mtd'),
    (namespaces['mathml'], 'mtext'),
    (namespaces['mathml'], 'mtr'),
    (namespaces['mathml'], 'munder'),
    (namespaces['mathml'], 'munderover'),
    (namespaces['mathml'], 'none'),
    (namespaces['svg'], 'a'),
    (namespaces['svg'], 'animate'),
    (namespaces['svg'], 'animateColor'),
    (namespaces['svg'], 'animateMotion'),
    (namespaces['svg'], 'animateTransform'),
    (namespaces['svg'], 'clipPath'),
    (namespaces['svg'], 'circle'),
    (namespaces['svg'], 'defs'),
    (namespaces['svg'], 'desc'),
    (namespaces['svg'], 'ellipse'),
    (namespaces['svg'], 'font-face'),
    (namespaces['svg'], 'font-face-name'),
    (namespaces['svg'], 'font-face-src'),
    (namespaces['svg'], 'g'),
    (namespaces['svg'], 'glyph'),
    (namespaces['svg'], 'hkern'),
    (namespaces['svg'], 'linearGradient'),
    (namespaces['svg'], 'line'),
    (namespaces['svg'], 'marker'),
    (namespaces['svg'], 'metadata'),
    (namespaces['svg'], 'missing-glyph'),
    (namespaces['svg'], 'mpath'),
    (namespaces['svg'], 'path'),
    (namespaces['svg'], 'polygon'),
    (namespaces['svg'], 'polyline'),
    (namespaces['svg'], 'radialGradient'),
    (namespaces['svg'], 'rect'),
    (namespaces['svg'], 'set'),
    (namespaces['svg'], 'stop'),
    (namespaces['svg'], 'svg'),
    (namespaces['svg'], 'switch'),
    (namespaces['svg'], 'text'),
    (namespaces['svg'], 'title'),
    (namespaces['svg'], 'tspan'),
    (namespaces['svg'], 'use'),
))

allowed_attributes = frozenset((
    # HTML attributes
    (None, 'abbr'),
    (None, 'accept'),
    (None, 'accept-charset'),
    (None, 'accesskey'),
    (None, 'action'),
    (None, 'align'),
    (None, 'alt'),
    (None, 'autocomplete'),
    (None, 'autofocus'),
    (None, 'axis'),
    (None, 'background'),
    (None, 'balance'),
    (None, 'bgcolor'),
    (None, 'bgproperties'),
    (None, 'border'),
    (None, 'bordercolor'),
    (None, 'bordercolordark'),
    (None, 'bordercolorlight'),
    (None, 'bottompadding'),
    (None, 'cellpadding'),
    (None, 'cellspacing'),
    (None, 'ch'),
    (None, 'challenge'),
    (None, 'char'),
    (None, 'charoff'),
    (None, 'choff'),
    (None, 'charset'),
    (None, 'checked'),
    (None, 'cite'),
    (None, 'class'),
    (None, 'clear'),
    (None, 'color'),
    (None, 'cols'),
    (None, 'colspan'),
    (None, 'compact'),
    (None, 'contenteditable'),
    (None, 'controls'),
    (None, 'coords'),
    (None, 'data'),
    (None, 'datafld'),
    (None, 'datapagesize'),
    (None, 'datasrc'),
    (None, 'datetime'),
    (None, 'default'),
    (None, 'delay'),
    (None, 'dir'),
    (None, 'disabled'),
    (None, 'draggable'),
    (None, 'dynsrc'),
    (None, 'enctype'),
    (None, 'end'),
    (None, 'face'),
    (None, 'for'),
    (None, 'form'),
    (None, 'frame'),
    (None, 'galleryimg'),
    (None, 'gutter'),
    (None, 'headers'),
    (None, 'height'),
    (None, 'hidefocus'),
    (None, 'hidden'),
    (None, 'high'),
    (None, 'href'),
    (None, 'hreflang'),
    (None, 'hspace'),
    (None, 'icon'),
    (None, 'id'),
    (None, 'inputmode'),
    (None, 'ismap'),
    (None, 'keytype'),
    (None, 'label'),
    (None, 'leftspacing'),
    (None, 'lang'),
    (None, 'list'),
    (None, 'longdesc'),
    (None, 'loop'),
    (None, 'loopcount'),
    (None, 'loopend'),
    (None, 'loopstart'),
    (None, 'low'),
    (None, 'lowsrc'),
    (None, 'max'),
    (None, 'maxlength'),
    (None, 'media'),
    (None, 'method'),
    (None, 'min'),
    (None, 'multiple'),
    (None, 'name'),
    (None, 'nohref'),
    (None, 'noshade'),
    (None, 'nowrap'),
    (None, 'open'),
    (None, 'optimum'),
    (None, 'pattern'),
    (None, 'ping'),
    (None, 'point-size'),
    (None, 'poster'),
    (None, 'pqg'),
    (None, 'preload'),
    (None, 'prompt'),
    (None, 'radiogroup'),
    (None, 'readonly'),
    (None, 'rel'),
    (None, 'repeat-max'),
    (None, 'repeat-min'),
    (None, 'replace'),
    (None, 'required'),
    (None, 'rev'),
    (None, 'rightspacing'),
    (None, 'rows'),
    (None, 'rowspan'),
    (None, 'rules'),
    (None, 'scope'),
    (None, 'selected'),
    (None, 'shape'),
    (None, 'size'),
    (None, 'span'),
    (None, 'src'),
    (None, 'start'),
    (None, 'step'),
    (None, 'style'),
    (None, 'summary'),
    (None, 'suppress'),
    (None, 'tabindex'),
    (None, 'target'),
    (None, 'template'),
    (None, 'title'),
    (None, 'toppadding'),
    (None, 'type'),
    (None, 'unselectable'),
    (None, 'usemap'),
    (None, 'urn'),
    (None, 'valign'),
    (None, 'value'),
    (None, 'variable'),
    (None, 'volume'),
    (None, 'vspace'),
    (None, 'vrml'),
    (None, 'width'),
    (None, 'wrap'),
    (namespaces['xml'], 'lang'),
    # MathML attributes
    (None, 'actiontype'),
    (None, 'align'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnlines'),
    (None, 'columnspacing'),
    (None, 'columnspan'),
    (None, 'depth'),
    (None, 'display'),
    (None, 'displaystyle'),
    (None, 'equalcolumns'),
    (None, 'equalrows'),
    (None, 'fence'),
    (None, 'fontstyle'),
    (None, 'fontweight'),
    (None, 'frame'),
    (None, 'height'),
    (None, 'linethickness'),
    (None, 'lspace'),
    (None, 'mathbackground'),
    (None, 'mathcolor'),
    (None, 'mathvariant'),
    (None, 'mathvariant'),
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowlines'),
    (None, 'rowspacing'),
    (None, 'rowspan'),
    (None, 'rspace'),
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'stretchy'),
    (None, 'width'),
    (None, 'width'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'type'),
    # SVG attributes
    (None, 'accent-height'),
    (None, 'accumulate'),
    (None, 'additive'),
    (None, 'alphabetic'),
    (None, 'arabic-form'),
    (None, 'ascent'),
    (None, 'attributeName'),
    (None, 'attributeType'),
    (None, 'baseProfile'),
    (None, 'bbox'),
    (None, 'begin'),
    (None, 'by'),
    (None, 'calcMode'),
    (None, 'cap-height'),
    (None, 'class'),
    (None, 'clip-path'),
    (None, 'color'),
    (None, 'color-rendering'),
    (None, 'content'),
    (None, 'cx'),
    (None, 'cy'),
    (None, 'd'),
    (None, 'dx'),
    (None, 'dy'),
    (None, 'descent'),
    (None, 'display'),
    (None, 'dur'),
    (None, 'end'),
    (None, 'fill'),
    (None, 'fill-opacity'),
    (None, 'fill-rule'),
    (None, 'font-family'),
    (None, 'font-size'),
    (None, 'font-stretch'),
    (None, 'font-style'),
    (None, 'font-variant'),
    (None, 'font-weight'),
    (None, 'from'),
    (None, 'fx'),
    (None, 'fy'),
    (None, 'g1'),
    (None, 'g2'),
    (None, 'glyph-name'),
    (None, 'gradientUnits'),
    (None, 'hanging'),
    (None, 'height'),
    (None, 'horiz-adv-x'),
    (None, 'horiz-origin-x'),
    (None, 'id'),
    (None, 'ideographic'),
    (None, 'k'),
    (None, 'keyPoints'),
    (None, 'keySplines'),
    (None, 'keyTimes'),
    (None, 'lang'),
    (None, 'marker-end'),
    (None, 'marker-mid'),
    (None, 'marker-start'),
    (None, 'markerHeight'),
    (None, 'markerUnits'),
    (None, 'markerWidth'),
    (None, 'mathematical'),
    (None, 'max'),
    (None, 'min'),
    (None, 'name'),
    (None, 'offset'),
    (None, 'opacity'),
    (None, 'orient'),
    (None, 'origin'),
    (None, 'overline-position'),
    (None, 'overline-thickness'),
    (None, 'panose-1'),
    (None, 'path'),
    (None, 'pathLength'),
    (None, 'points'),
    (None, 'preserveAspectRatio'),
    (None, 'r'),
    (None, 'refX'),
    (None, 'refY'),
    (None, 'repeatCount'),
    (None, 'repeatDur'),
    (None, 'requiredExtensions'),
    (None, 'requiredFeatures'),
    (None, 'restart'),
    (None, 'rotate'),
    (None, 'rx'),
    (None, 'ry'),
    (None, 'slope'),
    (None, 'stemh'),
    (None, 'stemv'),
    (None, 'stop-color'),
    (None, 'stop-opacity'),
    (None, 'strikethrough-position'),
    (None, 'strikethrough-thickness'),
    (None, 'stroke'),
    (None, 'stroke-dasharray'),
    (None, 'stroke-dashoffset'),
    (None, 'stroke-linecap'),
    (None, 'stroke-linejoin'),
    (None, 'stroke-miterlimit'),
    (None, 'stroke-opacity'),
    (None, 'stroke-width'),
    (None, 'systemLanguage'),
    (None, 'target'),
    (None, 'text-anchor'),
    (None, 'to'),
    (None, 'transform'),
    (None, 'type'),
    (None, 'u1'),
    (None, 'u2'),
    (None, 'underline-position'),
    (None, 'underline-thickness'),
    (None, 'unicode'),
    (None, 'unicode-range'),
    (None, 'units-per-em'),
    (None, 'values'),
    (None, 'version'),
    (None, 'viewBox'),
    (None, 'visibility'),
    (None, 'width'),
    (None, 'widths'),
    (None, 'x'),
    (None, 'x-height'),
    (None, 'x1'),
    (None, 'x2'),
    (namespaces['xlink'], 'actuate'),
    (namespaces['xlink'], 'arcrole'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'role'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'title'),
    (namespaces['xlink'], 'type'),
    (namespaces['xml'], 'base'),
    (namespaces['xml'], 'lang'),
    (namespaces['xml'], 'space'),
    (None, 'y'),
    (None, 'y1'),
    (None, 'y2'),
    (None, 'zoomAndPan'),
))

attr_val_is_uri = frozenset((
    (None, 'href'),
    (None, 'src'),
    (None, 'cite'),
    (None, 'action'),
    (None, 'longdesc'),
    (None, 'poster'),
    (None, 'background'),
    (None, 'datasrc'),
    (None, 'dynsrc'),
    (None, 'lowsrc'),
    (None, 'ping'),
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
))

svg_attr_val_allows_ref = frozenset((
    (None, 'clip-path'),
    (None, 'color-profile'),
    (None, 'cursor'),
    (None, 'fill'),
    (None, 'filter'),
    (None, 'marker'),
    (None, 'marker-start'),
    (None, 'marker-mid'),
    (None, 'marker-end'),
    (None, 'mask'),
    (None, 'stroke'),
))

svg_allow_local_href = frozenset((
    (None, 'altGlyph'),
    (None, 'animate'),
    (None, 'animateColor'),
    (None, 'animateMotion'),
    (None, 'animateTransform'),
    (None, 'cursor'),
    (None, 'feImage'),
    (None, 'filter'),
    (None, 'linearGradient'),
    (None, 'pattern'),
    (None, 'radialGradient'),
    (None, 'textpath'),
    (None, 'tref'),
    (None, 'set'),
    (None, 'use')
))

allowed_css_properties = frozenset((
    'azimuth',
    'background-color',
    'border-bottom-color',
    'border-collapse',
    'border-color',
    'border-left-color',
    'border-right-color',
    'border-top-color',
    'clear',
    'color',
    'cursor',
    'direction',
    'display',
    'elevation',
    'float',
    'font',
    'font-family',
    'font-size',
    'font-style',
    'font-variant',
    'font-weight',
    'height',
    'letter-spacing',
    'line-height',
    'overflow',
    'pause',
    'pause-after',
    'pause-before',
    'pitch',
    'pitch-range',
    'richness',
    'speak',
    'speak-header',
    'speak-numeral',
    'speak-punctuation',
    'speech-rate',
    'stress',
    'text-align',
    'text-decoration',
    'text-indent',
    'unicode-bidi',
    'vertical-align',
    'voice-family',
    'volume',
    'white-space',
    'width',
))

allowed_css_keywords = frozenset((
    'auto',
    'aqua',
    'black',
    'block',
    'blue',
    'bold',
    'both',
    'bottom',
    'brown',
    'center',
    'collapse',
    'dashed',
    'dotted',
    'fuchsia',
    'gray',
    'green',
    '!important',
    'italic',
    'left',
    'lime',
    'maroon',
    'medium',
    'none',
    'navy',
    'normal',
    'nowrap',
    'olive',
    'pointer',
    'purple',
    'red',
    'right',
    'solid',
    'silver',
    'teal',
    'top',
    'transparent',
    'underline',
    'white',
    'yellow',
))

allowed_svg_properties = frozenset((
    'fill',
    'fill-opacity',
    'fill-rule',
    'stroke',
    'stroke-width',
    'stroke-linecap',
    'stroke-linejoin',
    'stroke-opacity',
))

allowed_protocols = frozenset((
    'ed2k',
    'ftp',
    'http',
    'https',
    'irc',
    'mailto',
    'news',
    'gopher',
    'nntp',
    'telnet',
    'webcal',
    'xmpp',
    'callto',
    'feed',
    'urn',
    'aim',
    'rsync',
    'tag',
    'ssh',
    'sftp',
    'rtsp',
    'afs',
    'data',
))

allowed_content_types = frozenset((
    'image/png',
    'image/jpeg',
    'image/gif',
    'image/webp',
    'image/bmp',
    'text/plain',
))


data_content_type = re.compile(r'''
                               ^
                               # Match a content type <application>/<type>
                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                               # Match any character set and encoding
                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                               # Assume the rest is data
                               ,.*
                               $
                               ''',
                               re.VERBOSE)


class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs--these are removed

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):

        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            pass
        else:
            return token

    def allowed_token(self, token):
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    if uri.scheme == 'data':
                        m = data_content_type.match(uri.path)
                        if not m:
                            del attrs[attr]
                        elif m.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
                                                                         attrs[(namespaces['xlink'], 'href')])):
                del attrs[(namespaces['xlink'], 'href')]
            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
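
sanitize_css keeps only whitelisted properties and drops the rest; a minimal sketch (vendored import path assumed; note that importing the module already emits the DeprecationWarning above):

    from bleach._vendor.html5lib.filters.sanitizer import Filter

    f = Filter([])  # empty token source; we only call the CSS helper
    print(f.sanitize_css("color: red; position: fixed"))  # color: red;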
lib/bleach/_vendor/html5lib/filters/whitespace.py (new file, 38 lines)
@@ -0,0 +1,38 @@
from __future__ import absolute_import, division, unicode_literals

import re

from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)

SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


class Filter(base.Filter):
    """Collapses whitespace except in pre, textarea, and script elements"""
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
                    and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = " "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token


def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
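
A sketch of the collapsing behavior on a Characters token (vendored import path assumed):

    from bleach._vendor.html5lib.filters.whitespace import Filter

    tokens = [{"type": "Characters", "data": "a   b\n\nc"}]
    # Runs of spaces, tabs, and newlines collapse to a single space.
    print(next(iter(Filter(tokens)))["data"])  # a b c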
lib/bleach/_vendor/html5lib/html5parser.py (new file, 2795 lines)
File diff suppressed because it is too large
lib/bleach/_vendor/html5lib/serializer.py (new file, 409 lines)
|
@ -0,0 +1,409 @@
|
from __future__ import absolute_import, division, unicode_literals
from six import text_type

import re

from codecs import register_error, xmlcharrefreplace_errors

from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape

_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")


_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)
        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k


def htmlentityreplace_errors(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                skip = False
                continue
            index = i + exc.start
            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        for cp in codepoints:
            e = _encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)


register_error("htmlentityreplace", htmlentityreplace_errors)
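The error handler just registered is what HTMLSerializer.encode() relies on: once this module has been imported, any str.encode() call can name the "htmlentityreplace" handler, and characters that do not fit the target encoding come back as named or numeric character references. A minimal sketch (output shown is approximate):

# Importing this module runs register_error() above, installing the
# "htmlentityreplace" handler process-wide.
text = "caf\u00e9 \u2603"  # 'café' plus a snowman
print(text.encode("ascii", "htmlentityreplace"))
# b'caf&eacute; &#x2603;' -- U+00E9 has a named entity, U+2603 does not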

def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if len(unexpected_args) > 0:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
            return "".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass
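Most of the options documented above can be seen working together in one place; a short sketch, assuming the vendored import path lib/bleach/_vendor (upstream, the package is importable as plain html5lib):

from bleach._vendor import html5lib
from bleach._vendor.html5lib.serializer import HTMLSerializer

doc = html5lib.parse('<img src="logo.png" alt="a \'logo\'">')
walker = html5lib.getTreeWalker("etree")
s = HTMLSerializer(quote_attr_values="always",  # quote even "safe" values
                   use_trailing_solidus=True)   # emit <img ... /> for void elements
print(s.render(walker(doc)))
# roughly: <img src="logo.png" alt="a 'logo'" /> (optional tags omitted by default)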
30 lib/bleach/_vendor/html5lib/treeadapters/__init__.py Normal file
@@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another

Example:

.. code-block:: python

   import html5lib
   from html5lib.treeadapters import genshi

   doc = '<html><body>Hi!</body></html>'
   treebuilder = html5lib.getTreeBuilder('etree')
   parser = html5lib.HTMLParser(tree=treebuilder)
   tree = parser.parse(doc)
   TreeWalker = html5lib.getTreeWalker('etree')

   genshi_tree = genshi.to_genshi(TreeWalker(tree))

"""
from __future__ import absolute_import, division, unicode_literals

from . import sax

__all__ = ["sax"]

try:
    from . import genshi  # noqa
except ImportError:
    pass
else:
    __all__.append("genshi")
54 lib/bleach/_vendor/html5lib/treeadapters/genshi.py Normal file
@@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE


def to_genshi(walker):
    """Convert a tree to a genshi tree

    :arg walker: the treewalker to use to walk the tree to convert it

    :returns: generator of genshi nodes

    """
    text = []
    for token in walker:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            text.append(token["data"])
        elif text:
            yield TEXT, "".join(text), (None, -1, -1)
            text = []

        if type in ("StartTag", "EmptyTag"):
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                           for attr, value in token["data"].items()])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if type == "EmptyTag":
                type = "EndTag"

        if type == "EndTag":
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]

            yield END, QName(name), (None, -1, -1)

        elif type == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)

        elif type == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)

        else:
            pass  # FIXME: What to do?

    if text:
        yield TEXT, "".join(text), (None, -1, -1)
50 lib/bleach/_vendor/html5lib/treeadapters/sax.py Normal file
@@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is not None:
        prefix_mapping[prefix] = namespace


def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    :arg walker: the treewalker to use to walk the tree to convert it

    :arg handler: SAX handler to use

    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        type = token["type"]
        if type == "Doctype":
            continue
        elif type in ("StartTag", "EmptyTag"):
            attrs = AttributesNSImpl(token["data"],
                                     unadjustForeignAttributes)
            handler.startElementNS((token["namespace"], token["name"]),
                                   token["name"],
                                   attrs)
            if type == "EmptyTag":
                handler.endElementNS((token["namespace"], token["name"]),
                                     token["name"])
        elif type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif type == "Comment":
            pass
        else:
            assert False, "Unknown token type"

    for prefix, namespace in prefix_mapping.items():
        handler.endPrefixMapping(prefix)
    handler.endDocument()
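to_sax() drives any SAX2 content handler; xml.sax.handler.ContentHandler is a no-op base class, so a toy handler that counts start tags is enough to sketch the flow (vendored import path assumed):

from xml.sax.handler import ContentHandler

from bleach._vendor import html5lib
from bleach._vendor.html5lib.treeadapters.sax import to_sax


class TagCounter(ContentHandler):
    # Toy handler: count start tags as the walker replays the tree.
    def __init__(self):
        ContentHandler.__init__(self)
        self.count = 0

    def startElementNS(self, name, qname, attrs):
        self.count += 1


tree = html5lib.parse("<p>one</p><p>two</p>")
walker = html5lib.getTreeWalker("etree")
handler = TagCounter()
to_sax(walker(tree), handler)
print(handler.count)  # 5: html, head, body, and the two p elements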
88 lib/bleach/_vendor/html5lib/treebuilders/__init__.py Normal file
@@ -0,0 +1,88 @@
"""A collection of modules for building different kinds of trees from HTML
documents.

To create a treebuilder for a new type of tree, you need to
implement several things:

1. A set of classes for various types of elements: Document, Doctype, Comment,
   Element. These must implement the interface of ``base.treebuilders.Node``
   (although comment nodes have a different signature for their constructor,
   see ``treebuilders.etree.Comment``). Textual content may also be implemented
   as another node type, or not, as your tree implementation requires.

2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
   from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:

   * ``documentClass`` - the class to use for the bottommost node of a document
   * ``elementClass`` - the class to use for HTML Elements
   * ``commentClass`` - the class to use for comments
   * ``doctypeClass`` - the class to use for doctypes

   It also has one required method:

   * ``getDocument`` - Returns the root node of the complete document tree

3. If you wish to run the unit tests, you must also create a ``testSerializer``
   method on your treebuilder which accepts a node and returns a string
   containing the node and its children serialized according to the format used
   in the unittests

"""

from __future__ import absolute_import, division, unicode_literals

from .._utils import default_etree

treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of trees with built-in support

    :arg treeType: the name of the tree type required (case-insensitive). Supported
        values are:

        * "dom" - A generic builder for DOM implementations, defaulting to a
          xml.dom.minidom based implementation.
        * "etree" - A generic builder for tree implementations exposing an
          ElementTree-like interface, defaulting to xml.etree.cElementTree if
          available and xml.etree.ElementTree if not.
        * "lxml" - An etree-based builder for lxml.etree, handling limitations
          of lxml's implementation.

    :arg implementation: (Currently applies to the "etree" and "dom" tree
        types). A module implementing the tree type e.g. xml.etree.ElementTree
        or xml.etree.cElementTree.

    :arg kwargs: Any additional options to pass to the TreeBuilder when
        creating it.

    Example:

    >>> from html5lib.treebuilders import getTreeBuilder
    >>> builder = getTreeBuilder('etree')

    """

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
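getTreeBuilder's implementation argument is easy to overlook; a brief sketch pinning the etree builder to a specific stdlib module (vendored import path assumed):

import xml.etree.ElementTree as ET

from bleach._vendor import html5lib
from bleach._vendor.html5lib.treebuilders import getTreeBuilder

# Ask for an etree-backed builder explicitly bound to the stdlib module.
TreeBuilder = getTreeBuilder("etree", implementation=ET)
parser = html5lib.HTMLParser(tree=TreeBuilder)
root = parser.parse("<p>Hi!</p>")
print(root)  # an xml.etree Element wrapping the parsed <html> node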
417 lib/bleach/_vendor/html5lib/treebuilders/base.py Normal file
@@ -0,0 +1,417 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from ..constants import scopingElements, tableInsertModeElements, namespaces

# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None

listElementsMap = {
    None: (frozenset(scopingElements), False),
    "button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
    "list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
                                          (namespaces["html"], "ul")}), False),
    "table": (frozenset([(namespaces["html"], "html"),
                         (namespaces["html"], "table")]), False),
    "select": (frozenset([(namespaces["html"], "optgroup"),
                          (namespaces["html"], "option")]), True)
}


class Node(object):
    """Represents an item in the tree"""
    def __init__(self, name):
        """Creates a Node

        :arg name: The tag name associated with the node

        """
        # The tag name associated with the node
        self.name = name
        # The parent of the current node (or None for the document node)
        self.parent = None
        # The value of the current node (applies to text nodes and comments)
        self.value = None
        # A dict holding name -> value pairs for attributes of the node
        self.attributes = {}
        # A list of child nodes of the current node. This must include all
        # elements but not necessarily other node types.
        self.childNodes = []
        # A list of miscellaneous flags that can be set on the node.
        self._flags = []

    def __str__(self):
        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
                                  for name, value in
                                  self.attributes.items()])
        if attributesStr:
            return "<%s %s>" % (self.name, attributesStr)
        else:
            return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s>" % (self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node

        :arg node: the node to insert

        """
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text.

        :arg data: the data to insert

        :arg insertBefore: True if you want to insert the text before the node
            and False if you want to insert it after the node

        """
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in the
        list of child nodes. Raises ValueError if refNode is not a child of
        the current node

        :arg node: the node to insert

        :arg refNode: the child node to insert the node before

        """
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node

        :arg node: the child node to remove

        """
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.
        This is needed so that trees that don't store text as nodes move the
        text in the correct way

        :arg newParent: the node to move all this node's children to

        """
        # XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the same
        name and attributes but with no parent or child nodes
        """
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text, false otherwise
        """
        raise NotImplementedError


class ActiveFormattingElements(list):
    def append(self, node):
        equalCount = 0
        if node != Marker:
            for element in self[::-1]:
                if element == Marker:
                    break
                if self.nodesEqual(element, node):
                    equalCount += 1
                if equalCount == 3:
                    self.remove(element)
                    break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        if not node1.nameTuple == node2.nameTuple:
            return False

        if not node1.attributes == node2.attributes:
            return False

        return True


class TreeBuilder(object):
    """Base treebuilder implementation

    * documentClass - the class to use for the bottommost node of a document
    * elementClass - the class to use for HTML Elements
    * commentClass - the class to use for comments
    * doctypeClass - the class to use for doctypes

    """
    # pylint:disable=not-callable

    # Document class
    documentClass = None

    # The class to use for creating a node
    elementClass = None

    # The class to use for creating comments
    commentClass = None

    # The class to use for creating doctypes
    doctypeClass = None

    # Fragment class
    fragmentClass = None

    def __init__(self, namespaceHTMLElements):
        """Create a TreeBuilder

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        """
        if namespaceHTMLElements:
            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
        else:
            self.defaultNamespace = None
        self.reset()

    def reset(self):
        self.openElements = []
        self.activeFormattingElements = ActiveFormattingElements()

        # XXX - rename these to headElement, formElement
        self.headPointer = None
        self.formPointer = None

        self.insertFromTable = False

        self.document = self.documentClass()

    def elementInScope(self, target, variant=None):

        # If we pass a node in we match that. If we pass a string
        # match any node with that name
        exactNode = hasattr(target, "nameTuple")
        if not exactNode:
            if isinstance(target, text_type):
                target = (namespaces["html"], target)
            assert isinstance(target, tuple)

        listElements, invert = listElementsMap[variant]

        for node in reversed(self.openElements):
            if exactNode and node == target:
                return True
            elif not exactNode and node.nameTuple == target:
                return True
            elif (invert ^ (node.nameTuple in listElements)):
                return False

        assert False  # We should never reach this point

    def reconstructActiveFormattingElements(self):
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.activeFormattingElements:
            return

        # Step 2 and step 3: we start with the last element. So i is -1.
        i = len(self.activeFormattingElements) - 1
        entry = self.activeFormattingElements[i]
        if entry == Marker or entry in self.openElements:
            return

        # Step 6
        while entry != Marker and entry not in self.openElements:
            if i == 0:
                # This will be reset to 0 below
                i = -1
                break
            i -= 1
            # Step 5: let entry be one earlier in the list.
            entry = self.activeFormattingElements[i]

        while True:
            # Step 7
            i += 1

            # Step 8
            entry = self.activeFormattingElements[i]
            clone = entry.cloneNode()  # Mainly to get a new copy of the attributes

            # Step 9
            element = self.insertElement({"type": "StartTag",
                                          "name": clone.name,
                                          "namespace": clone.namespace,
                                          "data": clone.attributes})

            # Step 10
            self.activeFormattingElements[i] = element

            # Step 11
            if element == self.activeFormattingElements[-1]:
                break

    def clearActiveFormattingElements(self):
        entry = self.activeFormattingElements.pop()
        while self.activeFormattingElements and entry != Marker:
            entry = self.activeFormattingElements.pop()

    def elementInActiveFormattingElements(self, name):
        """Check if an element exists between the end of the active
        formatting elements and the last marker. If it does, return it, else
        return false"""

        for item in self.activeFormattingElements[::-1]:
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item == Marker:
                break
            elif item.name == name:
                return item
        return False

    def insertRoot(self, token):
        element = self.createElement(token)
        self.openElements.append(element)
        self.document.appendChild(element)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = self.doctypeClass(name, publicId, systemId)
        self.document.appendChild(doctype)

    def insertComment(self, token, parent=None):
        if parent is None:
            parent = self.openElements[-1]
        parent.appendChild(self.commentClass(token["data"]))

    def createElement(self, token):
        """Create an element but don't insert it anywhere"""
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        return element

    def _getInsertFromTable(self):
        return self._insertFromTable

    def _setInsertFromTable(self, value):
        """Switch the function used to insert an element from the
        normal one to the misnested table one and back again"""
        self._insertFromTable = value
        if value:
            self.insertElement = self.insertElementTable
        else:
            self.insertElement = self.insertElementNormal

    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

    def insertElementNormal(self, token):
        name = token["name"]
        assert isinstance(name, text_type), "Element %s not unicode" % name
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        self.openElements[-1].appendChild(element)
        self.openElements.append(element)
        return element

    def insertElementTable(self, token):
        """Create an element and insert it into the tree"""
        element = self.createElement(token)
        if self.openElements[-1].name not in tableInsertModeElements:
            return self.insertElementNormal(token)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            if insertBefore is None:
                parent.appendChild(element)
            else:
                parent.insertBefore(element, insertBefore)
            self.openElements.append(element)
        return element

    def insertText(self, data, parent=None):
        """Insert text data."""
        if parent is None:
            parent = self.openElements[-1]

        if (not self.insertFromTable or (self.insertFromTable and
                                         self.openElements[-1].name
                                         not in tableInsertModeElements)):
            parent.insertText(data)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            parent.insertText(data, insertBefore)

    def getTableMisnestedNodePosition(self):
        """Get the foster parent element, and sibling to insert before
        (or None) when inserting a misnested table node"""
        # The foster parent element is the one which comes before the most
        # recently opened table element
        # XXX - this is really inelegant
        lastTable = None
        fosterParent = None
        insertBefore = None
        for elm in self.openElements[::-1]:
            if elm.name == "table":
                lastTable = elm
                break
        if lastTable:
            # XXX - we should really check that this parent is actually a
            # node here
            if lastTable.parent:
                fosterParent = lastTable.parent
                insertBefore = lastTable
            else:
                fosterParent = self.openElements[
                    self.openElements.index(lastTable) - 1]
        else:
            fosterParent = self.openElements[0]
        return fosterParent, insertBefore

    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
                name != exclude):
            self.openElements.pop()
            # XXX This is not entirely what the specification says. We should
            # investigate it more closely.
            self.generateImpliedEndTags(exclude)

    def getDocument(self):
        """Return the final tree"""
        return self.document

    def getFragment(self):
        """Return the final fragment"""
        # assert self.innerHTML
        fragment = self.fragmentClass()
        self.openElements[0].reparentChildren(fragment)
        return fragment

    def testSerializer(self, node):
        """Serialize the subtree of node in the format required by unit tests

        :arg node: the node from which to start serializing

        """
        raise NotImplementedError
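ActiveFormattingElements.append() above implements the spec's "Noah's Ark" clause: a fourth entry equal to three already-listed ones evicts the earliest match. A tiny demonstration; FakeNode is hypothetical, just enough to satisfy nodesEqual():

from bleach._vendor.html5lib.treebuilders.base import ActiveFormattingElements


class FakeNode(object):
    # Hypothetical stand-in exposing only what nodesEqual() compares.
    def __init__(self):
        self.nameTuple = ("http://www.w3.org/1999/xhtml", "b")
        self.attributes = {}


afe = ActiveFormattingElements()
for _ in range(4):
    afe.append(FakeNode())
print(len(afe))  # 3 -- the oldest of the three equal entries was dropped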
239 lib/bleach/_vendor/html5lib/treebuilders/dom.py Normal file
@@ -0,0 +1,239 @@
from __future__ import absolute_import, division, unicode_literals


try:
    from collections.abc import MutableMapping
except ImportError:  # Python 2.7
    from collections import MutableMapping
from xml.dom import minidom, Node
import weakref

from . import base
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory


def getDomBuilder(DomImplementation):
    Dom = DomImplementation

    class AttrList(MutableMapping):
        def __init__(self, element):
            self.element = element

        def __iter__(self):
            return iter(self.element.attributes.keys())

        def __setitem__(self, name, value):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                attr = self.element.ownerDocument.createAttribute(name)
                attr.value = value
                self.element.attributes[name] = attr

        def __len__(self):
            return len(self.element.attributes)

        def items(self):
            return list(self.element.attributes.items())

        def values(self):
            return list(self.element.attributes.values())

        def __getitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                return self.element.attributes[name].value

        def __delitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                del self.element.attributes[name]

    class NodeBuilder(base.Node):
        def __init__(self, element):
            base.Node.__init__(self, element.nodeName)
            self.element = element

        namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
                             self.element.namespaceURI or None)

        def appendChild(self, node):
            node.parent = self
            self.element.appendChild(node.element)

        def insertText(self, data, insertBefore=None):
            text = self.element.ownerDocument.createTextNode(data)
            if insertBefore:
                self.element.insertBefore(text, insertBefore.element)
            else:
                self.element.appendChild(text)

        def insertBefore(self, node, refNode):
            self.element.insertBefore(node.element, refNode.element)
            node.parent = self

        def removeChild(self, node):
            if node.element.parentNode == self.element:
                self.element.removeChild(node.element)
            node.parent = None

        def reparentChildren(self, newParent):
            while self.element.hasChildNodes():
                child = self.element.firstChild
                self.element.removeChild(child)
                newParent.element.appendChild(child)
            self.childNodes = []

        def getAttributes(self):
            return AttrList(self.element)

        def setAttributes(self, attributes):
            if attributes:
                for name, value in list(attributes.items()):
                    if isinstance(name, tuple):
                        if name[0] is not None:
                            qualifiedName = (name[0] + ":" + name[1])
                        else:
                            qualifiedName = name[1]
                        self.element.setAttributeNS(name[2], qualifiedName,
                                                    value)
                    else:
                        self.element.setAttribute(
                            name, value)
        attributes = property(getAttributes, setAttributes)

        def cloneNode(self):
            return NodeBuilder(self.element.cloneNode(False))

        def hasContent(self):
            return self.element.hasChildNodes()

        def getNameTuple(self):
            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name

        nameTuple = property(getNameTuple)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        def documentClass(self):
            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
            return weakref.proxy(self)

        def insertDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]

            domimpl = Dom.getDOMImplementation()
            doctype = domimpl.createDocumentType(name, publicId, systemId)
            self.document.appendChild(NodeBuilder(doctype))
            if Dom == minidom:
                doctype.ownerDocument = self.dom

        def elementClass(self, name, namespace=None):
            if namespace is None and self.defaultNamespace is None:
                node = self.dom.createElement(name)
            else:
                node = self.dom.createElementNS(namespace, name)

            return NodeBuilder(node)

        def commentClass(self, data):
            return NodeBuilder(self.dom.createComment(data))

        def fragmentClass(self):
            return NodeBuilder(self.dom.createDocumentFragment())

        def appendChild(self, node):
            self.dom.appendChild(node.element)

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            return self.dom

        def getFragment(self):
            return base.TreeBuilder.getFragment(self).element

        def insertText(self, data, parent=None):
            if parent != self:
                base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
                    # pylint:disable=protected-access
                    if Node.TEXT_NODE not in self.dom._child_node_types:
                        self.dom._child_node_types = list(self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))

        implementation = DomImplementation
        name = None

    def testSerializer(element):
        element.normalize()
        rv = []

        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif element.nodeType == Node.DOCUMENT_NODE:
                rv.append("#document")
            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
                rv.append("#document-fragment")
            elif element.nodeType == Node.COMMENT_NODE:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
            elif element.nodeType == Node.TEXT_NODE:
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
                        element.namespaceURI is not None):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
                    attributes = []
                    for i in range(len(element.attributes)):
                        attr = element.attributes.item(i)
                        name = attr.nodeName
                        value = attr.value
                        ns = attr.namespaceURI
                        if ns:
                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
                        else:
                            name = attr.nodeName
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
                serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)

    return locals()


# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)
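End to end, the dom builder produces a standard minidom document; a quick sketch (vendored import path assumed):

from bleach._vendor import html5lib

# Parse straight into xml.dom.minidom via the "dom" treebuilder.
document = html5lib.parse("<p class='x'>Hi!</p>", treebuilder="dom")
print(document.toxml())  # the full minidom API is available on the result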
343 lib/bleach/_vendor/html5lib/treebuilders/etree.py Normal file
@@ -0,0 +1,343 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

from six import text_type

import re

from copy import copy

from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class Element(base.Node):
        def __init__(self, name, namespace=None):
            self._name = name
            self._namespace = namespace
            self._element = ElementTree.Element(self._getETreeTag(name,
                                                                  namespace))
            if namespace is None:
                self.nameTuple = namespaces["html"], self._name
            else:
                self.nameTuple = self._namespace, self._name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getETreeTag(self, name, namespace):
            if namespace is None:
                etree_tag = name
            else:
                etree_tag = "{%s}%s" % (namespace, name)
            return etree_tag

        def _setName(self, name):
            self._name = name
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _setNamespace(self, namespace):
            self._namespace = namespace
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getNamespace(self):
            return self._namespace

        namespace = property(_getNamespace, _setNamespace)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            el_attrib = self._element.attrib
            el_attrib.clear()
            if attributes:
                # calling .items _always_ allocates, and the above truthy check is cheaper than the
                # allocation on average
                for key, value in attributes.items():
                    if isinstance(key, tuple):
                        name = "{%s}%s" % (key[2], key[1])
                    else:
                        name = key
                    el_attrib[name] = value

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes

        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
            for element in value:
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or len(self._element))

        def appendChild(self, node):
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self

        def removeChild(self, node):
            self._childNodes.remove(node)
            self._element.remove(node._element)
            node.parent = None

        def insertText(self, data, insertBefore=None):
            if not len(self._element):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
            elif insertBefore is None:
                # Insert the text as the tail of the last child element
                if not self._element[-1].tail:
                    self._element[-1].tail = ""
                self._element[-1].tail += data
            else:
                # Insert the text before the specified node
                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index - 1].tail:
                        self._element[index - 1].tail = ""
                    self._element[index - 1].tail += data
                else:
                    if not self._element.text:
                        self._element.text = ""
                    self._element.text += data

        def cloneNode(self):
            element = type(self)(self.name, self.namespace)
            if self._element.attrib:
                element._element.attrib = copy(self._element.attrib)
            return element

        def reparentChildren(self, newParent):
            if newParent.childNodes:
                newParent.childNodes[-1]._element.tail += self._element.text
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if self._element.text is not None:
                    newParent._element.text += self._element.text
            self._element.text = ""
            base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        def __init__(self, data):
            # Use the superclass constructor to set all properties on the
            # wrapper element
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        def __init__(self, name, publicId, systemId):
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name
            self.publicId = publicId
            self.systemId = systemId

        def _getPublicId(self):
            return self._element.get("publicId", "")

        def _setPublicId(self, value):
            if value is not None:
                self._element.set("publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get("systemId", "")

        def _setSystemId(self, value):
            if value is not None:
                self._element.set("systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_ROOT")

    class DocumentFragment(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_FRAGMENT")

    def testSerializer(element):
        rv = []

        def serializeElement(element, indent=0):
            if not hasattr(element, "tag"):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                rv.append("#document")
                if element.text is not None:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")
            elif element.tag == ElementTreeCommentType:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert isinstance(element.tag, text_type), \
                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
                nsmatch = tag_regexp.match(element.tag)

                if nsmatch is None:
                    name = element.tag
                else:
                    ns, name = nsmatch.groups()
                    prefix = constants.prefixes[ns]
                    name = "%s %s" % (prefix, name)
                rv.append("|%s<%s>" % (' ' * indent, name))

                if hasattr(element, "attrib"):
                    attributes = []
                    for name, value in element.attrib.items():
                        nsmatch = tag_regexp.match(name)
                        if nsmatch is not None:
                            ns, name = nsmatch.groups()
                            prefix = constants.prefixes[ns]
                            attr_string = "%s %s" % (prefix, name)
                        else:
                            attr_string = name
                        attributes.append((attr_string, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
        serializeElement(element, 0)

        return "\n".join(rv)

    def tostring(element):  # pylint:disable=unused-variable
        """Serialize an element and its child nodes to a string"""
        rv = []
        filter = _ihatexml.InfosetFilter()

        def serializeElement(element):
            if isinstance(element, ElementTree.ElementTree):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                if element.text is not None:
                    rv.append(element.text)
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")

                for child in element:
                    serializeElement(child)

            elif element.tag == ElementTreeCommentType:
                rv.append("<!--%s-->" % (element.text,))
            else:
                # This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>" % (filter.fromXmlName(element.tag),))
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        filter.fromXmlName(name), value)
                        for name, value in element.attrib.items()])
                    rv.append("<%s %s>" % (element.tag, attr))
                if element.text:
                    rv.append(element.text)

                for child in element:
                    serializeElement(child)

                rv.append("</%s>" % (element.tag,))

            if element.tail:
                rv.append(element.tail)

        serializeElement(element)

        return "".join(rv)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment
        implementation = ElementTreeImplementation
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
if fullTree:
|
||||||
|
return self.document._element
|
||||||
|
else:
|
||||||
|
if self.defaultNamespace is not None:
|
||||||
|
return self.document._element.find(
|
||||||
|
"{%s}html" % self.defaultNamespace)
|
||||||
|
else:
|
||||||
|
return self.document._element.find("html")
|
||||||
|
|
||||||
|
def getFragment(self):
|
||||||
|
return base.TreeBuilder.getFragment(self)._element
|
||||||
|
|
||||||
|
return locals()
|
||||||
|
|
||||||
|
|
||||||
|
getETreeModule = moduleFactoryFactory(getETreeBuilder)
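For orientation, a minimal sketch of how this tree builder is normally selected, assuming the standalone html5lib 1.1 API; in this repository the same modules are vendored under bleach._vendor.html5lib.

```python
import html5lib

# treebuilder="etree" routes through the TreeBuilder assembled by
# getETreeBuilder() above.
document = html5lib.parse("<p>Hello <b>world</b></p>", treebuilder="etree")
print(document)  # an xml.etree.ElementTree element for the parsed document
```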
392 lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py Normal file
@@ -0,0 +1,392 @@
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:

Text or comments as siblings of the root element
Doctypes with no name

When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

import warnings
import re
import sys

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml

import lxml.etree as etree
from six import PY3, binary_type


fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")

comment_type = etree.Comment("asd").tag


class DocumentType(object):
    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId


class Document(object):
    def __init__(self):
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        last = self._elementTree.getroot()
        for last in self._elementTree.getroot().itersiblings():
            pass

        last.addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)


def testSerializer(element):
    rv = []
    infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info[0] == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
    serializeElement(element, 0)

    return "\n".join(rv)


def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)


class TreeBuilder(base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
        self.namespaceHTMLElements = namespaceHTMLElements

        class Attributes(MutableMapping):
            def __init__(self, element):
                self._element = element

            def _coerceKey(self, key):
                if isinstance(key, tuple):
                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
                else:
                    name = infosetFilter.coerceAttribute(key)
                return name

            def __getitem__(self, key):
                value = self._element._element.attrib[self._coerceKey(key)]
                if not PY3 and isinstance(value, binary_type):
                    value = value.decode("ascii")
                return value

            def __setitem__(self, key, value):
                self._element._element.attrib[self._coerceKey(key)] = value

            def __delitem__(self, key):
                del self._element._element.attrib[self._coerceKey(key)]

            def __iter__(self):
                return iter(self._element._element.attrib)

            def __len__(self):
                return len(self._element._element.attrib)

            def clear(self):
                return self._element._element.attrib.clear()

        class Element(builder.Element):
            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, value):
                attributes = self.attributes
                attributes.clear()
                attributes.update(value)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def cloneNode(self):
                element = type(self)(self.name, self.namespace)
                if self._element.attrib:
                    element._element.attrib.update(self._element.attrib)
                return element

        class Comment(builder.Comment):
            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        assert parent is None or parent is self.document
        assert self.document._elementTree is None
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments:
        for comment_token in self.initial_comments:
            comment = self.commentClass(comment_token["data"])
            root.addprevious(comment._element)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain
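A minimal sketch of the data-loss behavior described in the module docstring, assuming the standalone html5lib package plus an installed lxml; a nameless doctype is one of the trees lxml cannot represent, so a DataLossWarning should be emitted:

```python
import warnings
import html5lib

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # treebuilder="lxml" selects the TreeBuilder defined above
    html5lib.parse("<!DOCTYPE><p>hi</p>", treebuilder="lxml")

# expect something like "lxml cannot represent empty doctype"
print([str(w.message) for w in caught])
```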
154 lib/bleach/_vendor/html5lib/treewalkers/__init__.py Normal file
@@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""

from __future__ import absolute_import, division, unicode_literals

from .. import constants
from .._utils import default_etree

__all__ = ["getTreeWalker", "pprint"]

treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    :arg str treeType: the name of the tree type required (case-insensitive).
        Supported values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: A module implementing the tree type e.g.
        xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
        tree type only).

    :arg kwargs: keyword arguments passed to the etree walker--for other
        walkers, this has no effect

    :returns: a TreeWalker class

    """
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType == "dom":
            from . import dom
            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
            from . import genshi
            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
            from . import etree_lxml
            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)


def concatenateCharacterTokens(tokens):
    pendingCharacters = []
    for token in tokens:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
        else:
            if pendingCharacters:
                yield {"type": "Characters", "data": "".join(pendingCharacters)}
                pendingCharacters = []
            yield token
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        type = token["type"]
        if type in ("StartTag", "EmptyTag"):
            # tag name
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items()):
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing
            if type == "EmptyTag":
                indent -= 2

        elif type == "EndTag":
            indent -= 2

        elif type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif type == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif type == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % type)

    return "\n".join(output)
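As a quick orientation to the walker API documented above, a minimal sketch assuming the standalone html5lib package; the vendored copy is imported as bleach._vendor.html5lib instead:

```python
import html5lib
from html5lib.treewalkers import getTreeWalker, pprint

# parse() defaults to the etree tree builder
dom = html5lib.parse("<p class='greeting'>Hello <em>world</em></p>")
TreeWalker = getTreeWalker("etree")  # returns a class, not an instance
# pprint() renders the StartTag/Characters/EndTag token stream, indented
print(pprint(TreeWalker(dom)))
```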
252 lib/bleach/_vendor/html5lib/treewalkers/base.py Normal file
@@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

spaceCharacters = "".join(spaceCharacters)


class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializationError because
            this tag shouldn't have children

        :returns: EmptyTag token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text(' '))
            [{u'data': ' ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        data = data
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name:

        :arg publicId:

        :arg systemId:

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types"""
        return self.error("Unknown node type: " + nodeType)


class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == ENTITY:
                yield self.entity(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)
43 lib/bleach/_vendor/html5lib/treewalkers/dom.py Normal file
@@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node

from . import base


class TreeWalker(base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                if attr.namespaceURI:
                    attrs[(attr.namespaceURI, attr.localName)] = attr.value
                else:
                    attrs[(None, attr.name)] = attr.value
            return (base.ELEMENT, node.namespaceURI, node.nodeName,
                    attrs, node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (base.DOCUMENT,)

        else:
            return base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
131 lib/bleach/_vendor/html5lib/treewalkers/etree.py Normal file
@@ -0,0 +1,131 @@
from __future__ import absolute_import, division, unicode_literals

from collections import OrderedDict
import re

from six import string_types

from . import base
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element

        2. The index of the element relative to its parent

        3. A stack of ancestor elements

        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple):  # It might be the root Element
                elt, _, _, flag = node
                if flag in ("text", "tail"):
                    return base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
                return (base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))

            elif node.tag == ElementTreeCommentType:
                return base.COMMENT, node.text

            else:
                assert isinstance(node.tag, string_types), type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                attrs = OrderedDict()
                for name, value in list(node.attrib.items()):
                    match = tag_regexp.match(name)
                    if match:
                        attrs[(match.group(1), match.group(2))] = value
                    else:
                        attrs[(None, name)] = value
                return (base.ELEMENT, namespace, tag,
                        attrs, len(node) or node.text)

        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                return None
            else:
                if element.text:
                    return element, key, parents, "text"
                elif len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None

        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                if element.tail and flag != "tail":
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key + 1], key + 1, parents, None
                else:
                    return None

        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    return parent
                else:
                    assert list(parents[-1]).count(parent) == 1
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()


getETreeModule = moduleFactoryFactory(getETreeBuilder)
215 lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py Normal file
@@ -0,0 +1,215 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from collections import OrderedDict

from lxml import etree
from ..treebuilders.etree import tag_regexp

from . import base

from .. import _ihatexml


def ensure_str(s):
    if s is None:
        return None
    elif isinstance(s, text_type):
        return s
    else:
        return s.decode("ascii", "strict")


class Root(object):
    def __init__(self, et):
        self.elementtree = et
        self.children = []

        try:
            if et.docinfo.internalDTD:
                self.children.append(Doctype(self,
                                             ensure_str(et.docinfo.root_name),
                                             ensure_str(et.docinfo.public_id),
                                             ensure_str(et.docinfo.system_url)))
        except AttributeError:
            pass

        try:
            node = et.getroot()
        except AttributeError:
            node = et

        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()

        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        return None

    def __len__(self):
        return 1


class Doctype(object):
    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name = name
        self.public_id = public_id
        self.system_id = system_id

        self.text = None
        self.tail = None

    def getnext(self):
        return self.root_node.children[1]


class FragmentRoot(Root):
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
        self.text = self.tail = None

    def getnext(self):
        return None


class FragmentWrapper(object):
    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
            self.text = ensure_str(self.obj.text)
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
            self.tail = ensure_str(self.obj.tail)
        else:
            self.tail = None

    def __getattr__(self, name):
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        else:
            return None

    def __getitem__(self, key):
        return self.obj[key]

    def __bool__(self):
        return bool(self.obj)

    def getparent(self):
        return None

    def __str__(self):
        return str(self.obj)

    def __unicode__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)


class TreeWalker(base.NonRecursiveTreeWalker):
    def __init__(self, tree):
        # pylint:disable=redefined-variable-type
        if isinstance(tree, list):
            self.fragmentChildren = set(tree)
            tree = FragmentRoot(tree)
        else:
            self.fragmentChildren = set()
            tree = Root(tree)
        base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = _ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            return base.TEXT, ensure_str(getattr(node, key))

        elif isinstance(node, Root):
            return (base.DOCUMENT,)

        elif isinstance(node, Doctype):
            return base.DOCTYPE, node.name, node.public_id, node.system_id

        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
            return base.TEXT, ensure_str(node.obj)

        elif node.tag == etree.Comment:
            return base.COMMENT, ensure_str(node.text)

        elif node.tag == etree.Entity:
            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

        else:
            # This is assumed to be an ordinary element
            match = tag_regexp.match(ensure_str(node.tag))
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = ensure_str(node.tag)
            attrs = OrderedDict()
            for name, value in list(node.attrib.items()):
                name = ensure_str(name)
                value = ensure_str(value)
                match = tag_regexp.match(name)
                if match:
                    attrs[(match.group(1), match.group(2))] = value
                else:
                    attrs[(None, name)] = value
            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), "Text nodes have no children"

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else:  # tail
                return node.getnext()

        return (node, "tail") if node.tail else node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                return node
            # else: fallback to "normal" processing
        elif node in self.fragmentChildren:
            return None

        return node.getparent()
69 lib/bleach/_vendor/html5lib/treewalkers/genshi.py Normal file
@@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT

from . import base

from ..constants import voidElements, namespaces


class TreeWalker(base.TreeWalker):
    def __iter__(self):
        # Buffer the events so we can pass in the following one
        previous = None
        for event in self.tree:
            if previous is not None:
                for token in self.tokens(previous, event):
                    yield token
            previous = event

        # Don't forget the final event!
        if previous is not None:
            for token in self.tokens(previous, None):
                yield token

    def tokens(self, event, next):
        kind, data, _ = event
        if kind == START:
            tag, attribs = data
            name = tag.localname
            namespace = tag.namespace
            converted_attribs = {}
            for k, v in attribs:
                if isinstance(k, QName):
                    converted_attribs[(k.namespace, k.localname)] = v
                else:
                    converted_attribs[(None, k)] = v

            if namespace == namespaces["html"] and name in voidElements:
                for token in self.emptyTag(namespace, name, converted_attribs,
                                           not next or next[0] != END or
                                           next[1] != tag):
                    yield token
            else:
                yield self.startTag(namespace, name, converted_attribs)

        elif kind == END:
            name = data.localname
            namespace = data.namespace
            if namespace != namespaces["html"] or name not in voidElements:
                yield self.endTag(namespace, name)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(*data)

        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
                      START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
1078 lib/bleach/_vendor/parse.py Normal file
File diff suppressed because it is too large
3 lib/bleach/_vendor/vendor.txt Normal file
@@ -0,0 +1,3 @@
html5lib==1.1 \
    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
14 lib/bleach/_vendor/vendor_install.sh Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

set -e
set -u
set -o pipefail

BLEACH_VENDOR_DIR=${BLEACH_VENDOR_DIR:-"."}
DEST=${DEST:-"."}

pip install --no-binary all --no-compile --no-deps -r "${BLEACH_VENDOR_DIR}/vendor.txt" --target "${DEST}"

# install Python 3.6.14 urllib.urlparse for #536
curl --proto '=https' --tlsv1.2 -o "${DEST}/parse.py" https://raw.githubusercontent.com/python/cpython/v3.6.14/Lib/urllib/parse.py
(cd "${DEST}" && sha256sum parse.py > parse.py.SHA256SUM)
@@ -1,20 +1,32 @@
 """A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
 
 
 def nofollow(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    href_key = (None, "href")
+
+    if href_key not in attrs:
+        return attrs
+
+    if attrs[href_key].startswith("mailto:"):
         return attrs
-    rel = [x for x in attrs.get('rel', '').split(' ') if x]
-    if 'nofollow' not in [x.lower() for x in rel]:
-        rel.append('nofollow')
-    attrs['rel'] = ' '.join(rel)
+
+    rel_key = (None, "rel")
+    rel_values = [val for val in attrs.get(rel_key, "").split(" ") if val]
+    if "nofollow" not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append("nofollow")
+    attrs[rel_key] = " ".join(rel_values)
+
     return attrs
 
 
 def target_blank(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    href_key = (None, "href")
+
+    if href_key not in attrs:
         return attrs
-    attrs['target'] = '_blank'
+
+    if attrs[href_key].startswith("mailto:"):
+        return attrs
+
+    attrs[(None, "target")] = "_blank"
     return attrs
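The rewrite above tracks the attribute model of modern bleach, where linkify callbacks receive attributes keyed by (namespace, name) tuples rather than bare strings. A minimal sketch of the new callbacks in use, assuming the public bleach API:

```python
import bleach
from bleach.callbacks import nofollow, target_blank

# linkify() runs each callback over the attribute dict of every link it
# creates or touches; nofollow adds rel="nofollow" and target_blank adds
# target="_blank" (both skip mailto: links).
html = bleach.linkify("see https://example.com",
                      callbacks=[nofollow, target_blank])
print(html)
# roughly: see <a href="https://example.com" rel="nofollow"
#               target="_blank">https://example.com</a>
```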
@@ -1,62 +0,0 @@
import datetime
from decimal import Decimal
import types
import six


def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).
    """
    return isinstance(obj, (
        six.integer_types +
        (types.NoneType,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal))
    )


def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                          errors) for arg in s])
    return s
665 lib/bleach/html5lib_shim.py Normal file
@@ -0,0 +1,665 @@
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if part:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
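# --- Editor's note (illustration; not part of the upstream diff) ---
# A quick sketch of the two entity helpers above, doctest-style:
#
#     >>> convert_entity("#38"), convert_entity("#x26"), convert_entity("amp")
#     ('&', '&', '&')
#     >>> convert_entity("notanentity") is None
#     True
#     >>> convert_entities("AT&amp;T &#169; 2021") == "AT&T \xa9 2021"
#     True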
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
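# --- Editor's note (illustration; not part of the upstream diff) ---
# How the two helpers above cooperate, doctest-style: next_possible_entity()
# splits text at every "&", and match_entity() decides whether a part names
# an unambiguous entity:
#
#     >>> list(next_possible_entity("a &amp; b & c"))
#     ['a ', '&amp; b ', '& c']
#     >>> match_entity("&amp; b")
#     'amp'
#     >>> match_entity("& c") is None
#     True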
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
574	lib/bleach/linkifier.py	Normal file
@@ -0,0 +1,574 @@
import re

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
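# --- Editor's note (illustration; not part of the upstream diff) ---
# Sketch of building a custom url regex; the tld and protocol lists here are
# hypothetical:
#
#     >>> my_url_re = build_url_re(tlds=["dev", "io"], protocols=["https"])
#     >>> bool(my_url_re.search("see https://example.dev/docs"))
#     True
#     >>> bool(my_url_re.search("see http://example.com"))
#     False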
URL_RE = build_url_re()


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_url_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()


class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
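# --- Editor's note (illustration; not part of the upstream diff) ---
# Typical use of the Linker class above, with the default nofollow callback:
#
#     >>> Linker().linkify("see example.com")
#     'see <a href="http://example.com" rel="nofollow">example.com</a>'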
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
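    # --- Editor's note (illustration; not part of the upstream diff) ---
    # Sketch of the callback contract that apply_callbacks() enforces: each
    # callback takes (attrs, is_new) and returns an adjusted attrs dict, or
    # None to drop the link. These two callbacks are hypothetical:
    #
    #     def force_title(attrs, new=False):
    #         attrs[(None, "title")] = "external link"
    #         return attrs
    #
    #     def drop_new_links(attrs, new=False):
    #         return None if new else attrs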
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            # "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
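    # --- Editor's note (illustration; not part of the upstream diff) ---
    # strip_non_url_bits() peels balanced parens and trailing punctuation off
    # an over-eager regex match, doctest-style:
    #
    #     >>> LinkifyFilter(source=iter(())).strip_non_url_bits("(http://example.com),")
    #     ('http://example.com', '(', '),')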
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens, if parse_email=True and if a mail was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
@@ -1,148 +1,645 @@
-from __future__ import unicode_literals
+from itertools import chain
 import re
-from xml.sax.saxutils import escape, unescape
+import warnings
 
-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
+from bleach._vendor.parse import urlparse
+from xml.sax.saxutils import unescape
+
+from bleach import html5lib_shim
+from bleach.utils import alphabetize_attributes
 
 
-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
+#: List of allowed tags
+ALLOWED_TAGS = [
+    "a",
+    "abbr",
+    "acronym",
+    "b",
+    "blockquote",
+    "code",
+    "em",
+    "i",
+    "li",
+    "ol",
+    "strong",
+    "ul",
+]
 
 
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
+#: Map of allowed attributes by tag
+ALLOWED_ATTRIBUTES = {
+    "a": ["href", "title"],
+    "abbr": ["title"],
+    "acronym": ["title"],
+}
 
-    allowed_svg_properties = []
+#: List of allowed styles
+ALLOWED_STYLES = []
+
+#: List of allowed protocols
+ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+
+#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
+INVISIBLE_CHARACTERS = "".join(
+    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
+)
+
+#: Regexp for characters that are invisible
+INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
+
+#: String to replace invisible characters with. This can be a character, a
+#: string, or even a function that takes a Python re matchobj
+INVISIBLE_REPLACEMENT_CHAR = "?"
+
+
+class Cleaner:
+    """Cleaner for cleaning HTML fragments of malicious content
+
+    This cleaner is a security-focused function whose sole purpose is to remove
+    malicious content from a string such that it can be displayed as content in
+    a web page.
+
+    To use::
+
+        from bleach.sanitizer import Cleaner
+
+        cleaner = Cleaner()
+
+        for text in all_the_yucky_things:
+            sanitized = cleaner.clean(text)
+
+    .. Note::
+
+       This cleaner is not designed to use to transform content to be used in
+       non-web-page contexts.
+
+    .. Warning::
+
+       This cleaner is not thread-safe--the html parser has internal state.
+       Create a separate cleaner per thread!
+
+
+    """
+
+    def __init__(
+        self,
+        tags=ALLOWED_TAGS,
+        attributes=ALLOWED_ATTRIBUTES,
+        styles=ALLOWED_STYLES,
+        protocols=ALLOWED_PROTOCOLS,
+        strip=False,
+        strip_comments=True,
+        filters=None,
+    ):
+        """Initializes a Cleaner
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip: whether or not to strip disallowed elements
+
+        :arg bool strip_comments: whether or not to strip HTML comments
+
+        :arg list filters: list of html5lib Filter classes to pass streamed content through
+
+            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+            .. Warning::
+
+               Using filters changes the output of ``bleach.Cleaner.clean``.
+               Make sure the way the filters change the output are secure.
+
+        """
+        self.tags = tags
+        self.attributes = attributes
+        self.styles = styles
+        self.protocols = protocols
+        self.strip = strip
+        self.strip_comments = strip_comments
+        self.filters = filters or []
+
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=self.tags,
+            strip=self.strip,
+            consume_entities=False,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            escape_lt_in_attrs=True,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
+            # Bleach has its own sanitizer, so don't use the html5lib one
+            sanitize=False,
+            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            alphabetical_attributes=False,
+        )
+
+    def clean(self, text):
+        """Cleans text and returns sanitized result as unicode
+
+        :arg str text: text to be cleaned
+
+        :returns: sanitized text as unicode
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, str):
+            message = (
+                "argument cannot be of '{name}' type, must be of text type".format(
+                    name=text.__class__.__name__
+                )
+            )
+            raise TypeError(message)
+
+        if not text:
+            return ""
+
+        dom = self.parser.parseFragment(text)
+        filtered = BleachSanitizerFilter(
+            source=self.walker(dom),
+            # Bleach-sanitizer-specific things
+            attributes=self.attributes,
+            strip_disallowed_elements=self.strip,
+            strip_html_comments=self.strip_comments,
+            # html5lib-sanitizer things
+            allowed_elements=self.tags,
+            allowed_css_properties=self.styles,
+            allowed_protocols=self.protocols,
+            allowed_svg_properties=[],
+        )
+
+        # Apply any filters after the BleachSanitizerFilter
+        for filter_class in self.filters:
+            filtered = filter_class(source=filtered)
+
+        return self.serializer.render(filtered)
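# --- Editor's note (illustration; not part of the upstream diff) ---
# Typical use of the Cleaner added above, with the default allow lists:
#
#     >>> cleaner = Cleaner()
#     >>> cleaner.clean('<script>alert("xss")</script><b>bold</b>')
#     '&lt;script&gt;alert("xss")&lt;/script&gt;<b>bold</b>'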
+
+
+def attribute_filter_factory(attributes):
+    """Generates attribute filter function for the given attributes value
+
+    The attributes value can take one of several shapes. This returns a filter
+    function appropriate to the attributes value. One nice thing about this is
+    that there's less if/then shenanigans in the ``allow_token`` method.
+
+    """
+    if callable(attributes):
+        return attributes
+
+    if isinstance(attributes, dict):
+
+        def _attr_filter(tag, attr, value):
+            if tag in attributes:
+                attr_val = attributes[tag]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                if attr in attr_val:
+                    return True
+
+            if "*" in attributes:
+                attr_val = attributes["*"]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                return attr in attr_val
+
+            return False
+
+        return _attr_filter
+
+    if isinstance(attributes, list):
+
+        def _attr_filter(tag, attr, value):
+            return attr in attributes
+
+        return _attr_filter
+
+    raise ValueError("attributes needs to be a callable, a list or a dict")
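# --- Editor's note (illustration; not part of the upstream diff) ---
# The factory above normalizes the three accepted attribute shapes into one
# predicate, doctest-style:
#
#     >>> allow = attribute_filter_factory({"a": ["href"], "*": ["title"]})
#     >>> allow("a", "href", "http://example.com")
#     True
#     >>> allow("a", "title", "x"), allow("img", "src", "x")
#     (True, False)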
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
|
||||||
|
"""html5lib Filter that sanitizes text
|
||||||
|
|
||||||
|
This filter can be used anywhere html5lib filters can be used.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
source,
|
||||||
|
attributes=ALLOWED_ATTRIBUTES,
|
||||||
|
strip_disallowed_elements=False,
|
||||||
|
strip_html_comments=True,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
"""Creates a BleachSanitizerFilter instance
|
||||||
|
|
||||||
|
:arg Treewalker source: stream
|
||||||
|
|
||||||
|
:arg list tags: allowed list of tags; defaults to
|
||||||
|
``bleach.sanitizer.ALLOWED_TAGS``
|
||||||
|
|
||||||
|
:arg dict attributes: allowed attributes; can be a callable, list or dict;
|
||||||
|
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
|
||||||
|
|
||||||
|
:arg list styles: allowed list of css styles; defaults to
|
||||||
|
``bleach.sanitizer.ALLOWED_STYLES``
|
||||||
|
|
||||||
|
:arg list protocols: allowed list of protocols for links; defaults
|
||||||
|
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
|
||||||
|
|
||||||
|
:arg bool strip_disallowed_elements: whether or not to strip disallowed
|
||||||
|
elements
|
||||||
|
|
||||||
|
:arg bool strip_html_comments: whether or not to strip HTML comments
|
||||||
|
|
||||||
|
"""
|
||||||
|
self.attr_filter = attribute_filter_factory(attributes)
|
||||||
|
self.strip_disallowed_elements = strip_disallowed_elements
|
||||||
|
self.strip_html_comments = strip_html_comments
|
||||||
|
|
||||||
|
# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore",
|
||||||
|
message="html5lib's sanitizer is deprecated",
|
||||||
|
category=DeprecationWarning,
|
||||||
|
module="bleach._vendor.html5lib",
|
||||||
|
)
|
||||||
|
return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
|
||||||
|
|
||||||
|
def sanitize_stream(self, token_iterator):
|
||||||
|
for token in token_iterator:
|
||||||
|
ret = self.sanitize_token(token)
|
||||||
|
|
||||||
|
if not ret:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if isinstance(ret, list):
|
||||||
|
for subtoken in ret:
|
||||||
|
yield subtoken
|
||||||
|
else:
|
||||||
|
yield ret
|
||||||
|
|
||||||
|
def merge_characters(self, token_iterator):
|
||||||
|
"""Merge consecutive Characters tokens in a stream"""
|
||||||
|
characters_buffer = []
|
||||||
|
|
||||||
|
for token in token_iterator:
|
||||||
|
if characters_buffer:
|
||||||
|
if token["type"] == "Characters":
|
||||||
|
characters_buffer.append(token)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Merge all the characters tokens together into one and then
|
||||||
|
# operate on it.
|
||||||
|
new_token = {
|
||||||
|
"data": "".join(
|
||||||
|
[char_token["data"] for char_token in characters_buffer]
|
||||||
|
),
|
||||||
|
"type": "Characters",
|
||||||
|
}
|
||||||
|
characters_buffer = []
|
||||||
|
yield new_token
|
||||||
|
|
||||||
|
elif token["type"] == "Characters":
|
||||||
|
characters_buffer.append(token)
|
||||||
|
continue
|
||||||
|
|
||||||
|
yield token
|
||||||
|
|
||||||
|
new_token = {
|
||||||
|
"data": "".join([char_token["data"] for char_token in characters_buffer]),
|
||||||
|
"type": "Characters",
|
||||||
|
}
|
||||||
|
yield new_token
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self.merge_characters(
|
||||||
|
self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
|
||||||
|
)
|
||||||
|
|
||||||
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, the attribute name,
        and the attribute value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # call xml.sax.saxutils to escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token
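    # A minimal illustrative sketch (not part of bleach): an allowed_attributes
    # dict of the shape described in the docstring above, mixing a list and a
    # callable. The allow_safe_href function is hypothetical.
    #
    #     >>> def allow_safe_href(tag, name, value):
    #     ...     return name == "href" and not value.startswith("javascript:")
    #     >>> attributes = {"a": allow_safe_href, "abbr": ["title"]}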
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens
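    # A minimal illustrative sketch (not part of bleach) of the re-tokenizing
    # described above: for a Characters token with data "x &amp; y &copy;",
    # sanitize_characters returns roughly
    #
    #     [{"type": "Characters", "data": "x "},
    #      {"type": "Characters", "data": "&"},
    #      {"type": "Characters", "data": " y "},
    #      {"type": "Entity", "name": "copy"}]
    #
    # with &amp; special-cased to a bare "&" as the comment above explains.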
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)

        # Remove REPLACEMENT characters
        new_value = new_value.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if "http" in allowed_protocols:
                return value

        return None
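    # A minimal illustrative sketch (not part of bleach), assuming the default
    # allowed protocols ["http", "https", "mailto"]:
    #
    #     sanitize_uri_value("https://example.com", ["http", "https", "mailto"])
    #         -> "https://example.com"
    #     sanitize_uri_value("jav&#x09;ascript:alert(1)", ["http", "https", "mailto"])
    #         -> None  (entities are converted and control characters stripped
    #                   first, so the scheme resolves to "javascript", which
    #                   isn't in the allowed list)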
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = alphabetize_attributes(attrs)

        return token
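    # A minimal illustrative sketch (not part of this module): allow_token is
    # what ultimately filters attributes when cleaning through the public
    # Cleaner API, e.g.
    #
    #     >>> from bleach.sanitizer import Cleaner
    #     >>> cleaner = Cleaner(tags=["a"], attributes={"a": ["href"]})
    #     >>> cleaner.clean('<a href="http://example.com" onclick="evil()">x</a>')
    #     '<a href="http://example.com">x</a>'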
    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
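    # A minimal illustrative sketch (not part of this module): with the default
    # strip=False, a disallowed tag is re-emitted by disallowed_token as escaped
    # text instead of being dropped, e.g.
    #
    #     >>> import bleach
    #     >>> bleach.clean("<script>alert(1)</script>", tags=["b"])
    #     '&lt;script&gt;alert(1)&lt;/script&gt;'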
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[/:,#%!.\s\w]       # a non-newline character
|\w-\w              # 3 characters in the form \w-\w
|'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+"          # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, ...
)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)
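    # A minimal illustrative sketch (not part of bleach), assuming
    # allowed_css_properties includes only "color": url() values are dropped
    # up front and disallowed properties are filtered out, so
    #
    #     "color: red; background: url(evil.png); font-size: 20px"
    #
    # sanitizes to "color: red;".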
lib/bleach/utils.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from collections import OrderedDict


def _attr_key(attr):
    """Returns appropriate key for sorting attribute names

    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
    ``None`` or a string. These can't be compared in Python 3, so we convert the
    ``None`` to an empty string.

    """
    key = (attr[0][0] or ""), attr[0][1]
    return key


def alphabetize_attributes(attrs):
    """Takes a dict of attributes (or None) and returns them alphabetized"""
    if not attrs:
        return attrs

    return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])
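# A minimal illustrative sketch (not part of bleach): attributes are keyed by
# (namespace, name) tuples, and None namespaces sort as empty strings, so
#
#     >>> alphabetize_attributes({(None, "title"): "t", (None, "href"): "h"})
#     OrderedDict([((None, 'href'), 'h'), ((None, 'title'), 't')])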