diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py
new file mode 100644
index 00000000..aec2d340
--- /dev/null
+++ b/lib/bleach/__init__.py
@@ -0,0 +1,401 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+import logging
+import re
+
+import html5lib
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer.htmlserializer import HTMLSerializer
+
+from . import callbacks as linkify_callbacks
+from .encoding import force_unicode
+from .sanitizer import BleachSanitizer
+
+
+VERSION = (1, 4, 2)
+__version__ = '.'.join([str(n) for n in VERSION])
+
+__all__ = ['clean', 'linkify']
+
+log = logging.getLogger('bleach')
+
+ALLOWED_TAGS = [
+ 'a',
+ 'abbr',
+ 'acronym',
+ 'b',
+ 'blockquote',
+ 'code',
+ 'em',
+ 'i',
+ 'li',
+ 'ol',
+ 'strong',
+ 'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+ 'a': ['href', 'title'],
+ 'abbr': ['title'],
+ 'acronym': ['title'],
+}
+
+ALLOWED_STYLES = []
+
+ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+ ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+ cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+ dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+ gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+ im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+ kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+ ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+ net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+ pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+ sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+ tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+ xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+PROTOCOLS = HTMLSanitizer.acceptable_protocols
+
+url_re = re.compile(
+    r"""\(*  # Match any opening parentheses.
+    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+        # /path/zz (excluding "unsafe" chars from RFC 1738,
+        # except for # and ~, which happen in practice)
+    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
+    re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
+
+punct_re = re.compile(r'([\.,]+)$')
+
+email_re = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
+    """.format('|'.join(TLDS)),
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+                    if attrs is None:
+                        # <a> tag replaced by the text within it
+ adj = replace_nodes(tree, _text, node,
+ current_child)
+ current_child -= 1
+ # pull back current_child by 1 to scan the
+ # new nodes again.
+ else:
+ text = force_unicode(attrs.pop('_text'))
+ for attr_key, attr_val in attrs.items():
+ node.set(attr_key, attr_val)
+
+ for n in reversed(list(node)):
+ node.remove(n)
+ text = parser.parseFragment(text)
+ node.text = text.text
+ for n in text:
+ node.append(n)
+ _seen.add(node)
+
+ elif current_child >= 0:
+ if node.tag == ETREE_TAG('pre') and skip_pre:
+ linkify_nodes(node, False)
+ elif not (node in _seen):
+ linkify_nodes(node, True)
+
+ current_child += 1
+
+ def email_repl(match):
+        addr = match.group(0).replace('"', '&quot;')
+ link = {
+ '_text': addr,
+ 'href': 'mailto:{0!s}'.format(addr),
+ }
+ link = apply_callbacks(link, True)
+
+ if link is None:
+ return addr
+
+ _href = link.pop('href')
+ _text = link.pop('_text')
+
+        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+ return repl.format(_href, attribs, _text)
+
+ def link_repl(match):
+ url = match.group(0)
+ open_brackets = close_brackets = 0
+ if url.startswith('('):
+ _wrapping = strip_wrapping_parentheses(url)
+ url, open_brackets, close_brackets = _wrapping
+ end = ''
+ m = re.search(punct_re, url)
+ if m:
+ end = m.group(0)
+ url = url[0:m.start()]
+ if re.search(proto_re, url):
+ href = url
+ else:
+ href = ''.join(['http://', url])
+
+ link = {
+ '_text': url,
+ 'href': href,
+ }
+
+ link = apply_callbacks(link, True)
+
+ if link is None:
+ return '(' * open_brackets + url + ')' * close_brackets
+
+ _text = link.pop('_text')
+ _href = link.pop('href')
+
+        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
+ attr = '{0!s}="{1!s}"'
+ attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+
+ return repl.format('(' * open_brackets,
+ _href, attribs, _text, end,
+ ')' * close_brackets)
+
+ try:
+ linkify_nodes(forest)
+ except RuntimeError as e:
+ # If we hit the max recursion depth, just return what we've got.
+ log.exception('Probable recursion error: {0!r}'.format(e))
+
+ return _render(forest)
+
+
+def _render(tree):
+ """Try rendering as HTML, then XML, then give up."""
+ return force_unicode(_serialize(tree))
+
+
+def _serialize(domtree):
+ walker = html5lib.treewalkers.getTreeWalker('etree')
+ stream = walker(domtree)
+ serializer = HTMLSerializer(quote_attr_values=True,
+ alphabetical_attributes=True,
+ omit_optional_tags=False)
+ return serializer.render(stream)
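
For reference, the two entry points this module exports behave roughly as
sketched below. The expected outputs are approximations, assuming the
defaults defined above (ALLOWED_TAGS for clean(), and linkify() running the
bundled nofollow callback):

    import bleach

    # Disallowed tags are escaped rather than dropped by default.
    bleach.clean('<b>bold</b> <script>evil()</script>')
    # -> u'<b>bold</b> &lt;script&gt;evil()&lt;/script&gt;'

    # Bare domains matched by url_re are linked, with http:// prepended
    # by link_repl when no protocol is present.
    bleach.linkify('see example.com for details')
    # -> u'see <a href="http://example.com" rel="nofollow">example.com</a>
    #     for details'
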
diff --git a/lib/bleach/callbacks.py b/lib/bleach/callbacks.py
new file mode 100644
index 00000000..3cb82c25
--- /dev/null
+++ b/lib/bleach/callbacks.py
@@ -0,0 +1,20 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+ if attrs['href'].startswith('mailto:'):
+ return attrs
+ rel = [x for x in attrs.get('rel', '').split(' ') if x]
+ if 'nofollow' not in [x.lower() for x in rel]:
+ rel.append('nofollow')
+ attrs['rel'] = ' '.join(rel)
+
+ return attrs
+
+
+def target_blank(attrs, new=False):
+ if attrs['href'].startswith('mailto:'):
+ return attrs
+ attrs['target'] = '_blank'
+ return attrs
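
linkify() runs these callbacks once per link via apply_callbacks(): a
callback may return None to veto the link entirely, or return the
(possibly modified) attrs dict to keep it. A sketch of combining the
bundled callbacks with a custom veto (internal_only and its host check are
illustrative, not part of the library):

    import bleach
    from bleach.callbacks import nofollow, target_blank

    def internal_only(attrs, new=False):
        # Veto newly created links that point off-site.
        if new and not attrs['href'].startswith('https://example.org'):
            return None
        return attrs

    bleach.linkify(text, callbacks=[internal_only, nofollow, target_blank])
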
diff --git a/lib/bleach/encoding.py b/lib/bleach/encoding.py
new file mode 100644
index 00000000..707adaa2
--- /dev/null
+++ b/lib/bleach/encoding.py
@@ -0,0 +1,62 @@
+import datetime
+from decimal import Decimal
+import types
+import six
+
+
+def is_protected_type(obj):
+ """Determine if the object instance is of a protected type.
+
+ Objects of protected types are preserved as-is when passed to
+ force_unicode(strings_only=True).
+ """
+ return isinstance(obj, (
+ six.integer_types +
+        (type(None),
+ datetime.datetime, datetime.date, datetime.time,
+ float, Decimal))
+ )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_text, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ # Handle the common case first, saves 30-40% when s is an instance of
+ # six.text_type. This function gets called often in that setting.
+ if isinstance(s, six.text_type):
+ return s
+ if strings_only and is_protected_type(s):
+ return s
+ try:
+ if not isinstance(s, six.string_types):
+ if hasattr(s, '__unicode__'):
+ s = s.__unicode__()
+ else:
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ else:
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s,
+ # encoding, errors), so that if s is a SafeBytes, it ends up being
+ # a SafeText at the end.
+ s = s.decode(encoding, errors)
+ except UnicodeDecodeError as e:
+ if not isinstance(s, Exception):
+ raise UnicodeDecodeError(*e.args)
+ else:
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII bytestring data without a
+ # working unicode method. Try to handle this without raising a
+ # further exception by individually forcing the exception args
+ # to unicode.
+ s = ' '.join([force_unicode(arg, encoding, strings_only,
+ errors) for arg in s])
+ return s
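
force_unicode() is the familiar Django helper: text passes through
unchanged, bytes are decoded, and other objects are coerced unless
strings_only asks for protected types to be preserved. Roughly:

    from bleach.encoding import force_unicode

    force_unicode(b'caf\xc3\xa9')           # -> u'café' (decoded as UTF-8)
    force_unicode(42)                       # -> u'42'
    force_unicode(42, strings_only=True)    # -> 42, protected type preserved
    force_unicode(None, strings_only=True)  # -> None
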
diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py
new file mode 100644
index 00000000..8bdca752
--- /dev/null
+++ b/lib/bleach/sanitizer.py
@@ -0,0 +1,148 @@
+from __future__ import unicode_literals
+import re
+from xml.sax.saxutils import escape, unescape
+
+from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
+
+
+PROTOS = HTMLSanitizerMixin.acceptable_protocols
+PROTOS.remove('feed')
+
+
+class BleachSanitizerMixin(HTMLSanitizerMixin):
+ """Mixin to replace sanitize_token() and sanitize_css()."""
+
+ allowed_svg_properties = []
+
+ def sanitize_token(self, token):
+ """Sanitize a token either by HTML-encoding or dropping.
+
+ Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
+ a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
+
+        Here callable is a function that takes two arguments, attribute
+        name and value, and returns True or False.
+
+ Also gives the option to strip tags instead of encoding.
+
+ """
+ if (getattr(self, 'wildcard_attributes', None) is None and
+ isinstance(self.allowed_attributes, dict)):
+ self.wildcard_attributes = self.allowed_attributes.get('*', [])
+
+ if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
+ tokenTypes['EmptyTag']):
+ if token['name'] in self.allowed_elements:
+ if 'data' in token:
+ if isinstance(self.allowed_attributes, dict):
+ allowed_attributes = self.allowed_attributes.get(
+ token['name'], [])
+ if not callable(allowed_attributes):
+ allowed_attributes += self.wildcard_attributes
+ else:
+ allowed_attributes = self.allowed_attributes
+ attrs = dict([(name, val) for name, val in
+ token['data'][::-1]
+ if (allowed_attributes(name, val)
+ if callable(allowed_attributes)
+ else name in allowed_attributes)])
+ for attr in self.attr_val_is_uri:
+ if attr not in attrs:
+ continue
+ val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+ unescape(attrs[attr])).lower()
+ # Remove replacement characters from unescaped
+ # characters.
+ val_unescaped = val_unescaped.replace("\ufffd", "")
+ if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
+ and (val_unescaped.split(':')[0] not in
+ self.allowed_protocols)):
+ del attrs[attr]
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ if (token['name'] in self.svg_allow_local_href and
+ 'xlink:href' in attrs and
+ re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+ del attrs['xlink:href']
+ if 'style' in attrs:
+ attrs['style'] = self.sanitize_css(attrs['style'])
+ token['data'] = [(name, val) for name, val in
+ attrs.items()]
+ return token
+ elif self.strip_disallowed_elements:
+ pass
+ else:
+ if token['type'] == tokenTypes['EndTag']:
+                    token['data'] = '</{0!s}>'.format(token['name'])
+ elif token['data']:
+ attr = ' {0!s}="{1!s}"'
+ attrs = ''.join([attr.format(k, escape(v)) for k, v in
+ token['data']])
+ token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
+ else:
+ token['data'] = '<{0!s}>'.format(token['name'])
+ if token['selfClosing']:
+ token['data'] = token['data'][:-1] + '/>'
+ token['type'] = tokenTypes['Characters']
+ del token["name"]
+ return token
+ elif token['type'] == tokenTypes['Comment']:
+ if not self.strip_html_comments:
+ return token
+ else:
+ return token
+
+ def sanitize_css(self, style):
+ """HTMLSanitizerMixin.sanitize_css replacement.
+
+ HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+        border-*, margin-*, and padding-*. Here, only properties named in
+        the configured whitelist are allowed through.
+
+ """
+ # disallow urls
+ style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+ # gauntlet
+ # TODO: Make sure this does what it's meant to - I *think* it wants to
+ # validate style attribute contents.
+ parts = style.split(';')
+ gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
+ """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+ for part in parts:
+ if not gauntlet.match(part):
+ return ''
+
+ if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+ return ''
+
+ clean = []
+ for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+ if not value:
+ continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
+ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+ lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+ lowercaseElementName, lowercaseAttrName,
+ **kwargs)
+
+ def __iter__(self):
+ for token in HTMLTokenizer.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
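
The dict/callable form of allowed_attributes documented in sanitize_token()
reaches this sanitizer through bleach.clean(). A sketch of the callable
form (img_attrs and the sample markup are illustrative only):

    import bleach

    def img_attrs(name, value):
        # Called per attribute with (name, value); return True to keep it.
        return (name in ('src', 'alt')
                and not value.lower().startswith('javascript:'))

    # Note: the '*' wildcard list is skipped for tags whose filter is
    # callable (see sanitize_token above).
    bleach.clean('<img src="/logo.png" onerror="evil()">',
                 tags=['img'],
                 attributes={'img': img_attrs, '*': ['title']})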