From 453c46df0038aaf221db2d3238f864d43c0d3ee0 Mon Sep 17 00:00:00 2001
From: JonnyWong16
Date: Wed, 4 May 2016 17:45:48 -0700
Subject: [PATCH] Add bleach library to clean notification HTML

---
 lib/bleach/__init__.py  | 401 ++++++++++++++++++++++++++++++++++++++++
 lib/bleach/callbacks.py |  20 ++
 lib/bleach/encoding.py  |  62 +++++++
 lib/bleach/sanitizer.py | 148 +++++++++++++
 4 files changed, 631 insertions(+)
 create mode 100644 lib/bleach/__init__.py
 create mode 100644 lib/bleach/callbacks.py
 create mode 100644 lib/bleach/encoding.py
 create mode 100644 lib/bleach/sanitizer.py
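
The notification call sites are not part of this patch; as a minimal
sketch, the intended use is to run untrusted notification text through
clean() roughly like this (assuming the vendored package is importable
as "bleach"; output written by hand, not captured from a test run):

    import bleach

    # With the default whitelist, disallowed tags are escaped rather
    # than stripped (strip=False), so the markup is neutralized but
    # still visible in the notification body.
    dirty = '<b>Now Playing</b> <script>alert("xss")</script>'
    bleach.clean(dirty)
    # u'<b>Now Playing</b> &lt;script&gt;alert("xss")&lt;/script&gt;'
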
diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py
new file mode 100644
index 00000000..aec2d340
--- /dev/null
+++ b/lib/bleach/__init__.py
@@ -0,0 +1,401 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+import logging
+import re
+
+import html5lib
+from html5lib.sanitizer import HTMLSanitizer
+from html5lib.serializer.htmlserializer import HTMLSerializer
+
+from . import callbacks as linkify_callbacks
+from .encoding import force_unicode
+from .sanitizer import BleachSanitizer
+
+
+VERSION = (1, 4, 2)
+__version__ = '.'.join([str(n) for n in VERSION])
+
+__all__ = ['clean', 'linkify']
+
+log = logging.getLogger('bleach')
+
+ALLOWED_TAGS = [
+    'a',
+    'abbr',
+    'acronym',
+    'b',
+    'blockquote',
+    'code',
+    'em',
+    'i',
+    'li',
+    'ol',
+    'strong',
+    'ul',
+]
+
+ALLOWED_ATTRIBUTES = {
+    'a': ['href', 'title'],
+    'abbr': ['title'],
+    'acronym': ['title'],
+}
+
+ALLOWED_STYLES = []
+
+ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+       xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+PROTOCOLS = HTMLSanitizer.acceptable_protocols
+
+url_re = re.compile(
+    r"""\(*  # Match any opening parentheses.
+    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+        # /path/zz (excluding "unsafe" chars from RFC 1738,
+        # except for # and ~, which happen in practice)
+    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
+    re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
+
+punct_re = re.compile(r'([\.,]+)$')
+
+email_re = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
+        (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
+    """.format('|'.join(TLDS)),
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
+# a simple routine that returns the tag name with the namespace prefix
+# as returned by etree's Element
+
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+          strip_comments=True):
+    """Clean an HTML fragment and return it."""
+    if not text:
+        return ''
+
+    text = force_unicode(text)
+
+    class s(BleachSanitizer):
+        allowed_elements = tags
+        allowed_attributes = attributes
+        allowed_css_properties = styles
+        allowed_protocols = protocols
+        strip_disallowed_elements = strip
+        strip_html_comments = strip_comments
+
+    parser = html5lib.HTMLParser(tokenizer=s)
+
+    return _render(parser.parseFragment(text))
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
+            parse_email=False, tokenizer=HTMLSanitizer):
+    """Convert URL-like strings in an HTML fragment to links."""
+    text = force_unicode(text)
+
+    if not text:
+        return ''
+
+    parser = html5lib.HTMLParser(tokenizer=tokenizer)
+
+    forest = parser.parseFragment(text)
+    _seen = set([])
+
+    def apply_callbacks(link, is_new):
+        for cb in callbacks:
+            link = cb(link, is_new)
+            if link is None:
+                return None
+        return link
+
+    def linkify_nodes(tree, parse_text=True):
+        children = len(tree)
+        current_child = -1
+        # start at -1 to process the parent first
+        while current_child < children:
+            if current_child < 0:
+                node = tree
+            else:
+                node = tree[current_child]
+
+            if node.tag == ETREE_TAG('a') and 'href' in node.attrib:
+                attrs = dict(node.attrib)
+                _text = attrs['_text'] = node.text or ''
+                attrs = apply_callbacks(attrs, False)
+
+                if attrs is None:
+                    # <a> tag replaced by the text within it
+                    adj = replace_nodes(tree, _text, node,
+                                        current_child)
+                    current_child -= 1
+                    # pull back current_child by 1 to scan the
+                    # new nodes again.
+                else:
+                    text = force_unicode(attrs.pop('_text'))
+                    for attr_key, attr_val in attrs.items():
+                        node.set(attr_key, attr_val)
+
+                    for n in reversed(list(node)):
+                        node.remove(n)
+                    text = parser.parseFragment(text)
+                    node.text = text.text
+                    for n in text:
+                        node.append(n)
+                    _seen.add(node)
+
+            elif current_child >= 0:
+                if node.tag == ETREE_TAG('pre') and skip_pre:
+                    linkify_nodes(node, False)
+                elif not (node in _seen):
+                    linkify_nodes(node, True)
+
+            current_child += 1
+
+    def email_repl(match):
+        addr = match.group(0).replace('"', '&quot;')
+        link = {
+            '_text': addr,
+            'href': 'mailto:{0!s}'.format(addr),
+        }
+        link = apply_callbacks(link, True)
+
+        if link is None:
+            return addr
+
+        _href = link.pop('href')
+        _text = link.pop('_text')
+
+        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
+        attr = '{0!s}="{1!s}"'
+        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+        return repl.format(_href, attribs, _text)
+
+    def link_repl(match):
+        url = match.group(0)
+        open_brackets = close_brackets = 0
+        if url.startswith('('):
+            _wrapping = strip_wrapping_parentheses(url)
+            url, open_brackets, close_brackets = _wrapping
+        end = ''
+        m = re.search(punct_re, url)
+        if m:
+            end = m.group(0)
+            url = url[0:m.start()]
+        if re.search(proto_re, url):
+            href = url
+        else:
+            href = ''.join(['http://', url])
+
+        link = {
+            '_text': url,
+            'href': href,
+        }
+
+        link = apply_callbacks(link, True)
+
+        if link is None:
+            return '(' * open_brackets + url + ')' * close_brackets
+
+        _text = link.pop('_text')
+        _href = link.pop('href')
+
+        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
+        attr = '{0!s}="{1!s}"'
+        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
+
+        return repl.format('(' * open_brackets,
+                           _href, attribs, _text, end,
+                           ')' * close_brackets)
+
+    try:
+        linkify_nodes(forest)
+    except RuntimeError as e:
+        # If we hit the max recursion depth, just return what we've got.
+        log.exception('Probable recursion error: {0!r}'.format(e))
+
+    return _render(forest)
+
+
+def _render(tree):
+    """Try rendering as HTML, then XML, then give up."""
+    return force_unicode(_serialize(tree))
+
+
+def _serialize(domtree):
+    walker = html5lib.treewalkers.getTreeWalker('etree')
+    stream = walker(domtree)
+    serializer = HTMLSerializer(quote_attr_values=True,
+                                alphabetical_attributes=True,
+                                omit_optional_tags=False)
+    return serializer.render(stream)
diff --git a/lib/bleach/callbacks.py b/lib/bleach/callbacks.py
new file mode 100644
index 00000000..3cb82c25
--- /dev/null
+++ b/lib/bleach/callbacks.py
@@ -0,0 +1,20 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+    if attrs['href'].startswith('mailto:'):
+        return attrs
+    rel = [x for x in attrs.get('rel', '').split(' ') if x]
+    if 'nofollow' not in [x.lower() for x in rel]:
+        rel.append('nofollow')
+    attrs['rel'] = ' '.join(rel)
+
+    return attrs
+
+
+def target_blank(attrs, new=False):
+    if attrs['href'].startswith('mailto:'):
+        return attrs
+    attrs['target'] = '_blank'
+    return attrs
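
The two callbacks above are meant to be passed to linkify(). A short
sketch of how they combine (the alphabetical attribute order in the
output comes from the serializer settings in _serialize(); output
written by hand, not captured from a test run):

    import bleach
    from bleach.callbacks import nofollow, target_blank

    bleach.linkify('See example.com/status',
                   callbacks=[nofollow, target_blank])
    # u'See <a href="http://example.com/status" rel="nofollow"
    #   target="_blank">example.com/status</a>'
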
+ """ + return isinstance(obj, ( + six.integer_types + + (types.NoneType, + datetime.datetime, datetime.date, datetime.time, + float, Decimal)) + ) + + +def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_text, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + # Handle the common case first, saves 30-40% when s is an instance of + # six.text_type. This function gets called often in that setting. + if isinstance(s, six.text_type): + return s + if strings_only and is_protected_type(s): + return s + try: + if not isinstance(s, six.string_types): + if hasattr(s, '__unicode__'): + s = s.__unicode__() + else: + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + # Note: We use .decode() here, instead of six.text_type(s, + # encoding, errors), so that if s is a SafeBytes, it ends up being + # a SafeText at the end. + s = s.decode(encoding, errors) + except UnicodeDecodeError as e: + if not isinstance(s, Exception): + raise UnicodeDecodeError(*e.args) + else: + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII bytestring data without a + # working unicode method. Try to handle this without raising a + # further exception by individually forcing the exception args + # to unicode. + s = ' '.join([force_unicode(arg, encoding, strings_only, + errors) for arg in s]) + return s diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py new file mode 100644 index 00000000..8bdca752 --- /dev/null +++ b/lib/bleach/sanitizer.py @@ -0,0 +1,148 @@ +from __future__ import unicode_literals +import re +from xml.sax.saxutils import escape, unescape + +from html5lib.constants import tokenTypes +from html5lib.sanitizer import HTMLSanitizerMixin +from html5lib.tokenizer import HTMLTokenizer + + +PROTOS = HTMLSanitizerMixin.acceptable_protocols +PROTOS.remove('feed') + + +class BleachSanitizerMixin(HTMLSanitizerMixin): + """Mixin to replace sanitize_token() and sanitize_css().""" + + allowed_svg_properties = [] + + def sanitize_token(self, token): + """Sanitize a token either by HTML-encoding or dropping. + + Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be + a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}. + + Here callable is a function with two arguments of attribute name + and value. It should return true of false. + + Also gives the option to strip tags instead of encoding. 
+ + """ + if (getattr(self, 'wildcard_attributes', None) is None and + isinstance(self.allowed_attributes, dict)): + self.wildcard_attributes = self.allowed_attributes.get('*', []) + + if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], + tokenTypes['EmptyTag']): + if token['name'] in self.allowed_elements: + if 'data' in token: + if isinstance(self.allowed_attributes, dict): + allowed_attributes = self.allowed_attributes.get( + token['name'], []) + print callable(allowed_attributes) + if not callable(allowed_attributes): + allowed_attributes += self.wildcard_attributes + else: + allowed_attributes = self.allowed_attributes + attrs = dict([(name, val) for name, val in + token['data'][::-1] + if (allowed_attributes(name, val) + if callable(allowed_attributes) + else name in allowed_attributes)]) + for attr in self.attr_val_is_uri: + if attr not in attrs: + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + # Remove replacement characters from unescaped + # characters. + val_unescaped = val_unescaped.replace("\ufffd", "") + if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) + and (val_unescaped.split(':')[0] not in + self.allowed_protocols)): + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + if (token['name'] in self.svg_allow_local_href and + 'xlink:href' in attrs and + re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): + del attrs['xlink:href'] + if 'style' in attrs: + attrs['style'] = self.sanitize_css(attrs['style']) + token['data'] = [(name, val) for name, val in + attrs.items()] + return token + elif self.strip_disallowed_elements: + pass + else: + if token['type'] == tokenTypes['EndTag']: + token['data'] = ''.format(token['name']) + elif token['data']: + attr = ' {0!s}="{1!s}"' + attrs = ''.join([attr.format(k, escape(v)) for k, v in + token['data']]) + token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) + else: + token['data'] = '<{0!s}>'.format(token['name']) + if token['selfClosing']: + token['data'] = token['data'][:-1] + '/>' + token['type'] = tokenTypes['Characters'] + del token["name"] + return token + elif token['type'] == tokenTypes['Comment']: + if not self.strip_html_comments: + return token + else: + return token + + def sanitize_css(self, style): + """HTMLSanitizerMixin.sanitize_css replacement. + + HTMLSanitizerMixin.sanitize_css always whitelists background-*, + border-*, margin-*, and padding-*. We only whitelist what's in + the whitelist. + + """ + # disallow urls + style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + + # gauntlet + # TODO: Make sure this does what it's meant to - I *think* it wants to + # validate style attribute contents. 
+
+    def sanitize_css(self, style):
+        """HTMLSanitizerMixin.sanitize_css replacement.
+
+        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
+        border-*, margin-*, and padding-*. This version keeps only the
+        properties that are explicitly whitelisted.
+
+        """
+        # disallow urls
+        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+        # TODO: Make sure this does what it's meant to - I *think* it wants
+        # to validate style attribute contents.
+        parts = style.split(';')
+        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
+                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
+        for part in parts:
+            if not gauntlet.match(part):
+                return ''
+
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
+
+
+class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
+    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
+        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
+                               lowercaseElementName, lowercaseAttrName,
+                               **kwargs)
+
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
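
The dict/callable form of allowed_attributes and the CSS whitelist both
surface through clean(). A hypothetical example (allow_href is not part
of the patch, and the outputs are sketched by hand):

    import bleach

    def allow_href(name, value):
        # A callable entry decides on its own; the '*' wildcard list is
        # merged only into plain list entries, never into callables.
        return name == 'href' and value.startswith('https://')

    bleach.clean('<a href="http://evil.example/" title="t">x</a>',
                 attributes={'a': allow_href})
    # u'<a>x</a>', both attributes rejected by the callable

    # sanitize_css() keeps only whitelisted properties, so the fixed
    # positioning is dropped while the color declaration survives.
    bleach.clean('<b style="color: red; position: fixed">hi</b>',
                 attributes={'b': ['style']}, styles=['color'])
    # u'<b style="color: red;">hi</b>'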