mirror of https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 21:21:15 -07:00

Add bleach library to clean notification HTML

This commit is contained in:
parent f001e19728
commit 453c46df00

4 changed files with 631 additions and 0 deletions

lib/bleach/__init__.py    401 lines (new file)
@@ -0,0 +1,401 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import re

import html5lib
from html5lib.sanitizer import HTMLSanitizer
from html5lib.serializer.htmlserializer import HTMLSerializer

from . import callbacks as linkify_callbacks
from .encoding import force_unicode
from .sanitizer import BleachSanitizer


VERSION = (1, 4, 2)
__version__ = '.'.join([str(n) for n in VERSION])

__all__ = ['clean', 'linkify']

log = logging.getLogger('bleach')

ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]

ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

ALLOWED_STYLES = []

ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()

PROTOCOLS = HTMLSanitizer.acceptable_protocols

url_re = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
    re.IGNORECASE | re.VERBOSE | re.UNICODE)

proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)

punct_re = re.compile(r'([\.,]+)$')

email_re = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)

NODE_TEXT = 4  # The numeric ID of a text node in simpletree.

ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
# a simple routine that returns the tag name with the namespace prefix
# as returned by etree's Element.tag attribute

DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment and return it

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text))


def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False, tokenizer=HTMLSanitizer):
    """Convert URL-like strings in an HTML fragment to links.

    linkify() converts strings that look like URLs or domain names in a
    blob of text that may be an HTML fragment to links, while preserving
    (a) links already in the string, (b) urls found in attributes, and
    (c) email addresses.
    """
    text = force_unicode(text)

    if not text:
        return ''

    parser = html5lib.HTMLParser(tokenizer=tokenizer)

    forest = parser.parseFragment(text)
    _seen = set([])

    def replace_nodes(tree, new_frag, node, index=0):
        """
        Doesn't really replace nodes, but inserts the nodes contained in
        new_frag into the tree at position index and returns the number
        of nodes inserted.
        If node is passed in, it is removed from the tree.
        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = tree.text or ''
                tree.text += new_tree.text
            else:
                tree[index - 1].tail = tree[index - 1].tail or ''
                tree[index - 1].tail += new_tree.text
        # then put in the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1
        # if we got a node to remove...
        if node is not None:
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue

                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None,
                                            current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue

                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None,
                                        current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and not (node in _seen):
                if not node.get('href', None) is None:
                    attrs = dict(node.items())

                    _text = attrs['_text'] = _render_inner(node)

                    attrs = apply_callbacks(attrs, False)

                    if attrs is None:
                        # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node,
                                            current_child)
                        current_child -= 1
                        # pull back current_child by 1 to scan the
                        # new nodes again.
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)

                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)

            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif not (node in _seen):
                    linkify_nodes(node, True)

            current_child += 1

    def email_repl(match):
        addr = match.group(0).replace('"', '&quot;')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]
        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets,
                           _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)


def _render(tree):
    """Try rendering as HTML, then XML, then give up."""
    return force_unicode(_serialize(tree))


def _serialize(domtree):
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(domtree)
    serializer = HTMLSerializer(quote_attr_values=True,
                                alphabetical_attributes=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
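
For reviewers, a minimal usage sketch of the two public entry points added above. This is not part of the commit; the raw notification string is hypothetical, and it assumes the bundled lib directory is on sys.path so that import bleach resolves to this package.

import bleach

# Hypothetical notification body containing untrusted markup.
raw = 'New episode! <b onclick="evil()">Details</b> at tautulli.com <script>alert(1)</script>'

# clean() keeps only whitelisted tags/attributes; anything else is
# HTML-escaped (or removed entirely when strip=True).
safe = bleach.clean(raw)

# linkify() wraps bare URLs/domains in <a> tags; the default nofollow
# callback adds rel="nofollow" to each generated link.
linked = bleach.linkify(safe)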

lib/bleach/callbacks.py    20 lines (new file)
@@ -0,0 +1,20 @@
"""A set of basic callbacks for bleach.linkify."""
from __future__ import unicode_literals


def nofollow(attrs, new=False):
    if attrs['href'].startswith('mailto:'):
        return attrs
    rel = [x for x in attrs.get('rel', '').split(' ') if x]
    if 'nofollow' not in [x.lower() for x in rel]:
        rel.append('nofollow')
    attrs['rel'] = ' '.join(rel)

    return attrs


def target_blank(attrs, new=False):
    if attrs['href'].startswith('mailto:'):
        return attrs
    attrs['target'] = '_blank'
    return attrs
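
A short sketch of how these callbacks are meant to be combined, illustrative only; the import paths assume the bundled lib directory is on sys.path.

import bleach
from bleach.callbacks import nofollow, target_blank

# Each callback receives the link's attribute dict and may modify it,
# return it unchanged, or return None to drop the link entirely.
# mailto: links are passed through untouched by both callbacks.
html = bleach.linkify('See https://tautulli.com for details',
                      callbacks=[nofollow, target_blank])
# The generated <a> carries both rel="nofollow" and target="_blank".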

lib/bleach/encoding.py    62 lines (new file)
@@ -0,0 +1,62 @@
import datetime
from decimal import Decimal
import types
import six


def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).
    """
    return isinstance(obj, (
        six.integer_types +
        (types.NoneType,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal))
    )


def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                                        errors) for arg in s])
    return s
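
force_unicode() is the coercion helper used throughout clean() and linkify(). A quick sketch of its behaviour, illustrative and assuming the Python 2 interpreter this bundled copy targets:

from bleach.encoding import force_unicode

force_unicode(b'caf\xc3\xa9')           # bytes are decoded as UTF-8
force_unicode(u'already unicode')       # returned unchanged (fast path)
force_unicode(None, strings_only=True)  # protected types pass through as-is
force_unicode(42)                       # other objects are coerced to u'42'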

lib/bleach/sanitizer.py    148 lines (new file)
@@ -0,0 +1,148 @@
from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape

from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer


PROTOS = HTMLSanitizerMixin.acceptable_protocols
PROTOS.remove('feed')


class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here, callable is a function of two arguments, attribute name
        and value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        """
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            allowed_attributes += self.wildcard_attributes
                    else:
                        allowed_attributes = self.allowed_attributes
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                               unescape(attrs[attr])).lower()
                        # Remove replacement characters from unescaped
                        # characters.
                        val_unescaped = val_unescaped.replace("\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                                and (val_unescaped.split(':')[0] not in
                                     self.allowed_protocols)):
                            del attrs[attr]
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                        del attrs['xlink:href']
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                return token
            elif self.strip_disallowed_elements:
                pass
            else:
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</{0!s}>'.format(token['name'])
                elif token['data']:
                    attr = ' {0!s}="{1!s}"'
                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
                                     token['data']])
                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
                else:
                    token['data'] = '<{0!s}>'.format(token['name'])
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token["name"]
                return token
        elif token['type'] == tokenTypes['Comment']:
            if not self.strip_html_comments:
                return token
        else:
            return token

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist what's in
        the whitelist.

        """
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        # TODO: Make sure this does what it's meant to - I *think* it wants to
        # validate style attribute contents.
        parts = style.split(';')
        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)


class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
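
One capability called out in the sanitize_token() docstring above is per-tag callable attribute filters. A hedged sketch of what that enables; the filter function and tag list here are made up for illustration and are not part of the commit:

import bleach

# Hypothetical filter: keep class attributes on <code> only when the value
# looks like a syntax-highlighting hint.
def code_class_filter(name, value):
    return name == 'class' and value.startswith('language-')

attrs = {
    'a': ['href', 'title'],
    'code': code_class_filter,
}

bleach.clean('<code class="language-python" id="x">print(1)</code>',
             tags=['a', 'code'], attributes=attrs)
# The class attribute survives the filter; the id attribute is dropped.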