Mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-07-08 06:00:51 -07:00)
Add bleach library to clean notification HTML
parent: f001e19728
commit: 453c46df00
4 changed files with 631 additions and 0 deletions
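The point of this vendored copy is to scrub user-supplied notification HTML before it is rendered. This commit only adds the library itself, so the following is a minimal sketch of the intended use, not the actual call site: it assumes bleach's top-level clean() helper (which wraps the BleachSanitizer added below), and the tag and attribute whitelists are hypothetical.

import bleach

# Hypothetical whitelists for notification text; the real lists would be
# chosen by the notification code, which this commit does not touch.
ALLOWED_TAGS = ['a', 'b', 'i', 'br']
ALLOWED_ATTRS = {'a': ['href', 'title']}

def clean_notification_html(raw):
    # strip=True drops disallowed tags instead of escaping them into
    # visible text.
    return bleach.clean(raw, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRS,
                        strip=True)

# -> u'<b>Movie added</b> details' (the <em> tags are stripped because
# 'em' is not whitelisted above)
print(clean_notification_html(u'<b>Movie added</b> <em>details</em>'))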
lib/bleach/sanitizer.py (new file, 148 lines)
@@ -0,0 +1,148 @@
from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape

from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer


# Drop the feed: protocol from html5lib's default whitelist.
PROTOS = HTMLSanitizerMixin.acceptable_protocols
PROTOS.remove('feed')


class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name
        and value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        """
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            allowed_attributes += self.wildcard_attributes
                    else:
                        allowed_attributes = self.allowed_attributes
                    # Keep an attribute if the per-tag filter is a callable
                    # and approves it, or if its name appears in the list.
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])
                    # Drop URI-valued attributes whose protocol is outside
                    # the whitelist.
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                               unescape(attrs[attr])).lower()
                        # Remove replacement characters from unescaped
                        # characters.
                        val_unescaped = val_unescaped.replace("\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                            and (val_unescaped.split(':')[0] not in
                                 self.allowed_protocols)):
                            del attrs[attr]
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                        del attrs['xlink:href']
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                return token
            elif self.strip_disallowed_elements:
                # Strip the disallowed tag entirely.
                pass
            else:
                # Re-emit the disallowed tag as escaped literal text.
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</{0!s}>'.format(token['name'])
                elif token['data']:
                    attr = ' {0!s}="{1!s}"'
                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
                                     token['data']])
                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
                else:
                    token['data'] = '<{0!s}>'.format(token['name'])
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token["name"]
                return token
        elif token['type'] == tokenTypes['Comment']:
            if not self.strip_html_comments:
                return token
        else:
            return token

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist properties
        that are explicitly listed.

        """
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        # TODO: Make sure this does what it's meant to - I *think* it wants to
        # validate style attribute contents.
        parts = style.split(';')
        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)


class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        # Run every token through the sanitizer; tokens it drops come
        # back as None and are not yielded.
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token