Mirror of https://github.com/Tautulli/Tautulli.git, synced 2025-07-16 02:02:58 -07:00
Update bleach-4.1.0
commit a4130d6c56 (parent 4086529906)
51 changed files with 17071 additions and 568 deletions
@@ -1,148 +1,645 @@
from itertools import chain
import re
import warnings

from bleach._vendor.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes

#: List of allowed tags
ALLOWED_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "strong",
    "ul",
]

#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
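
# INVISIBLE_REPLACEMENT_CHAR is handed to INVISIBLE_CHARACTERS_RE.sub() in
# sanitize_characters() below, so per re.sub() it may be a plain string or a
# function taking the match object. A minimal sketch:
#
#     INVISIBLE_CHARACTERS_RE.sub("?", "a\x00\x1fb")             # -> "a??b"
#     INVISIBLE_CHARACTERS_RE.sub(lambda m: "\ufffd", "a\x00b")  # -> "a\ufffdb"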


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        styles=ALLOWED_STYLES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
    ):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            message = (
                "argument cannot be of '{name}' type, must be of text type".format(
                    name=text.__class__.__name__
                )
            )
            raise TypeError(message)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
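
# A quick usage sketch, assuming the bleach 4.1.0 defaults in this module:
#
#     from bleach.sanitizer import Cleaner
#
#     cleaner = Cleaner(tags=["b", "i"], attributes={"b": ["class"]})
#     cleaner.clean('<b class="x" onclick="evil()">hi</b><u>u</u>')
#     # -> '<b class="x">hi</b>&lt;u&gt;u&lt;/u&gt;'  (disallowed tags escaped)
#
#     Cleaner(tags=["b", "i"], strip=True).clean("<u>u</u>")
#     # -> 'u'  (disallowed tags stripped instead)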


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if "*" in attributes:
                attr_val = attributes["*"]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")
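
# A minimal sketch of the three accepted shapes and the filters they produce:
#
#     attr_filter = attribute_filter_factory({"*": ["class"], "a": ["href"]})
#     attr_filter("a", "href", "https://example.com")  # -> True (allowed on <a>)
#     attr_filter("p", "class", "lead")                # -> True (allowed via "*")
#     attr_filter("p", "href", "x")                    # -> False
#
#     attr_filter = attribute_filter_factory(["href", "title"])
#     attr_filter("span", "title", "x")                # -> True for any tag
#
#     # A callable is returned as-is and receives (tag, attr, value):
#     attr_filter = attribute_filter_factory(
#         lambda tag, attr, value: attr == "href" and value.startswith("https:")
#     )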


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        attributes=ALLOWED_ATTRIBUTES,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        **kwargs,
    ):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
        warnings.filterwarnings(
            "ignore",
            message="html5lib's sanitizer is deprecated",
            category=DeprecationWarning,
            module="bleach._vendor.html5lib",
        )
        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }
        yield new_token

    def __iter__(self):
        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )
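
    # A small sketch of merge_characters, assuming ``f`` is an
    # already-constructed BleachSanitizerFilter (hypothetical name):
    #
    #     tokens = [
    #         {"type": "Characters", "data": "a "},
    #         {"type": "Characters", "data": "& "},
    #         {"type": "Characters", "data": "b"},
    #         {"type": "StartTag", "name": "b", "data": {}},
    #     ]
    #     list(f.merge_characters(iter(tokens)))
    #     # -> [{"data": "a & b", "type": "Characters"},
    #     #     {"type": "StartTag", "name": "b", "data": {}},
    #     #     {"data": "", "type": "Characters"}]  # final buffer flush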

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with two arguments of attribute name and
        value. It should return true or false.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # call lxml.sax.saxutils to escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens
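
    # A small sketch, assuming ``f`` is an already-constructed
    # BleachSanitizerFilter (hypothetical name). "&amp;" is special-cased back
    # to a plain "&" Characters token (the serializer re-escapes it); other
    # recognized entities become Entity tokens the serializer leaves alone:
    #
    #     f.sanitize_characters({"type": "Characters", "data": "x &amp; &lt; y"})
    #     # -> [{"type": "Characters", "data": "x "},
    #     #     {"type": "Characters", "data": "&"},
    #     #     {"type": "Characters", "data": " "},
    #     #     {"type": "Entity", "name": "lt"},
    #     #     {"type": "Characters", "data": " y"}]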

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)

        # Remove REPLACEMENT characters
        new_value = new_value.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if "http" in allowed_protocols:
                return value

        return None
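
    # A small sketch, assuming ``f`` is an already-constructed
    # BleachSanitizerFilter (hypothetical name):
    #
    #     allowed = ["http", "https", "mailto"]
    #     f.sanitize_uri_value("javascript:alert(1)", allowed)   # -> None
    #     f.sanitize_uri_value("https://example.com/", allowed)  # -> "https://example.com/"
    #     f.sanitize_uri_value("#top", allowed)                  # -> "#top" (bare anchor)
    #     f.sanitize_uri_value("page.html", allowed)             # -> "page.html" ("http" assumed)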

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an iri
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
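
    # A small sketch, assuming ``f`` is an already-constructed
    # BleachSanitizerFilter (hypothetical name). disallowed_token rewrites a
    # tag token into a Characters token holding the tag's source text, which
    # the serializer then escapes:
    #
    #     f.disallowed_token({"type": "EndTag", "name": "script"})
    #     # -> {"type": "Characters", "data": "</script>"}
    #
    #     f.disallowed_token(
    #         {"type": "StartTag", "name": "img", "data": {(None, "src"): "x.png"}}
    #     )
    #     # -> {"type": "Characters", "data": '<img src="x.png">'}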

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[/:,#%!.\s\w]       # a non-newline character
|\w-\w              # 3 characters in the form \w-\w
|'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+"          # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, ...
)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)