Bump bleach from 4.1.0 to 5.0.0 (#1708)

* Bump bleach from 4.1.0 to 5.0.0

Bumps [bleach](https://github.com/mozilla/bleach) from 4.1.0 to 5.0.0.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/compare/v4.1.0...v5.0.0)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==5.0.0

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
dependabot[bot] authored on 2022-05-16 20:41:47 -07:00, committed by GitHub
parent d510e0f600
commit a1fe0b04d7
10 changed files with 264 additions and 151 deletions

lib/bleach/__init__.py

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
import packaging.version
from bleach.linkifier import (
DEFAULT_CALLBACKS,
Linker,
@@ -9,17 +5,15 @@ from bleach.linkifier import (
from bleach.sanitizer import (
ALLOWED_ATTRIBUTES,
ALLOWED_PROTOCOLS,
ALLOWED_STYLES,
ALLOWED_TAGS,
Cleaner,
)
# yyyymmdd
__releasedate__ = "20210825"
__releasedate__ = "20220407"
# x.y.z or x.y.z.dev0 -- semver
__version__ = "4.1.0"
VERSION = packaging.version.Version(__version__)
__version__ = "5.0.0"
__all__ = ["clean", "linkify"]
@@ -29,10 +23,10 @@ def clean(
text,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
css_sanitizer=None,
):
"""Clean an HTML fragment of malicious content and return it
@@ -64,9 +58,6 @@ def clean(
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
@@ -74,16 +65,19 @@ def clean(
:arg bool strip_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
:returns: cleaned text as unicode
"""
cleaner = Cleaner(
tags=tags,
attributes=attributes,
styles=styles,
protocols=protocols,
strip=strip,
strip_comments=strip_comments,
css_sanitizer=css_sanitizer,
)
return cleaner.clean(text)
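The change that matters most to callers is visible above: `clean()` drops the `styles` argument and grows `css_sanitizer`. A minimal migration sketch (assumes bleach 5.0.0 installed with its tinycss2-backed extra, `pip install bleach[css]`; the sample HTML is illustrative):

```python
from bleach import clean
from bleach.css_sanitizer import CSSSanitizer

# bleach 4.1.0:
#     clean(html, tags=["p"], attributes={"p": ["style"]}, styles=["color"])
# bleach 5.0.0 replaces the styles list with a CSSSanitizer instance:
css_sanitizer = CSSSanitizer(allowed_css_properties=["color"])
html = '<p style="color: red; position: absolute">hi</p>'
print(clean(html, tags=["p"], attributes={"p": ["style"]}, css_sanitizer=css_sanitizer))
# "position" is not an allowed property and is dropped; "color" survives.
```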


@@ -0,0 +1,20 @@
Copyright (c) 2006-2013 James Graham and other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1 @@
46af966e33b6247ae1d57d9459115a3eb46cda9f809c9f14e052abc2fe8dacb2 parse.py
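That one-line file pins a checksum for the newly vendored parse.py. A hypothetical integrity check (the file path is assumed from the `bleach._vendor.parse` import in sanitizer.py below):

```python
import hashlib

# Assumed location of the vendored module inside this repo's lib/ tree.
with open("lib/bleach/_vendor/parse.py", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

assert digest == "46af966e33b6247ae1d57d9459115a3eb46cda9f809c9f14e052abc2fe8dacb2"
```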

lib/bleach/css_sanitizer.py (new file, 104 lines)

@@ -0,0 +1,104 @@
import tinycss2
ALLOWED_CSS_PROPERTIES = frozenset(
(
"azimuth",
"background-color",
"border-bottom-color",
"border-collapse",
"border-color",
"border-left-color",
"border-right-color",
"border-top-color",
"clear",
"color",
"cursor",
"direction",
"display",
"elevation",
"float",
"font",
"font-family",
"font-size",
"font-style",
"font-variant",
"font-weight",
"height",
"letter-spacing",
"line-height",
"overflow",
"pause",
"pause-after",
"pause-before",
"pitch",
"pitch-range",
"richness",
"speak",
"speak-header",
"speak-numeral",
"speak-punctuation",
"speech-rate",
"stress",
"text-align",
"text-decoration",
"text-indent",
"unicode-bidi",
"vertical-align",
"voice-family",
"volume",
"white-space",
"width",
)
)
ALLOWED_SVG_PROPERTIES = frozenset(
(
"fill",
"fill-opacity",
"fill-rule",
"stroke",
"stroke-width",
"stroke-linecap",
"stroke-linejoin",
"stroke-opacity",
)
)
class CSSSanitizer:
def __init__(
self,
allowed_css_properties=ALLOWED_CSS_PROPERTIES,
allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
):
self.allowed_css_properties = allowed_css_properties
self.allowed_svg_properties = allowed_svg_properties
def sanitize_css(self, style):
"""Sanitizes css in style tags"""
parsed = tinycss2.parse_declaration_list(style)
if not parsed:
return ""
new_tokens = []
for token in parsed:
if token.type == "declaration":
if (
token.lower_name in self.allowed_css_properties
or token.lower_name in self.allowed_svg_properties
):
new_tokens.append(token)
elif token.type in ("comment", "whitespace"):
if new_tokens and new_tokens[-1].type != token.type:
new_tokens.append(token)
# NOTE(willkg): We currently don't handle AtRule or ParseError and
# so both get silently thrown out
if not new_tokens:
return ""
return tinycss2.serialize(new_tokens).strip()
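A quick sketch of the new class on its own (requires tinycss2; the property values are illustrative):

```python
from bleach.css_sanitizer import CSSSanitizer

sanitizer = CSSSanitizer()
# "color" is in ALLOWED_CSS_PROPERTIES; "behavior" is not, so it is dropped.
print(sanitizer.sanitize_css("color: red; behavior: url(evil.htc)"))
# expected (hedged): "color: red"
```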

lib/bleach/html5lib_shim.py

@@ -36,6 +36,8 @@ from bleach._vendor.html5lib.filters.base import (
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
allowed_protocols,
allowed_css_properties,
allowed_svg_properties,
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
Filter as SanitizerFilter,
@@ -68,8 +70,10 @@ TAG_TOKEN_TYPES = {
constants.tokenTypes["EndTag"],
constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
@@ -190,6 +194,48 @@ HTML_TAGS = [
]
#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
[
"address",
"article",
"aside",
"blockquote",
"details",
"dialog",
"dd",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"hr",
"li",
"main",
"nav",
"ol",
"p",
"pre",
"section",
"table",
"ul",
]
)
class InputStreamWithMemory:
"""Wraps an HTMLInputStream to remember characters since last <
@@ -257,17 +303,20 @@ class BleachHTMLTokenizer(HTMLTokenizer):
"""Tokenizer that doesn't consume character entities"""
def __init__(self, consume_entities=False, **kwargs):
super(BleachHTMLTokenizer, self).__init__(**kwargs)
super().__init__(**kwargs)
self.consume_entities = consume_entities
# Wrap the stream with one that remembers the history
self.stream = InputStreamWithMemory(self.stream)
# Remember the last token emitted; needed for block element spacing
self.emitted_last_token = None
def __iter__(self):
last_error_token = None
for token in super(BleachHTMLTokenizer, self).__iter__():
for token in super().__iter__():
if last_error_token is not None:
if (
last_error_token["data"] == "invalid-character-in-attribute-name"
@@ -309,12 +358,12 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# If this is not an allowed tag, then we convert it to
# characters and it'll get escaped in the sanitizer.
token["data"] = self.stream.get_tag()
token["type"] = CHARACTERS_TYPE
token["type"] = TAG_TOKEN_TYPE_CHARACTERS
last_error_token = None
yield token
elif token["type"] == PARSEERROR_TYPE:
elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
# If the token is a parse error, then let the last_error_token
# go, and make token the new last_error_token
yield last_error_token
@@ -329,7 +378,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# If the token is a ParseError, we hold on to it so we can get the
# next token and potentially fix it.
if token["type"] == PARSEERROR_TYPE:
if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
last_error_token = token
continue
@@ -342,9 +391,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# If this tokenizer is set to consume entities, then we can let the
# superclass do its thing.
if self.consume_entities:
return super(BleachHTMLTokenizer, self).consumeEntity(
allowedChar, fromAttribute
)
return super().consumeEntity(allowedChar, fromAttribute)
# If this tokenizer is set to not consume entities, then we don't want
# to consume and convert them, so this overrides the html5lib tokenizer's
@@ -356,7 +403,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
self.currentToken["data"][-1][1] += "&"
else:
self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})
def tagOpenState(self):
# This state marks a < that is either a StartTag, EndTag, EmptyTag,
@@ -364,7 +411,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# we've collected so far and we do that by calling start_tag() on
# the input stream wrapper.
self.stream.start_tag()
return super(BleachHTMLTokenizer, self).tagOpenState()
return super().tagOpenState()
def emitCurrentToken(self):
token = self.currentToken
@@ -378,8 +425,18 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# allowed list, then it gets stripped or escaped. In both of these
# cases it gets converted to a Characters token.
if self.parser.strip:
# If we're stripping the token, we just throw in an empty
# string token.
if (
self.emitted_last_token
and token["type"] == TAG_TOKEN_TYPE_START
and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
):
# If this is a block level tag we're stripping, we drop it
# for a newline because that's what a browser would parse
# it as
new_data = "\n"
else:
# For all other things being stripped, we throw in an empty
# string token
new_data = ""
else:
@@ -390,14 +447,15 @@ class BleachHTMLTokenizer(HTMLTokenizer):
# string and use that.
new_data = self.stream.get_tag()
new_token = {"type": CHARACTERS_TYPE, "data": new_data}
new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}
self.currentToken = new_token
self.currentToken = self.emitted_last_token = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return
super(BleachHTMLTokenizer, self).emitCurrentToken()
self.emitted_last_token = self.currentToken
super().emitCurrentToken()
class BleachHTMLParser(HTMLParser):
@@ -416,7 +474,7 @@ class BleachHTMLParser(HTMLParser):
self.tags = [tag.lower() for tag in tags] if tags is not None else None
self.strip = strip
self.consume_entities = consume_entities
super(BleachHTMLParser, self).__init__(**kwargs)
super().__init__(**kwargs)
def _parse(
self, stream, innerHTML=False, container="div", scripting=True, **kwargs
@@ -514,13 +572,13 @@ def convert_entities(text):
def match_entity(stream):
"""Returns first entity in stream or None if no entity exists
Note: For Bleach purposes, entities must start with a "&" and end with
a ";". This ignoresambiguous character entities that have no ";" at the
end.
Note: For Bleach purposes, entities must start with a "&" and end with a
";". This ignores ambiguous character entities that have no ";" at the end.
:arg stream: the character stream
:returns: ``None`` or the entity string without "&" or ";"
:returns: the entity string without "&" or ";" if it's a valid character
entity; ``None`` otherwise
"""
# Nix the & at the beginning
@@ -559,9 +617,11 @@ def match_entity(stream):
# Handle character entities
while stream and stream[0] not in end_characters:
c = stream.pop(0)
if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
break
possible_entity += c
if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
# If it's not a prefix, then it's not an entity and we're
# out
return None
if possible_entity and stream and stream[0] == ";":
return possible_entity
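The reworked loop above bails out with `None` as soon as the accumulated name stops being a prefix of any known entity. A hedged illustration of the documented contract (this is an internal helper, so treat it as a sketch):

```python
from bleach.html5lib_shim import match_entity

print(match_entity("&amp; more"))  # "amp": a known entity terminated by ";"
print(match_entity("&xyzzy;"))     # None: "xy" is not a prefix of any named entity
```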
@@ -642,15 +702,14 @@ class BleachHTMLSerializer(HTMLSerializer):
in_tag = False
after_equals = False
for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
for stoken in super().serialize(treewalker, encoding):
if in_tag:
if stoken == ">":
in_tag = False
elif after_equals:
if stoken != '"':
for part in self.escape_base_amp(stoken):
yield part
yield from self.escape_base_amp(stoken)
after_equals = False
continue
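The other behavior change in this file is the block-level handling added to `emitCurrentToken()` above: when `strip=True` removes a tag listed in `HTML_TAGS_BLOCK_LEVEL`, the tokenizer now emits a newline instead of an empty string, approximating browser layout. A hedged sketch:

```python
import bleach

# 4.1.0 collapsed every stripped tag to "", running "one" and "two" together;
# 5.0.0 drops a stripped block-level start tag as "\n" instead.
print(repr(bleach.clean("<p>one</p><p>two</p>", tags=[], strip=True)))
# expected (hedged): 'one\ntwo'
```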

lib/bleach/linkifier.py

@@ -2,7 +2,6 @@ import re
from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes
#: List of default callbacks
@@ -155,7 +154,7 @@ class Linker:
omit_optional_tags=False,
# linkify does not sanitize
sanitize=False,
# linkify alphabetizes
# linkify preserves attr order
alphabetical_attributes=False,
)
@@ -228,7 +227,7 @@ class LinkifyFilter(html5lib_shim.Filter):
:arg re email_re: email matching regex
"""
super(LinkifyFilter, self).__init__(source)
super().__init__(source)
self.callbacks = callbacks or []
self.skip_tags = skip_tags or []
@@ -316,7 +315,6 @@ class LinkifyFilter(html5lib_shim.Filter):
else:
# Add an "a" tag for the new link
_text = attrs.pop("_text", "")
attrs = alphabetize_attributes(attrs)
new_tokens.extend(
[
{"type": "StartTag", "name": "a", "data": attrs},
@@ -332,8 +330,7 @@ class LinkifyFilter(html5lib_shim.Filter):
if end < len(text):
new_tokens.append({"type": "Characters", "data": text[end:]})
for new_token in new_tokens:
yield new_token
yield from new_tokens
continue
@@ -439,8 +436,6 @@ class LinkifyFilter(html5lib_shim.Filter):
new_tokens.append({"type": "Characters", "data": prefix})
_text = attrs.pop("_text", "")
attrs = alphabetize_attributes(attrs)
new_tokens.extend(
[
{"type": "StartTag", "name": "a", "data": attrs},
@@ -460,8 +455,7 @@ class LinkifyFilter(html5lib_shim.Filter):
if end < len(text):
new_tokens.append({"type": "Characters", "data": text[end:]})
for new_token in new_tokens:
yield new_token
yield from new_tokens
continue
@@ -493,14 +487,13 @@ class LinkifyFilter(html5lib_shim.Filter):
else:
new_text = attrs.pop("_text", "")
a_token["data"] = alphabetize_attributes(attrs)
a_token["data"] = attrs
if text == new_text:
# The callbacks didn't change the text, so we yield the new "a"
# token, then whatever else was there, then the end "a" token
yield a_token
for mem in token_buffer[1:]:
yield mem
yield from token_buffer[1:]
else:
# If the callbacks changed the text, then we're going to drop
@@ -516,7 +509,7 @@ class LinkifyFilter(html5lib_shim.Filter):
token_buffer = []
for token in super(LinkifyFilter, self).__iter__():
for token in super().__iter__():
if in_a:
# Handle the case where we're in an "a" tag--we want to buffer tokens
# until we hit an end "a" tag.
@@ -524,8 +517,7 @@ class LinkifyFilter(html5lib_shim.Filter):
# Add the end tag to the token buffer and then handle them
# and yield anything returned
token_buffer.append(token)
for new_token in self.handle_a_tag(token_buffer):
yield new_token
yield from self.handle_a_tag(token_buffer)
# Clear "a" related state and continue since we've yielded all
# the tokens we're going to yield
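With `alphabetize_attributes` gone from this file, linkified output keeps attributes in insertion order rather than re-sorting them. A small sketch using the public `Linker` API (the callback is illustrative):

```python
from bleach.linkifier import Linker

def set_target(attrs, new=False):
    # Attribute keys are (namespace, name) tuples.
    attrs[(None, "target")] = "_blank"
    return attrs

linker = Linker(callbacks=[set_target])
# 5.0.0 no longer alphabetizes, so "target" stays where the callback put it.
print(linker.linkify("see https://example.com"))
```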

lib/bleach/sanitizer.py

@@ -6,7 +6,6 @@ from bleach._vendor.parse import urlparse
from xml.sax.saxutils import unescape
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes
#: List of allowed tags
@@ -33,9 +32,6 @@ ALLOWED_ATTRIBUTES = {
"acronym": ["title"],
}
#: List of allowed styles
ALLOWED_STYLES = []
#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
@@ -85,11 +81,11 @@ class Cleaner:
self,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
filters=None,
css_sanitizer=None,
):
"""Initializes a Cleaner
@@ -99,9 +95,6 @@ class Cleaner:
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
@@ -118,14 +111,17 @@ class Cleaner:
Using filters changes the output of ``bleach.Cleaner.clean``.
Make sure the way the filters change the output are secure.
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.tags = tags
self.attributes = attributes
self.styles = styles
self.protocols = protocols
self.strip = strip
self.strip_comments = strip_comments
self.filters = filters or []
self.css_sanitizer = css_sanitizer
self.parser = html5lib_shim.BleachHTMLParser(
tags=self.tags,
@@ -143,7 +139,7 @@ class Cleaner:
resolve_entities=False,
# Bleach has its own sanitizer, so don't use the html5lib one
sanitize=False,
# Bleach sanitizer alphabetizes already, so don't use the html5lib one
# clean preserves attr order
alphabetical_attributes=False,
)
@@ -175,11 +171,10 @@ class Cleaner:
attributes=self.attributes,
strip_disallowed_elements=self.strip,
strip_html_comments=self.strip_comments,
css_sanitizer=self.css_sanitizer,
# html5lib-sanitizer things
allowed_elements=self.tags,
allowed_css_properties=self.styles,
allowed_protocols=self.protocols,
allowed_svg_properties=[],
)
# Apply any filters after the BleachSanitizerFilter
@@ -242,25 +237,25 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
def __init__(
self,
source,
allowed_elements=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
allowed_protocols=ALLOWED_PROTOCOLS,
strip_disallowed_elements=False,
strip_html_comments=True,
css_sanitizer=None,
**kwargs,
):
"""Creates a BleachSanitizerFilter instance
:arg Treewalker source: stream
:arg list tags: allowed list of tags; defaults to
:arg list allowed_elements: allowed list of tags; defaults to
``bleach.sanitizer.ALLOWED_TAGS``
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
:arg list allowed_protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
:arg bool strip_disallowed_elements: whether or not to strip disallowed
@@ -268,10 +263,14 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
:arg bool strip_html_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.attr_filter = attribute_filter_factory(attributes)
self.strip_disallowed_elements = strip_disallowed_elements
self.strip_html_comments = strip_html_comments
self.css_sanitizer = css_sanitizer
# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
warnings.filterwarnings(
@@ -280,7 +279,12 @@
category=DeprecationWarning,
module="bleach._vendor.html5lib",
)
return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
return super().__init__(
source,
allowed_elements=allowed_elements,
allowed_protocols=allowed_protocols,
**kwargs,
)
def sanitize_stream(self, token_iterator):
for token in token_iterator:
@@ -290,8 +294,7 @@
continue
if isinstance(ret, list):
for subtoken in ret:
yield subtoken
yield from ret
else:
yield ret
@@ -358,10 +361,6 @@
return None
else:
if "data" in token:
# Alphabetize the attributes before calling .disallowed_token()
# so that the resulting string is stable
token["data"] = alphabetize_attributes(token["data"])
return self.disallowed_token(token)
elif token_type == "Comment":
@@ -547,12 +546,21 @@
# If it's a style attribute, sanitize it
if namespaced_name == (None, "style"):
val = self.sanitize_css(val)
if self.css_sanitizer:
val = self.css_sanitizer.sanitize_css(val)
else:
# FIXME(willkg): if style is allowed, but no
# css_sanitizer was set up, then this is probably a
# mistake and we should raise an error here
#
# For now, we're going to set the value to "" because
# there was no sanitizer set
val = ""
# At this point, we want to keep the attribute, so add it in
attrs[namespaced_name] = val
token["data"] = alphabetize_attributes(attrs)
token["data"] = attrs
return token
@@ -575,7 +583,7 @@
if ns is None or ns not in html5lib_shim.prefixes:
namespaced_name = name
else:
namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)
namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name)
attrs.append(
' %s="%s"'
@@ -587,7 +595,7 @@
v,
)
)
token["data"] = "<%s%s>" % (token["name"], "".join(attrs))
token["data"] = "<{}{}>".format(token["name"], "".join(attrs))
else:
token["data"] = "<%s>" % token["name"]
@@ -599,47 +607,3 @@
del token["name"]
return token
def sanitize_css(self, style):
"""Sanitizes css in style tags"""
# Convert entities in the style so that it can be parsed as CSS
style = html5lib_shim.convert_entities(style)
# Drop any url values before we do anything else
style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)
# The gauntlet of sanitization
# Validate the css in the style tag and if it's not valid, then drop
# the whole thing.
parts = style.split(";")
gauntlet = re.compile(
r"""^( # consider a style attribute value as composed of:
[/:,#%!.\s\w] # a non-newline character
|\w-\w # 3 characters in the form \w-\w
|'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+" # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
)*$""", # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
flags=re.U | re.VERBOSE,
)
for part in parts:
if not gauntlet.match(part):
return ""
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ""
clean = []
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ": " + value + ";")
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ": " + value + ";")
return " ".join(clean)

lib/bleach/utils.py (deleted)

@@ -1,21 +0,0 @@
from collections import OrderedDict
def _attr_key(attr):
"""Returns appropriate key for sorting attribute names
Attribute names are a tuple of ``(namespace, name)`` where namespace can be
``None`` or a string. These can't be compared in Python 3, so we conver the
``None`` to an empty string.
"""
key = (attr[0][0] or ""), attr[0][1]
return key
def alphabetize_attributes(attrs):
"""Takes a dict of attributes (or None) and returns them alphabetized"""
if not attrs:
return attrs
return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])

requirements.txt

@@ -5,7 +5,7 @@ backports.csv==1.0.7
backports.functools-lru-cache==1.6.4
backports.zoneinfo==0.2.1
beautifulsoup4==4.10.0
bleach==4.1.0
bleach==5.0.0
certifi==2021.10.8
cheroot==8.6.0
cherrypy==18.6.1