Bump bleach from 4.1.0 to 5.0.0 (#1708)

* Bump bleach from 4.1.0 to 5.0.0

Bumps [bleach](https://github.com/mozilla/bleach) from 4.1.0 to 5.0.0.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/compare/v4.1.0...v5.0.0)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==5.0.0

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
dependabot[bot] 2022-05-16 20:41:47 -07:00 committed by GitHub
parent d510e0f600
commit a1fe0b04d7
10 changed files with 264 additions and 151 deletions
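
The substantive API change in this bump, visible throughout the diff below: bleach 5.0.0 removes the styles=/ALLOWED_STYLES arguments from clean() and Cleaner and replaces them with an explicit css_sanitizer object backed by tinycss2 (the new lib/bleach/css_sanitizer.py). A minimal migration sketch for callers of this vendored copy; the allowed property list here is illustrative, not something this commit configures:

    import bleach
    from bleach.css_sanitizer import CSSSanitizer  # new in 5.0.0; requires tinycss2

    dirty = '<p style="color: red; behavior: url(evil.htc)">hi</p>'

    # bleach 4.1.0 (no longer accepted by 5.0.0):
    #     bleach.clean(dirty, tags=["p"], attributes={"p": ["style"]}, styles=["color"])

    # bleach 5.0.0: pass a CSSSanitizer instead of a styles list
    css_sanitizer = CSSSanitizer(allowed_css_properties=["color"])
    cleaned = bleach.clean(
        dirty,
        tags=["p"],
        attributes={"p": ["style"]},
        css_sanitizer=css_sanitizer,
    )
    # keeps the color declaration, drops the disallowed "behavior" property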


@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-import packaging.version
-
 from bleach.linkifier import (
     DEFAULT_CALLBACKS,
     Linker,
@@ -9,17 +5,15 @@ from bleach.linkifier import (
 from bleach.sanitizer import (
     ALLOWED_ATTRIBUTES,
     ALLOWED_PROTOCOLS,
-    ALLOWED_STYLES,
     ALLOWED_TAGS,
     Cleaner,
 )

 # yyyymmdd
-__releasedate__ = "20210825"
+__releasedate__ = "20220407"
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = "4.1.0"
-VERSION = packaging.version.Version(__version__)
+__version__ = "5.0.0"

 __all__ = ["clean", "linkify"]
@@ -29,10 +23,10 @@ def clean(
     text,
     tags=ALLOWED_TAGS,
     attributes=ALLOWED_ATTRIBUTES,
-    styles=ALLOWED_STYLES,
     protocols=ALLOWED_PROTOCOLS,
     strip=False,
     strip_comments=True,
+    css_sanitizer=None,
 ):
     """Clean an HTML fragment of malicious content and return it
@@ -64,9 +58,6 @@ def clean(
     :arg dict attributes: allowed attributes; can be a callable, list or dict;
         defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

-    :arg list styles: allowed list of css styles; defaults to
-        ``bleach.sanitizer.ALLOWED_STYLES``
-
     :arg list protocols: allowed list of protocols for links; defaults
         to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
@@ -74,16 +65,19 @@ def clean(
     :arg bool strip_comments: whether or not to strip HTML comments

+    :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
+        sanitizing style attribute values and style text; defaults to None
+
     :returns: cleaned text as unicode

     """
     cleaner = Cleaner(
         tags=tags,
         attributes=attributes,
-        styles=styles,
         protocols=protocols,
         strip=strip,
         strip_comments=strip_comments,
+        css_sanitizer=css_sanitizer,
     )
     return cleaner.clean(text)


@@ -0,0 +1,20 @@
+Copyright (c) 2006-2013 James Graham and other contributors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1 @@
+46af966e33b6247ae1d57d9459115a3eb46cda9f809c9f14e052abc2fe8dacb2 parse.py

lib/bleach/css_sanitizer.py (new file)

@@ -0,0 +1,104 @@
+import tinycss2
+
+
+ALLOWED_CSS_PROPERTIES = frozenset(
+    (
+        "azimuth",
+        "background-color",
+        "border-bottom-color",
+        "border-collapse",
+        "border-color",
+        "border-left-color",
+        "border-right-color",
+        "border-top-color",
+        "clear",
+        "color",
+        "cursor",
+        "direction",
+        "display",
+        "elevation",
+        "float",
+        "font",
+        "font-family",
+        "font-size",
+        "font-style",
+        "font-variant",
+        "font-weight",
+        "height",
+        "letter-spacing",
+        "line-height",
+        "overflow",
+        "pause",
+        "pause-after",
+        "pause-before",
+        "pitch",
+        "pitch-range",
+        "richness",
+        "speak",
+        "speak-header",
+        "speak-numeral",
+        "speak-punctuation",
+        "speech-rate",
+        "stress",
+        "text-align",
+        "text-decoration",
+        "text-indent",
+        "unicode-bidi",
+        "vertical-align",
+        "voice-family",
+        "volume",
+        "white-space",
+        "width",
+    )
+)
+
+ALLOWED_SVG_PROPERTIES = frozenset(
+    (
+        "fill",
+        "fill-opacity",
+        "fill-rule",
+        "stroke",
+        "stroke-width",
+        "stroke-linecap",
+        "stroke-linejoin",
+        "stroke-opacity",
+    )
+)
+
+
+class CSSSanitizer:
+    def __init__(
+        self,
+        allowed_css_properties=ALLOWED_CSS_PROPERTIES,
+        allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
+    ):
+        self.allowed_css_properties = allowed_css_properties
+        self.allowed_svg_properties = allowed_svg_properties
+
+    def sanitize_css(self, style):
+        """Sanitizes css in style tags"""
+        parsed = tinycss2.parse_declaration_list(style)
+
+        if not parsed:
+            return ""
+
+        new_tokens = []
+        for token in parsed:
+            if token.type == "declaration":
+                if (
+                    token.lower_name in self.allowed_css_properties
+                    or token.lower_name in self.allowed_svg_properties
+                ):
+                    new_tokens.append(token)
+            elif token.type in ("comment", "whitespace"):
+                if new_tokens and new_tokens[-1].type != token.type:
+                    new_tokens.append(token)
+
+            # NOTE(willkg): We currently don't handle AtRule or ParseError and
+            # so both get silently thrown out
+
+        if not new_tokens:
+            return ""
+
+        return tinycss2.serialize(new_tokens).strip()
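
A quick sketch of the new class in isolation (my example, not part of the commit): declarations survive or get dropped purely by property name, checked against the frozensets above.

    from bleach.css_sanitizer import CSSSanitizer

    sanitizer = CSSSanitizer()  # defaults to ALLOWED_CSS_PROPERTIES / ALLOWED_SVG_PROPERTIES
    sanitizer.sanitize_css("color: red; float: left")       # both allowed -> kept
    sanitizer.sanitize_css("position: fixed; color: blue")  # "position" dropped, "color" kept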


@@ -36,6 +36,8 @@ from bleach._vendor.html5lib.filters.base import (
 )  # noqa: E402 module level import not at top of file
 from bleach._vendor.html5lib.filters.sanitizer import (
     allowed_protocols,
+    allowed_css_properties,
+    allowed_svg_properties,
 )  # noqa: E402 module level import not at top of file
 from bleach._vendor.html5lib.filters.sanitizer import (
     Filter as SanitizerFilter,
@@ -68,8 +70,10 @@ TAG_TOKEN_TYPES = {
     constants.tokenTypes["EndTag"],
     constants.tokenTypes["EmptyTag"],
 }
-CHARACTERS_TYPE = constants.tokenTypes["Characters"]
-PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
+TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
+TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
+TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
+TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

 #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
@@ -190,6 +194,48 @@ HTML_TAGS = [
 ]

+#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
+#: from mozilla on 2019.07.11
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS_BLOCK_LEVEL = frozenset(
+    [
+        "address",
+        "article",
+        "aside",
+        "blockquote",
+        "details",
+        "dialog",
+        "dd",
+        "div",
+        "dl",
+        "dt",
+        "fieldset",
+        "figcaption",
+        "figure",
+        "footer",
+        "form",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "header",
+        "hgroup",
+        "hr",
+        "li",
+        "main",
+        "nav",
+        "ol",
+        "p",
+        "pre",
+        "section",
+        "table",
+        "ul",
+    ]
+)
+

 class InputStreamWithMemory:
     """Wraps an HTMLInputStream to remember characters since last <
@@ -257,17 +303,20 @@ class BleachHTMLTokenizer(HTMLTokenizer):
     """Tokenizer that doesn't consume character entities"""

     def __init__(self, consume_entities=False, **kwargs):
-        super(BleachHTMLTokenizer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.consume_entities = consume_entities

         # Wrap the stream with one that remembers the history
         self.stream = InputStreamWithMemory(self.stream)

+        # Remember the last token emitted; needed for block element spacing
+        self.emitted_last_token = None
+
     def __iter__(self):
         last_error_token = None

-        for token in super(BleachHTMLTokenizer, self).__iter__():
+        for token in super().__iter__():
             if last_error_token is not None:
                 if (
                     last_error_token["data"] == "invalid-character-in-attribute-name"
@@ -309,12 +358,12 @@ class BleachHTMLTokenizer(HTMLTokenizer):
                     # If this is not an allowed tag, then we convert it to
                     # characters and it'll get escaped in the sanitizer.
                     token["data"] = self.stream.get_tag()
-                    token["type"] = CHARACTERS_TYPE
+                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                     last_error_token = None
                     yield token

-                elif token["type"] == PARSEERROR_TYPE:
+                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                     # If the token is a parse error, then let the last_error_token
                     # go, and make token the new last_error_token
                     yield last_error_token
@@ -329,7 +378,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
             # If the token is a ParseError, we hold on to it so we can get the
             # next token and potentially fix it.
-            if token["type"] == PARSEERROR_TYPE:
+            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                 last_error_token = token
                 continue
@@ -342,9 +391,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
         # If this tokenizer is set to consume entities, then we can let the
         # superclass do its thing.
         if self.consume_entities:
-            return super(BleachHTMLTokenizer, self).consumeEntity(
-                allowedChar, fromAttribute
-            )
+            return super().consumeEntity(allowedChar, fromAttribute)

         # If this tokenizer is set to not consume entities, then we don't want
         # to consume and convert them, so this overrides the html5lib tokenizer's
@@ -356,7 +403,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
             self.currentToken["data"][-1][1] += "&"

         else:
-            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
+            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

     def tagOpenState(self):
         # This state marks a < that is either a StartTag, EndTag, EmptyTag,
@@ -364,7 +411,7 @@ class BleachHTMLTokenizer(HTMLTokenizer):
         # we've collected so far and we do that by calling start_tag() on
         # the input stream wrapper.
         self.stream.start_tag()
-        return super(BleachHTMLTokenizer, self).tagOpenState()
+        return super().tagOpenState()

     def emitCurrentToken(self):
         token = self.currentToken
@@ -378,8 +425,18 @@ class BleachHTMLTokenizer(HTMLTokenizer):
             # allowed list, then it gets stripped or escaped. In both of these
             # cases it gets converted to a Characters token.
             if self.parser.strip:
-                # If we're stripping the token, we just throw in an empty
-                # string token.
+                if (
+                    self.emitted_last_token
+                    and token["type"] == TAG_TOKEN_TYPE_START
+                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
+                ):
+                    # If this is a block level tag we're stripping, we drop it
+                    # for a newline because that's what a browser would parse
+                    # it as
+                    new_data = "\n"
+                else:
+                    # For all other things being stripped, we throw in an empty
+                    # string token
                     new_data = ""

             else:
@@ -390,14 +447,15 @@ class BleachHTMLTokenizer(HTMLTokenizer):
                # string and use that.
                 new_data = self.stream.get_tag()

-            new_token = {"type": CHARACTERS_TYPE, "data": new_data}
+            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

-            self.currentToken = new_token
+            self.currentToken = self.emitted_last_token = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return

-        super(BleachHTMLTokenizer, self).emitCurrentToken()
+        self.emitted_last_token = self.currentToken
+        super().emitCurrentToken()


 class BleachHTMLParser(HTMLParser):
@@ -416,7 +474,7 @@ class BleachHTMLParser(HTMLParser):
         self.tags = [tag.lower() for tag in tags] if tags is not None else None
         self.strip = strip
         self.consume_entities = consume_entities
-        super(BleachHTMLParser, self).__init__(**kwargs)
+        super().__init__(**kwargs)

     def _parse(
         self, stream, innerHTML=False, container="div", scripting=True, **kwargs
@@ -514,13 +572,13 @@ def convert_entities(text):
 def match_entity(stream):
     """Returns first entity in stream or None if no entity exists

-    Note: For Bleach purposes, entities must start with a "&" and end with
-    a ";". This ignores ambiguous character entities that have no ";" at the
-    end.
+    Note: For Bleach purposes, entities must start with a "&" and end with a
+    ";". This ignores ambiguous character entities that have no ";" at the end.

     :arg stream: the character stream

-    :returns: ``None`` or the entity string without "&" or ";"
+    :returns: the entity string without "&" or ";" if it's a valid character
+        entity; ``None`` otherwise

     """
     # Nix the & at the beginning
@@ -559,9 +617,11 @@ def match_entity(stream):
     # Handle character entities
     while stream and stream[0] not in end_characters:
         c = stream.pop(0)
-        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
-            break
         possible_entity += c
+        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
+            # If it's not a prefix, then it's not an entity and we're
+            # out
+            return None

     if possible_entity and stream and stream[0] == ";":
         return possible_entity
@@ -642,15 +702,14 @@ class BleachHTMLSerializer(HTMLSerializer):
         in_tag = False
         after_equals = False

-        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
+        for stoken in super().serialize(treewalker, encoding):
             if in_tag:
                 if stoken == ">":
                     in_tag = False

                 elif after_equals:
                     if stoken != '"':
-                        for part in self.escape_base_amp(stoken):
-                            yield part
+                        yield from self.escape_base_amp(stoken)

                     after_equals = False

                 continue
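
Beyond the CHARACTERS_TYPE/PARSEERROR_TYPE renames and the super() cleanups, this file carries a behavior change: with strip=True, a stripped block-level start tag is now replaced by a newline instead of the empty string. A hedged sketch of what the diff implies (expected output, not verified against this tree):

    import bleach

    # <p> is in HTML_TAGS_BLOCK_LEVEL and not in the allowed tags, so stripping
    # it should leave "\n" where the start tag was, mimicking browser rendering.
    bleach.clean("first<p>second</p>", tags=[], strip=True)
    # expected: "first\nsecond"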


@@ -2,7 +2,6 @@ import re

 from bleach import callbacks as linkify_callbacks
 from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes


 #: List of default callbacks
@@ -155,7 +154,7 @@ class Linker:
             omit_optional_tags=False,
             # linkify does not sanitize
             sanitize=False,
-            # linkify alphabetizes
+            # linkify preserves attr order
             alphabetical_attributes=False,
         )
@@ -228,7 +227,7 @@ class LinkifyFilter(html5lib_shim.Filter):
         :arg re email_re: email matching regex

         """
-        super(LinkifyFilter, self).__init__(source)
+        super().__init__(source)

         self.callbacks = callbacks or []
         self.skip_tags = skip_tags or []
@@ -316,7 +315,6 @@ class LinkifyFilter(html5lib_shim.Filter):
                 else:
                     # Add an "a" tag for the new link
                     _text = attrs.pop("_text", "")
-                    attrs = alphabetize_attributes(attrs)
                     new_tokens.extend(
                         [
                             {"type": "StartTag", "name": "a", "data": attrs},
@@ -332,8 +330,7 @@ class LinkifyFilter(html5lib_shim.Filter):
             if end < len(text):
                 new_tokens.append({"type": "Characters", "data": text[end:]})

-            for new_token in new_tokens:
-                yield new_token
+            yield from new_tokens

             continue
@@ -439,8 +436,6 @@ class LinkifyFilter(html5lib_shim.Filter):
                     new_tokens.append({"type": "Characters", "data": prefix})

                 _text = attrs.pop("_text", "")
-                attrs = alphabetize_attributes(attrs)
-
                 new_tokens.extend(
                     [
                         {"type": "StartTag", "name": "a", "data": attrs},
@@ -460,8 +455,7 @@ class LinkifyFilter(html5lib_shim.Filter):
             if end < len(text):
                 new_tokens.append({"type": "Characters", "data": text[end:]})

-            for new_token in new_tokens:
-                yield new_token
+            yield from new_tokens

             continue
@@ -493,14 +487,13 @@ class LinkifyFilter(html5lib_shim.Filter):
         else:
             new_text = attrs.pop("_text", "")
-            a_token["data"] = alphabetize_attributes(attrs)
+            a_token["data"] = attrs

             if text == new_text:
                 # The callbacks didn't change the text, so we yield the new "a"
                 # token, then whatever else was there, then the end "a" token
                 yield a_token
-                for mem in token_buffer[1:]:
-                    yield mem
+                yield from token_buffer[1:]

             else:
                 # If the callbacks changed the text, then we're going to drop
@@ -516,7 +509,7 @@ class LinkifyFilter(html5lib_shim.Filter):
         token_buffer = []

-        for token in super(LinkifyFilter, self).__iter__():
+        for token in super().__iter__():
             if in_a:
                 # Handle the case where we're in an "a" tag--we want to buffer tokens
                 # until we hit an end "a" tag.
@@ -524,8 +517,7 @@ class LinkifyFilter(html5lib_shim.Filter):
                     # Add the end tag to the token buffer and then handle them
                     # and yield anything returned
                     token_buffer.append(token)
-                    for new_token in self.handle_a_tag(token_buffer):
-                        yield new_token
+                    yield from self.handle_a_tag(token_buffer)

                     # Clear "a" related state and continue since we've yielded all
                     # the tokens we're going to yield


@@ -6,7 +6,6 @@ from bleach._vendor.parse import urlparse
 from xml.sax.saxutils import unescape

 from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes


 #: List of allowed tags
@@ -33,9 +32,6 @@ ALLOWED_ATTRIBUTES = {
     "acronym": ["title"],
 }

-#: List of allowed styles
-ALLOWED_STYLES = []
-
 #: List of allowed protocols
 ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
@@ -85,11 +81,11 @@ class Cleaner:
         self,
         tags=ALLOWED_TAGS,
         attributes=ALLOWED_ATTRIBUTES,
-        styles=ALLOWED_STYLES,
         protocols=ALLOWED_PROTOCOLS,
         strip=False,
         strip_comments=True,
         filters=None,
+        css_sanitizer=None,
     ):
         """Initializes a Cleaner

@@ -99,9 +95,6 @@ class Cleaner:
         :arg dict attributes: allowed attributes; can be a callable, list or dict;
             defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

-        :arg list styles: allowed list of css styles; defaults to
-            ``bleach.sanitizer.ALLOWED_STYLES``
-
         :arg list protocols: allowed list of protocols for links; defaults
             to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
@@ -118,14 +111,17 @@ class Cleaner:
             Using filters changes the output of ``bleach.Cleaner.clean``.
             Make sure the way the filters change the output are secure.

+        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
+            sanitizing style attribute values and style text; defaults to None
+
         """
         self.tags = tags
         self.attributes = attributes
-        self.styles = styles
         self.protocols = protocols
         self.strip = strip
         self.strip_comments = strip_comments
         self.filters = filters or []
+        self.css_sanitizer = css_sanitizer

         self.parser = html5lib_shim.BleachHTMLParser(
             tags=self.tags,
@@ -143,7 +139,7 @@ class Cleaner:
             resolve_entities=False,
             # Bleach has its own sanitizer, so don't use the html5lib one
             sanitize=False,
-            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            # clean preserves attr order
             alphabetical_attributes=False,
         )
@@ -175,11 +171,10 @@ class Cleaner:
             attributes=self.attributes,
             strip_disallowed_elements=self.strip,
             strip_html_comments=self.strip_comments,
+            css_sanitizer=self.css_sanitizer,
             # html5lib-sanitizer things
             allowed_elements=self.tags,
-            allowed_css_properties=self.styles,
             allowed_protocols=self.protocols,
-            allowed_svg_properties=[],
         )

         # Apply any filters after the BleachSanitizerFilter
@@ -242,25 +237,25 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
     def __init__(
         self,
         source,
+        allowed_elements=ALLOWED_TAGS,
         attributes=ALLOWED_ATTRIBUTES,
+        allowed_protocols=ALLOWED_PROTOCOLS,
         strip_disallowed_elements=False,
         strip_html_comments=True,
+        css_sanitizer=None,
         **kwargs,
     ):
         """Creates a BleachSanitizerFilter instance

         :arg Treewalker source: stream

-        :arg list tags: allowed list of tags; defaults to
+        :arg list allowed_elements: allowed list of tags; defaults to
             ``bleach.sanitizer.ALLOWED_TAGS``

         :arg dict attributes: allowed attributes; can be a callable, list or dict;
             defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

-        :arg list styles: allowed list of css styles; defaults to
-            ``bleach.sanitizer.ALLOWED_STYLES``
-
-        :arg list protocols: allowed list of protocols for links; defaults
+        :arg list allowed_protocols: allowed list of protocols for links; defaults
             to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

         :arg bool strip_disallowed_elements: whether or not to strip disallowed
@@ -268,10 +263,14 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):

         :arg bool strip_html_comments: whether or not to strip HTML comments

+        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
+            sanitizing style attribute values and style text; defaults to None
+
         """
         self.attr_filter = attribute_filter_factory(attributes)
         self.strip_disallowed_elements = strip_disallowed_elements
         self.strip_html_comments = strip_html_comments
+        self.css_sanitizer = css_sanitizer

         # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
         warnings.filterwarnings(
@@ -280,7 +279,12 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
             category=DeprecationWarning,
             module="bleach._vendor.html5lib",
         )
-        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
+        return super().__init__(
+            source,
+            allowed_elements=allowed_elements,
+            allowed_protocols=allowed_protocols,
+            **kwargs,
+        )

     def sanitize_stream(self, token_iterator):
         for token in token_iterator:
@@ -290,8 +294,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                 continue

             if isinstance(ret, list):
-                for subtoken in ret:
-                    yield subtoken
+                yield from ret
             else:
                 yield ret
@@ -358,10 +361,6 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                 return None

             else:
-                if "data" in token:
-                    # Alphabetize the attributes before calling .disallowed_token()
-                    # so that the resulting string is stable
-                    token["data"] = alphabetize_attributes(token["data"])
                 return self.disallowed_token(token)

         elif token_type == "Comment":
@@ -547,12 +546,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):

             # If it's a style attribute, sanitize it
             if namespaced_name == (None, "style"):
-                val = self.sanitize_css(val)
+                if self.css_sanitizer:
+                    val = self.css_sanitizer.sanitize_css(val)
+                else:
+                    # FIXME(willkg): if style is allowed, but no
+                    # css_sanitizer was set up, then this is probably a
+                    # mistake and we should raise an error here
+                    #
+                    # For now, we're going to set the value to "" because
+                    # there was no sanitizer set
+                    val = ""

             # At this point, we want to keep the attribute, so add it in
             attrs[namespaced_name] = val

-        token["data"] = alphabetize_attributes(attrs)
+        token["data"] = attrs

         return token
@@ -575,7 +583,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                 if ns is None or ns not in html5lib_shim.prefixes:
                     namespaced_name = name
                 else:
-                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)
+                    namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name)

                 attrs.append(
                     ' %s="%s"'
@@ -587,7 +595,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                         v,
                     )
                 )
-            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))
+            token["data"] = "<{}{}>".format(token["name"], "".join(attrs))

         else:
             token["data"] = "<%s>" % token["name"]
@@ -599,47 +607,3 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
             del token["name"]

         return token
-
-    def sanitize_css(self, style):
-        """Sanitizes css in style tags"""
-        # Convert entities in the style so that it can be parsed as CSS
-        style = html5lib_shim.convert_entities(style)
-
-        # Drop any url values before we do anything else
-        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)
-
-        # The gauntlet of sanitization
-
-        # Validate the css in the style tag and if it's not valid, then drop
-        # the whole thing.
-        parts = style.split(";")
-        gauntlet = re.compile(
-            r"""^(  # consider a style attribute value as composed of:
-            [/:,#%!.\s\w]  # a non-newline character
-            |\w-\w  # 3 characters in the form \w-\w
-            |'[\s\w]+'\s*  # a single quoted string of [\s\w]+ with trailing space
-            |"[\s\w]+"  # a double quoted string of [\s\w]+
-            |\([\d,%\.\s]+\)  # a parenthesized string of one or more digits, commas, periods, ...
-            )*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
-            flags=re.U | re.VERBOSE,
-        )
-
-        for part in parts:
-            if not gauntlet.match(part):
-                return ""
-
-        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ""
-
-        clean = []
-        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
-            if not value:
-                continue
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ": " + value + ";")
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ": " + value + ";")
-
-        return " ".join(clean)


@@ -1,21 +0,0 @@
-from collections import OrderedDict
-
-
-def _attr_key(attr):
-    """Returns appropriate key for sorting attribute names
-
-    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
-    ``None`` or a string. These can't be compared in Python 3, so we conver the
-    ``None`` to an empty string.
-
-    """
-    key = (attr[0][0] or ""), attr[0][1]
-    return key
-
-
-def alphabetize_attributes(attrs):
-    """Takes a dict of attributes (or None) and returns them alphabetized"""
-    if not attrs:
-        return attrs
-
-    return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])
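
With alphabetize_attributes gone (and html5lib's alphabetical_attributes already disabled), clean() and linkify() now emit attributes in source order rather than sorted order. A sketch of the observable difference (expected, not run against this tree):

    import bleach

    bleach.clean('<img src="x.png" alt="x">', tags=["img"], attributes={"img": ["src", "alt"]})
    # 4.1.0: '<img alt="x" src="x.png">' (alphabetized)
    # 5.0.0 expected: '<img src="x.png" alt="x">' (source order preserved)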


@@ -5,7 +5,7 @@ backports.csv==1.0.7
 backports.functools-lru-cache==1.6.4
 backports.zoneinfo==0.2.1
 beautifulsoup4==4.10.0
-bleach==4.1.0
+bleach==5.0.0
 certifi==2021.10.8
 cheroot==8.6.0
 cherrypy==18.6.1