diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index d619fb2c..d271811d 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -import packaging.version - from bleach.linkifier import ( DEFAULT_CALLBACKS, Linker, @@ -9,17 +5,15 @@ from bleach.linkifier import ( from bleach.sanitizer import ( ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS, - ALLOWED_STYLES, ALLOWED_TAGS, Cleaner, ) # yyyymmdd -__releasedate__ = "20210825" +__releasedate__ = "20220407" # x.y.z or x.y.z.dev0 -- semver -__version__ = "4.1.0" -VERSION = packaging.version.Version(__version__) +__version__ = "5.0.0" __all__ = ["clean", "linkify"] @@ -29,10 +23,10 @@ def clean( text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, - styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True, + css_sanitizer=None, ): """Clean an HTML fragment of malicious content and return it @@ -64,9 +58,6 @@ def clean( :arg dict attributes: allowed attributes; can be a callable, list or dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` - :arg list styles: allowed list of css styles; defaults to - ``bleach.sanitizer.ALLOWED_STYLES`` - :arg list protocols: allowed list of protocols for links; defaults to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` @@ -74,16 +65,19 @@ def clean( :arg bool strip_comments: whether or not to strip HTML comments + :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for + sanitizing style attribute values and style text; defaults to None + :returns: cleaned text as unicode """ cleaner = Cleaner( tags=tags, attributes=attributes, - styles=styles, protocols=protocols, strip=strip, strip_comments=strip_comments, + css_sanitizer=css_sanitizer, ) return cleaner.clean(text) diff --git a/lib/bleach/_vendor/html5lib-1.1.dist-info/LICENSE b/lib/bleach/_vendor/html5lib-1.1.dist-info/LICENSE new file mode 100644 index 00000000..c87fa7a0 --- /dev/null +++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2006-2013 James Graham and other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/lib/bleach/_vendor/html5lib-1.1.dist-info/REQUESTED b/lib/bleach/_vendor/html5lib-1.1.dist-info/REQUESTED new file mode 100644 index 00000000..e69de29b diff --git a/lib/bleach/_vendor/parse.py.SHA256SUM b/lib/bleach/_vendor/parse.py.SHA256SUM new file mode 100644 index 00000000..a324b193 --- /dev/null +++ b/lib/bleach/_vendor/parse.py.SHA256SUM @@ -0,0 +1 @@ +46af966e33b6247ae1d57d9459115a3eb46cda9f809c9f14e052abc2fe8dacb2 parse.py diff --git a/lib/bleach/css_sanitizer.py b/lib/bleach/css_sanitizer.py new file mode 100644 index 00000000..489dd6bc --- /dev/null +++ b/lib/bleach/css_sanitizer.py @@ -0,0 +1,104 @@ +import tinycss2 + + +ALLOWED_CSS_PROPERTIES = frozenset( + ( + "azimuth", + "background-color", + "border-bottom-color", + "border-collapse", + "border-color", + "border-left-color", + "border-right-color", + "border-top-color", + "clear", + "color", + "cursor", + "direction", + "display", + "elevation", + "float", + "font", + "font-family", + "font-size", + "font-style", + "font-variant", + "font-weight", + "height", + "letter-spacing", + "line-height", + "overflow", + "pause", + "pause-after", + "pause-before", + "pitch", + "pitch-range", + "richness", + "speak", + "speak-header", + "speak-numeral", + "speak-punctuation", + "speech-rate", + "stress", + "text-align", + "text-decoration", + "text-indent", + "unicode-bidi", + "vertical-align", + "voice-family", + "volume", + "white-space", + "width", + ) +) + + +ALLOWED_SVG_PROPERTIES = frozenset( + ( + "fill", + "fill-opacity", + "fill-rule", + "stroke", + "stroke-width", + "stroke-linecap", + "stroke-linejoin", + "stroke-opacity", + ) +) + + +class CSSSanitizer: + def __init__( + self, + allowed_css_properties=ALLOWED_CSS_PROPERTIES, + allowed_svg_properties=ALLOWED_SVG_PROPERTIES, + ): + self.allowed_css_properties = allowed_css_properties + self.allowed_svg_properties = allowed_svg_properties + + def sanitize_css(self, style): + """Sanitizes css in style tags""" + parsed = tinycss2.parse_declaration_list(style) + + if not parsed: + return "" + + new_tokens = [] + for token in parsed: + if token.type == "declaration": + if ( + token.lower_name in self.allowed_css_properties + or token.lower_name in self.allowed_svg_properties + ): + new_tokens.append(token) + elif token.type in ("comment", "whitespace"): + if new_tokens and new_tokens[-1].type != token.type: + new_tokens.append(token) + + # NOTE(willkg): We currently don't handle AtRule or ParseError and + # so both get silently thrown out + + if not new_tokens: + return "" + + return tinycss2.serialize(new_tokens).strip() diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py index 3c9c3306..6fc90485 100644 --- a/lib/bleach/html5lib_shim.py +++ b/lib/bleach/html5lib_shim.py @@ -36,6 +36,8 @@ from bleach._vendor.html5lib.filters.base import ( ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( allowed_protocols, + allowed_css_properties, + allowed_svg_properties, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( Filter as SanitizerFilter, @@ -68,8 +70,10 @@ TAG_TOKEN_TYPES = { constants.tokenTypes["EndTag"], constants.tokenTypes["EmptyTag"], } -CHARACTERS_TYPE = constants.tokenTypes["Characters"] -PARSEERROR_TYPE = constants.tokenTypes["ParseError"] +TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"] +TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"] +TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"] +TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"] #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 @@ -190,6 +194,48 @@ HTML_TAGS = [ ] +#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369 +#: from mozilla on 2019.07.11 +#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements +HTML_TAGS_BLOCK_LEVEL = frozenset( + [ + "address", + "article", + "aside", + "blockquote", + "details", + "dialog", + "dd", + "div", + "dl", + "dt", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hgroup", + "hr", + "li", + "main", + "nav", + "ol", + "p", + "pre", + "section", + "table", + "ul", + ] +) + + class InputStreamWithMemory: """Wraps an HTMLInputStream to remember characters since last < @@ -257,17 +303,20 @@ class BleachHTMLTokenizer(HTMLTokenizer): """Tokenizer that doesn't consume character entities""" def __init__(self, consume_entities=False, **kwargs): - super(BleachHTMLTokenizer, self).__init__(**kwargs) + super().__init__(**kwargs) self.consume_entities = consume_entities # Wrap the stream with one that remembers the history self.stream = InputStreamWithMemory(self.stream) + # Remember the last token emitted; needed for block element spacing + self.emitted_last_token = None + def __iter__(self): last_error_token = None - for token in super(BleachHTMLTokenizer, self).__iter__(): + for token in super().__iter__(): if last_error_token is not None: if ( last_error_token["data"] == "invalid-character-in-attribute-name" @@ -309,12 +358,12 @@ class BleachHTMLTokenizer(HTMLTokenizer): # If this is not an allowed tag, then we convert it to # characters and it'll get escaped in the sanitizer. token["data"] = self.stream.get_tag() - token["type"] = CHARACTERS_TYPE + token["type"] = TAG_TOKEN_TYPE_CHARACTERS last_error_token = None yield token - elif token["type"] == PARSEERROR_TYPE: + elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR: # If the token is a parse error, then let the last_error_token # go, and make token the new last_error_token yield last_error_token @@ -329,7 +378,7 @@ class BleachHTMLTokenizer(HTMLTokenizer): # If the token is a ParseError, we hold on to it so we can get the # next token and potentially fix it. - if token["type"] == PARSEERROR_TYPE: + if token["type"] == TAG_TOKEN_TYPE_PARSEERROR: last_error_token = token continue @@ -342,9 +391,7 @@ class BleachHTMLTokenizer(HTMLTokenizer): # If this tokenizer is set to consume entities, then we can let the # superclass do its thing. if self.consume_entities: - return super(BleachHTMLTokenizer, self).consumeEntity( - allowedChar, fromAttribute - ) + return super().consumeEntity(allowedChar, fromAttribute) # If this tokenizer is set to not consume entities, then we don't want # to consume and convert them, so this overrides the html5lib tokenizer's @@ -356,7 +403,7 @@ class BleachHTMLTokenizer(HTMLTokenizer): self.currentToken["data"][-1][1] += "&" else: - self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"}) + self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"}) def tagOpenState(self): # This state marks a < that is either a StartTag, EndTag, EmptyTag, @@ -364,7 +411,7 @@ class BleachHTMLTokenizer(HTMLTokenizer): # we've collected so far and we do that by calling start_tag() on # the input stream wrapper. self.stream.start_tag() - return super(BleachHTMLTokenizer, self).tagOpenState() + return super().tagOpenState() def emitCurrentToken(self): token = self.currentToken @@ -378,9 +425,19 @@ class BleachHTMLTokenizer(HTMLTokenizer): # allowed list, then it gets stripped or escaped. In both of these # cases it gets converted to a Characters token. if self.parser.strip: - # If we're stripping the token, we just throw in an empty - # string token. - new_data = "" + if ( + self.emitted_last_token + and token["type"] == TAG_TOKEN_TYPE_START + and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL + ): + # If this is a block level tag we're stripping, we drop it + # for a newline because that's what a browser would parse + # it as + new_data = "\n" + else: + # For all other things being stripped, we throw in an empty + # string token + new_data = "" else: # If we're escaping the token, we want to escape the exact @@ -390,14 +447,15 @@ class BleachHTMLTokenizer(HTMLTokenizer): # string and use that. new_data = self.stream.get_tag() - new_token = {"type": CHARACTERS_TYPE, "data": new_data} + new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data} - self.currentToken = new_token + self.currentToken = self.emitted_last_token = new_token self.tokenQueue.append(new_token) self.state = self.dataState return - super(BleachHTMLTokenizer, self).emitCurrentToken() + self.emitted_last_token = self.currentToken + super().emitCurrentToken() class BleachHTMLParser(HTMLParser): @@ -416,7 +474,7 @@ class BleachHTMLParser(HTMLParser): self.tags = [tag.lower() for tag in tags] if tags is not None else None self.strip = strip self.consume_entities = consume_entities - super(BleachHTMLParser, self).__init__(**kwargs) + super().__init__(**kwargs) def _parse( self, stream, innerHTML=False, container="div", scripting=True, **kwargs @@ -514,13 +572,13 @@ def convert_entities(text): def match_entity(stream): """Returns first entity in stream or None if no entity exists - Note: For Bleach purposes, entities must start with a "&" and end with - a ";". This ignoresambiguous character entities that have no ";" at the - end. + Note: For Bleach purposes, entities must start with a "&" and end with a + ";". This ignores ambiguous character entities that have no ";" at the end. :arg stream: the character stream - :returns: ``None`` or the entity string without "&" or ";" + :returns: the entity string without "&" or ";" if it's a valid character + entity; ``None`` otherwise """ # Nix the & at the beginning @@ -559,9 +617,11 @@ def match_entity(stream): # Handle character entities while stream and stream[0] not in end_characters: c = stream.pop(0) - if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity): - break possible_entity += c + if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity): + # If it's not a prefix, then it's not an entity and we're + # out + return None if possible_entity and stream and stream[0] == ";": return possible_entity @@ -642,15 +702,14 @@ class BleachHTMLSerializer(HTMLSerializer): in_tag = False after_equals = False - for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding): + for stoken in super().serialize(treewalker, encoding): if in_tag: if stoken == ">": in_tag = False elif after_equals: if stoken != '"': - for part in self.escape_base_amp(stoken): - yield part + yield from self.escape_base_amp(stoken) after_equals = False continue diff --git a/lib/bleach/linkifier.py b/lib/bleach/linkifier.py index 759882e9..68a4042e 100644 --- a/lib/bleach/linkifier.py +++ b/lib/bleach/linkifier.py @@ -2,7 +2,6 @@ import re from bleach import callbacks as linkify_callbacks from bleach import html5lib_shim -from bleach.utils import alphabetize_attributes #: List of default callbacks @@ -155,7 +154,7 @@ class Linker: omit_optional_tags=False, # linkify does not sanitize sanitize=False, - # linkify alphabetizes + # linkify preserves attr order alphabetical_attributes=False, ) @@ -228,7 +227,7 @@ class LinkifyFilter(html5lib_shim.Filter): :arg re email_re: email matching regex """ - super(LinkifyFilter, self).__init__(source) + super().__init__(source) self.callbacks = callbacks or [] self.skip_tags = skip_tags or [] @@ -316,7 +315,6 @@ class LinkifyFilter(html5lib_shim.Filter): else: # Add an "a" tag for the new link _text = attrs.pop("_text", "") - attrs = alphabetize_attributes(attrs) new_tokens.extend( [ {"type": "StartTag", "name": "a", "data": attrs}, @@ -332,8 +330,7 @@ class LinkifyFilter(html5lib_shim.Filter): if end < len(text): new_tokens.append({"type": "Characters", "data": text[end:]}) - for new_token in new_tokens: - yield new_token + yield from new_tokens continue @@ -439,8 +436,6 @@ class LinkifyFilter(html5lib_shim.Filter): new_tokens.append({"type": "Characters", "data": prefix}) _text = attrs.pop("_text", "") - attrs = alphabetize_attributes(attrs) - new_tokens.extend( [ {"type": "StartTag", "name": "a", "data": attrs}, @@ -460,8 +455,7 @@ class LinkifyFilter(html5lib_shim.Filter): if end < len(text): new_tokens.append({"type": "Characters", "data": text[end:]}) - for new_token in new_tokens: - yield new_token + yield from new_tokens continue @@ -493,14 +487,13 @@ class LinkifyFilter(html5lib_shim.Filter): else: new_text = attrs.pop("_text", "") - a_token["data"] = alphabetize_attributes(attrs) + a_token["data"] = attrs if text == new_text: # The callbacks didn't change the text, so we yield the new "a" # token, then whatever else was there, then the end "a" token yield a_token - for mem in token_buffer[1:]: - yield mem + yield from token_buffer[1:] else: # If the callbacks changed the text, then we're going to drop @@ -516,7 +509,7 @@ class LinkifyFilter(html5lib_shim.Filter): token_buffer = [] - for token in super(LinkifyFilter, self).__iter__(): + for token in super().__iter__(): if in_a: # Handle the case where we're in an "a" tag--we want to buffer tokens # until we hit an end "a" tag. @@ -524,8 +517,7 @@ class LinkifyFilter(html5lib_shim.Filter): # Add the end tag to the token buffer and then handle them # and yield anything returned token_buffer.append(token) - for new_token in self.handle_a_tag(token_buffer): - yield new_token + yield from self.handle_a_tag(token_buffer) # Clear "a" related state and continue since we've yielded all # the tokens we're going to yield diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py index 89aff1f4..0816cfd0 100644 --- a/lib/bleach/sanitizer.py +++ b/lib/bleach/sanitizer.py @@ -6,7 +6,6 @@ from bleach._vendor.parse import urlparse from xml.sax.saxutils import unescape from bleach import html5lib_shim -from bleach.utils import alphabetize_attributes #: List of allowed tags @@ -33,9 +32,6 @@ ALLOWED_ATTRIBUTES = { "acronym": ["title"], } -#: List of allowed styles -ALLOWED_STYLES = [] - #: List of allowed protocols ALLOWED_PROTOCOLS = ["http", "https", "mailto"] @@ -85,11 +81,11 @@ class Cleaner: self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, - styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False, strip_comments=True, filters=None, + css_sanitizer=None, ): """Initializes a Cleaner @@ -99,9 +95,6 @@ class Cleaner: :arg dict attributes: allowed attributes; can be a callable, list or dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` - :arg list styles: allowed list of css styles; defaults to - ``bleach.sanitizer.ALLOWED_STYLES`` - :arg list protocols: allowed list of protocols for links; defaults to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` @@ -118,14 +111,17 @@ class Cleaner: Using filters changes the output of ``bleach.Cleaner.clean``. Make sure the way the filters change the output are secure. + :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for + sanitizing style attribute values and style text; defaults to None + """ self.tags = tags self.attributes = attributes - self.styles = styles self.protocols = protocols self.strip = strip self.strip_comments = strip_comments self.filters = filters or [] + self.css_sanitizer = css_sanitizer self.parser = html5lib_shim.BleachHTMLParser( tags=self.tags, @@ -143,7 +139,7 @@ class Cleaner: resolve_entities=False, # Bleach has its own sanitizer, so don't use the html5lib one sanitize=False, - # Bleach sanitizer alphabetizes already, so don't use the html5lib one + # clean preserves attr order alphabetical_attributes=False, ) @@ -175,11 +171,10 @@ class Cleaner: attributes=self.attributes, strip_disallowed_elements=self.strip, strip_html_comments=self.strip_comments, + css_sanitizer=self.css_sanitizer, # html5lib-sanitizer things allowed_elements=self.tags, - allowed_css_properties=self.styles, allowed_protocols=self.protocols, - allowed_svg_properties=[], ) # Apply any filters after the BleachSanitizerFilter @@ -242,25 +237,25 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): def __init__( self, source, + allowed_elements=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, + allowed_protocols=ALLOWED_PROTOCOLS, strip_disallowed_elements=False, strip_html_comments=True, + css_sanitizer=None, **kwargs, ): """Creates a BleachSanitizerFilter instance :arg Treewalker source: stream - :arg list tags: allowed list of tags; defaults to + :arg list allowed_elements: allowed list of tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES`` - :arg list styles: allowed list of css styles; defaults to - ``bleach.sanitizer.ALLOWED_STYLES`` - - :arg list protocols: allowed list of protocols for links; defaults + :arg list allowed_protocols: allowed list of protocols for links; defaults to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` :arg bool strip_disallowed_elements: whether or not to strip disallowed @@ -268,10 +263,14 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): :arg bool strip_html_comments: whether or not to strip HTML comments + :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for + sanitizing style attribute values and style text; defaults to None + """ self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_elements = strip_disallowed_elements self.strip_html_comments = strip_html_comments + self.css_sanitizer = css_sanitizer # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init warnings.filterwarnings( @@ -280,7 +279,12 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): category=DeprecationWarning, module="bleach._vendor.html5lib", ) - return super(BleachSanitizerFilter, self).__init__(source, **kwargs) + return super().__init__( + source, + allowed_elements=allowed_elements, + allowed_protocols=allowed_protocols, + **kwargs, + ) def sanitize_stream(self, token_iterator): for token in token_iterator: @@ -290,8 +294,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): continue if isinstance(ret, list): - for subtoken in ret: - yield subtoken + yield from ret else: yield ret @@ -358,10 +361,6 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): return None else: - if "data" in token: - # Alphabetize the attributes before calling .disallowed_token() - # so that the resulting string is stable - token["data"] = alphabetize_attributes(token["data"]) return self.disallowed_token(token) elif token_type == "Comment": @@ -547,12 +546,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): # If it's a style attribute, sanitize it if namespaced_name == (None, "style"): - val = self.sanitize_css(val) + if self.css_sanitizer: + val = self.css_sanitizer.sanitize_css(val) + else: + # FIXME(willkg): if style is allowed, but no + # css_sanitizer was set up, then this is probably a + # mistake and we should raise an error here + # + # For now, we're going to set the value to "" because + # there was no sanitizer set + val = "" # At this point, we want to keep the attribute, so add it in attrs[namespaced_name] = val - token["data"] = alphabetize_attributes(attrs) + token["data"] = attrs return token @@ -575,7 +583,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): if ns is None or ns not in html5lib_shim.prefixes: namespaced_name = name else: - namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name) + namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name) attrs.append( ' %s="%s"' @@ -587,7 +595,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): v, ) ) - token["data"] = "<%s%s>" % (token["name"], "".join(attrs)) + token["data"] = "<{}{}>".format(token["name"], "".join(attrs)) else: token["data"] = "<%s>" % token["name"] @@ -599,47 +607,3 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): del token["name"] return token - - def sanitize_css(self, style): - """Sanitizes css in style tags""" - # Convert entities in the style so that it can be parsed as CSS - style = html5lib_shim.convert_entities(style) - - # Drop any url values before we do anything else - style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style) - - # The gauntlet of sanitization - - # Validate the css in the style tag and if it's not valid, then drop - # the whole thing. - parts = style.split(";") - gauntlet = re.compile( - r"""^( # consider a style attribute value as composed of: -[/:,#%!.\s\w] # a non-newline character -|\w-\w # 3 characters in the form \w-\w -|'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space -|"[\s\w]+" # a double quoted string of [\s\w]+ -|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ... -)*$""", # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)' - flags=re.U | re.VERBOSE, - ) - - for part in parts: - if not gauntlet.match(part): - return "" - - if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): - return "" - - clean = [] - for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): - if not value: - continue - - if prop.lower() in self.allowed_css_properties: - clean.append(prop + ": " + value + ";") - - elif prop.lower() in self.allowed_svg_properties: - clean.append(prop + ": " + value + ";") - - return " ".join(clean) diff --git a/lib/bleach/utils.py b/lib/bleach/utils.py deleted file mode 100644 index 6be59f6f..00000000 --- a/lib/bleach/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -from collections import OrderedDict - - -def _attr_key(attr): - """Returns appropriate key for sorting attribute names - - Attribute names are a tuple of ``(namespace, name)`` where namespace can be - ``None`` or a string. These can't be compared in Python 3, so we conver the - ``None`` to an empty string. - - """ - key = (attr[0][0] or ""), attr[0][1] - return key - - -def alphabetize_attributes(attrs): - """Takes a dict of attributes (or None) and returns them alphabetized""" - if not attrs: - return attrs - - return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)]) diff --git a/requirements.txt b/requirements.txt index b91e719f..5e22dccb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ backports.csv==1.0.7 backports.functools-lru-cache==1.6.4 backports.zoneinfo==0.2.1 beautifulsoup4==4.10.0 -bleach==4.1.0 +bleach==5.0.0 certifi==2021.10.8 cheroot==8.6.0 cherrypy==18.6.1