diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index d271811d..bbcc2e03 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -11,9 +11,9 @@ from bleach.sanitizer import ( # yyyymmdd -__releasedate__ = "20220407" +__releasedate__ = "20220627" # x.y.z or x.y.z.dev0 -- semver -__version__ = "5.0.0" +__version__ = "5.0.1" __all__ = ["clean", "linkify"] diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py index 6fc90485..d121953b 100644 --- a/lib/bleach/html5lib_shim.py +++ b/lib/bleach/html5lib_shim.py @@ -385,7 +385,17 @@ class BleachHTMLTokenizer(HTMLTokenizer): yield token if last_error_token: - yield last_error_token + if last_error_token["data"] == "eof-in-tag-name": + # Handle the case where the text being parsed ends with < + # followed by a series of characters. It's treated as a tag + # name that abruptly ends, but we should treat that like + # character data + yield { + "type": TAG_TOKEN_TYPE_CHARACTERS, + "data": "<" + self.currentToken["name"], + } + else: + yield last_error_token def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the diff --git a/lib/bleach/linkifier.py b/lib/bleach/linkifier.py index 68a4042e..b3b83e62 100644 --- a/lib/bleach/linkifier.py +++ b/lib/bleach/linkifier.py @@ -1,5 +1,7 @@ import re +from urllib.parse import quote + from bleach import callbacks as linkify_callbacks from bleach import html5lib_shim @@ -124,11 +126,11 @@ class Linker: :arg bool parse_email: whether or not to linkify email addresses - :arg re url_re: url matching regex + :arg url_re: url matching regex - :arg re email_re: email matching regex + :arg email_re: email matching regex - :arg list-of-strings recognized_tags: the list of tags that linkify knows about; + :arg list recognized_tags: the list of tags that linkify knows about; everything else gets escaped :returns: linkified text as unicode @@ -211,7 +213,7 @@ class 
LinkifyFilter(html5lib_shim.Filter): ): """Creates a LinkifyFilter instance - :arg TreeWalker source: stream + :arg source: stream as an html5lib TreeWalker :arg list callbacks: list of callbacks to run when adjusting tag attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` @@ -222,9 +224,9 @@ class LinkifyFilter(html5lib_shim.Filter): :arg bool parse_email: whether or not to linkify email addresses - :arg re url_re: url matching regex + :arg url_re: url matching regex - :arg re email_re: email matching regex + :arg email_re: email matching regex """ super().__init__(source) @@ -298,10 +300,15 @@ class LinkifyFilter(html5lib_shim.Filter): {"type": "Characters", "data": text[end : match.start()]} ) + # URL-encode the "local-part" according to RFC6068 + parts = match.group(0).split("@") + parts[0] = quote(parts[0]) + address = "@".join(parts) + # Run attributes through the callbacks to see what we # should do with this match attrs = { - (None, "href"): "mailto:%s" % match.group(0), + (None, "href"): "mailto:%s" % address, "_text": match.group(0), } attrs = self.apply_callbacks(attrs, True) diff --git a/lib/bleach/parse_shim.py b/lib/bleach/parse_shim.py new file mode 100644 index 00000000..c7ce2d44 --- /dev/null +++ b/lib/bleach/parse_shim.py @@ -0,0 +1 @@ +from bleach._vendor.parse import urlparse # noqa diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py index 0816cfd0..6527ac03 100644 --- a/lib/bleach/sanitizer.py +++ b/lib/bleach/sanitizer.py @@ -2,10 +2,10 @@ from itertools import chain import re import warnings -from bleach._vendor.parse import urlparse from xml.sax.saxutils import unescape from bleach import html5lib_shim +from bleach import parse_shim #: List of allowed tags @@ -247,7 +247,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): ): """Creates a BleachSanitizerFilter instance - :arg Treewalker source: stream + :arg source: stream as an html5lib TreeWalker :arg list allowed_elements: allowed list 
of tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` @@ -449,27 +449,27 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): :returns: allowed value or None """ - # NOTE(willkg): This transforms the value into one that's easier to - # match and verify, but shouldn't get returned since it's vastly - # different than the original value. + # NOTE(willkg): This transforms the value into a normalized one that's + # easier to match and verify, but shouldn't get returned since it's + # vastly different than the original value. # Convert all character entities in the value - new_value = html5lib_shim.convert_entities(value) + normalized_uri = html5lib_shim.convert_entities(value) # Nix backtick, space characters, and control characters - new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value) + normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri) # Remove REPLACEMENT characters - new_value = new_value.replace("\ufffd", "") + normalized_uri = normalized_uri.replace("\ufffd", "") # Lowercase it--this breaks the value, but makes it easier to match # against - new_value = new_value.lower() + normalized_uri = normalized_uri.lower() try: # Drop attributes with uri values that have protocols that aren't # allowed - parsed = urlparse(new_value) + parsed = parse_shim.urlparse(normalized_uri) except ValueError: # URI is impossible to parse, therefore it's not allowed return None @@ -481,16 +481,19 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): else: # Allow uris that are just an anchor - if new_value.startswith("#"): + if normalized_uri.startswith("#"): return value # Handle protocols that urlparse doesn't recognize like "myprotocol" - if ":" in new_value and new_value.split(":")[0] in allowed_protocols: + if ( + ":" in normalized_uri + and normalized_uri.split(":")[0] in allowed_protocols + ): return value - # If there's no protocol/scheme specified, then assume it's "http" - # and see if that's allowed - if "http" in 
allowed_protocols: + # If there's no protocol/scheme specified, then assume it's "http" or + # "https" and see if that's allowed + if "http" in allowed_protocols or "https" in allowed_protocols: return value return None diff --git a/requirements.txt b/requirements.txt index 2e5930f3..a949d18b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ backports.csv==1.0.7 backports.functools-lru-cache==1.6.4 backports.zoneinfo==0.2.1 beautifulsoup4==4.11.1 -bleach==5.0.0 +bleach==5.0.1 certifi==2022.9.24 cheroot==8.6.0 cherrypy==18.6.1