diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index 4e87eb80..12e93b4d 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -11,9 +11,9 @@ from bleach.sanitizer import ( # yyyymmdd -__releasedate__ = "20230123" +__releasedate__ = "20231006" # x.y.z or x.y.z.dev0 -- semver -__version__ = "6.0.0" +__version__ = "6.1.0" __all__ = ["clean", "linkify"] diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py index aa5189b1..ca1cc8c8 100644 --- a/lib/bleach/html5lib_shim.py +++ b/lib/bleach/html5lib_shim.py @@ -395,10 +395,17 @@ class BleachHTMLTokenizer(HTMLTokenizer): # followed by a series of characters. It's treated as a tag # name that abruptly ends, but we should treat that like # character data - yield { - "type": TAG_TOKEN_TYPE_CHARACTERS, - "data": "<" + self.currentToken["name"], - } + yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} + elif last_error_token["data"] in ( + "eof-in-attribute-name", + "eof-in-attribute-value-no-quotes", + ): + # Handle the case where the text being parsed ends with < + # followed by a series of characters and then space and then + # more characters. It's treated as a tag name followed by an + # attribute that abruptly ends, but we should treat that like + # character data. + yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} else: yield last_error_token diff --git a/lib/bleach/linkifier.py b/lib/bleach/linkifier.py index 679d7ead..8fcefb2c 100644 --- a/lib/bleach/linkifier.py +++ b/lib/bleach/linkifier.py @@ -45,8 +45,8 @@ def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols): r"""\(* # Match any opening parentheses. \b(?"]*)? - # /path/zz (excluding "unsafe" chars from RFC 1738, + (?:[/?][^\s\{{\}}\|\\\^`<>"]*)? + # /path/zz (excluding "unsafe" chars from RFC 3986, # except for # and ~, which happen in practice) """.format( "|".join(sorted(protocols)), "|".join(sorted(tlds)) @@ -591,7 +591,7 @@ class LinkifyFilter(html5lib_shim.Filter): in_a = False token_buffer = [] else: - token_buffer.append(token) + token_buffer.extend(list(self.extract_entities(token))) continue if token["type"] in ["StartTag", "EmptyTag"]: diff --git a/requirements.txt b/requirements.txt index 7b0b98a8..8c8e91f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ backports.csv==1.0.7 backports.functools-lru-cache==1.6.6 backports.zoneinfo==0.2.1;python_version<"3.9" beautifulsoup4==4.12.2 -bleach==6.0.0 +bleach==6.1.0 certifi==2023.7.22 cheroot==10.0.0 cherrypy==18.8.0