diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index 12e93b4d..942987d9 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -11,9 +11,9 @@ from bleach.sanitizer import ( # yyyymmdd -__releasedate__ = "20231006" +__releasedate__ = "20241029" # x.y.z or x.y.z.dev0 -- semver -__version__ = "6.1.0" +__version__ = "6.2.0" __all__ = ["clean", "linkify"] diff --git a/lib/bleach/_vendor/html5lib/_inputstream.py b/lib/bleach/_vendor/html5lib/_inputstream.py index 0207dd21..09762517 100644 --- a/lib/bleach/_vendor/html5lib/_inputstream.py +++ b/lib/bleach/_vendor/html5lib/_inputstream.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type -from six.moves import http_client, urllib +from bleach.six_shim import text_type +from bleach.six_shim import http_client, urllib import codecs import re diff --git a/lib/bleach/_vendor/html5lib/_tokenizer.py b/lib/bleach/_vendor/html5lib/_tokenizer.py index 4748a197..d8848016 100644 --- a/lib/bleach/_vendor/html5lib/_tokenizer.py +++ b/lib/bleach/_vendor/html5lib/_tokenizer.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from six import unichr as chr +from bleach.six_shim import unichr as chr from collections import deque, OrderedDict from sys import version_info diff --git a/lib/bleach/_vendor/html5lib/_trie/py.py b/lib/bleach/_vendor/html5lib/_trie/py.py index c2ba3da7..56f66bd5 100644 --- a/lib/bleach/_vendor/html5lib/_trie/py.py +++ b/lib/bleach/_vendor/html5lib/_trie/py.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from bleach.six_shim import text_type from bisect import bisect_left diff --git a/lib/bleach/_vendor/html5lib/_utils.py b/lib/bleach/_vendor/html5lib/_utils.py index 9ea57942..635bb024 100644 --- a/lib/bleach/_vendor/html5lib/_utils.py +++ b/lib/bleach/_vendor/html5lib/_utils.py @@ -7,7 +7,7 @@ try: except ImportError: from collections import Mapping -from six import text_type, PY3 +from bleach.six_shim import text_type, PY3 if PY3: import xml.etree.ElementTree as default_etree diff --git a/lib/bleach/_vendor/html5lib/filters/lint.py b/lib/bleach/_vendor/html5lib/filters/lint.py index acd4d7a2..1340d972 100644 --- a/lib/bleach/_vendor/html5lib/filters/lint.py +++ b/lib/bleach/_vendor/html5lib/filters/lint.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from bleach.six_shim import text_type from . import base from ..constants import namespaces, voidElements diff --git a/lib/bleach/_vendor/html5lib/filters/sanitizer.py b/lib/bleach/_vendor/html5lib/filters/sanitizer.py index 70ef9066..5c31e974 100644 --- a/lib/bleach/_vendor/html5lib/filters/sanitizer.py +++ b/lib/bleach/_vendor/html5lib/filters/sanitizer.py @@ -12,7 +12,7 @@ import re import warnings from xml.sax.saxutils import escape, unescape -from six.moves import urllib_parse as urlparse +from bleach.six_shim import urllib_parse as urlparse from . import base from ..constants import namespaces, prefixes diff --git a/lib/bleach/_vendor/html5lib/html5parser.py b/lib/bleach/_vendor/html5lib/html5parser.py index 74d829d9..5427b7dd 100644 --- a/lib/bleach/_vendor/html5lib/html5parser.py +++ b/lib/bleach/_vendor/html5lib/html5parser.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys +from bleach.six_shim import viewkeys import types @@ -423,7 +423,7 @@ def getPhases(debug): return type # pylint:disable=unused-argument - class Phase(with_metaclass(getMetaclass(debug, log))): + class Phase(metaclass=getMetaclass(debug, log)): """Base class for helper object that implements each phase of processing """ __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") diff --git a/lib/bleach/_vendor/html5lib/serializer.py b/lib/bleach/_vendor/html5lib/serializer.py index c66df683..5666f49a 100644 --- a/lib/bleach/_vendor/html5lib/serializer.py +++ b/lib/bleach/_vendor/html5lib/serializer.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from bleach.six_shim import text_type import re diff --git a/lib/bleach/_vendor/html5lib/treebuilders/base.py b/lib/bleach/_vendor/html5lib/treebuilders/base.py index e4a3d710..2869da00 100644 --- a/lib/bleach/_vendor/html5lib/treebuilders/base.py +++ b/lib/bleach/_vendor/html5lib/treebuilders/base.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from bleach.six_shim import text_type from ..constants import scopingElements, tableInsertModeElements, namespaces diff --git a/lib/bleach/_vendor/html5lib/treebuilders/etree.py b/lib/bleach/_vendor/html5lib/treebuilders/etree.py index 086bed4e..5ccfc4d6 100644 --- a/lib/bleach/_vendor/html5lib/treebuilders/etree.py +++ b/lib/bleach/_vendor/html5lib/treebuilders/etree.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, unicode_literals # pylint:disable=protected-access -from six import text_type +from bleach.six_shim import text_type import re diff --git a/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py b/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py index e73de61a..f4622322 100644 --- a/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py +++ b/lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py @@ -28,7 +28,7 @@ from . import etree as etree_builders from .. import _ihatexml import lxml.etree as etree -from six import PY3, binary_type +from bleach.six_shim import PY3, binary_type fullTree = True diff --git a/lib/bleach/_vendor/html5lib/treewalkers/etree.py b/lib/bleach/_vendor/html5lib/treewalkers/etree.py index 44653372..a9d9450c 100644 --- a/lib/bleach/_vendor/html5lib/treewalkers/etree.py +++ b/lib/bleach/_vendor/html5lib/treewalkers/etree.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, unicode_literals from collections import OrderedDict import re -from six import string_types +from bleach.six_shim import string_types from . import base from .._utils import moduleFactoryFactory diff --git a/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py b/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py index a614ac5b..ef42163b 100644 --- a/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py +++ b/lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from bleach.six_shim import text_type from collections import OrderedDict diff --git a/lib/bleach/_vendor/vendor_install.sh b/lib/bleach/_vendor/vendor_install.sh index 6e61c348..6c896ee4 100644 --- a/lib/bleach/_vendor/vendor_install.sh +++ b/lib/bleach/_vendor/vendor_install.sh @@ -7,8 +7,12 @@ set -o pipefail BLEACH_VENDOR_DIR=${BLEACH_VENDOR_DIR:-"."} DEST=${DEST:-"."} +# Install with no dependencies pip install --no-binary all --no-compile --no-deps -r "${BLEACH_VENDOR_DIR}/vendor.txt" --target "${DEST}" +# Apply patches +(cd "${DEST}" && patch -p2 < 01_html5lib_six.patch) + # install Python 3.6.14 urllib.urlparse for #536 curl --proto '=https' --tlsv1.2 -o "${DEST}/parse.py" https://raw.githubusercontent.com/python/cpython/v3.6.14/Lib/urllib/parse.py (cd "${DEST}" && sha256sum parse.py > parse.py.SHA256SUM) diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py index ca1cc8c8..f083db75 100644 --- a/lib/bleach/html5lib_shim.py +++ b/lib/bleach/html5lib_shim.py @@ -396,16 +396,25 @@ class BleachHTMLTokenizer(HTMLTokenizer): # name that abruptly ends, but we should treat that like # character data yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} + elif last_error_token["data"] in ( + "duplicate-attribute", "eof-in-attribute-name", "eof-in-attribute-value-no-quotes", + "expected-end-of-tag-but-got-eof", ): # Handle the case where the text being parsed ends with < - # followed by a series of characters and then space and then - # more characters. It's treated as a tag name followed by an + # followed by characters and then space and then: + # + # * more characters + # * more characters repeated with a space between (e.g. "abc abc") + # * more characters and then a space and then an EOF (e.g. "abc def ") + # + # These cases are treated as a tag name followed by an # attribute that abruptly ends, but we should treat that like - # character data. + # character data instead. yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} + else: yield last_error_token diff --git a/lib/bleach/six_shim.py b/lib/bleach/six_shim.py new file mode 100644 index 00000000..7db96011 --- /dev/null +++ b/lib/bleach/six_shim.py @@ -0,0 +1,19 @@ +""" +Replacement module for what html5lib uses six for. +""" + +import http.client +import operator +import urllib + + +PY3 = True +binary_type = bytes +string_types = (str,) +text_type = str +unichr = chr +viewkeys = operator.methodcaller("keys") + +http_client = http.client +urllib = urllib +urllib_parse = urllib.parse diff --git a/requirements.txt b/requirements.txt index f3a1cad8..abee3abd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ apscheduler==3.10.1 arrow==1.3.0 beautifulsoup4==4.12.3 -bleach==6.1.0 +bleach==6.2.0 certifi==2024.8.30 cheroot==10.0.1 cherrypy==18.10.0