Bump bleach from 5.0.1 to 6.0.0 (#1979)

* Bump bleach from 5.0.1 to 6.0.0

Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.1 to 6.0.0.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/compare/v5.0.1...v6.0.0)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==6.0.0

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
commit 1466a391d1 (parent 6b1b6d0f32)
dependabot[bot] authored 2023-03-02 20:55:01 -08:00, committed by GitHub
5 changed files with 291 additions and 198 deletions

@@ -120,9 +120,10 @@ class Linker:
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags; ``None`` means you don't
+            want linkify to skip any tags
 
         :arg bool parse_email: whether or not to linkify email addresses
 
@@ -130,7 +131,7 @@ class Linker:
         :arg email_re: email matching regex
 
-        :arg list recognized_tags: the list of tags that linkify knows about;
+        :arg set recognized_tags: the set of tags that linkify knows about;
             everything else gets escaped
 
         :returns: linkified text as unicode
 
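The docstring updates in these two hunks track the visible API change in bleach 6.0.0: ``skip_tags`` and ``recognized_tags`` are now documented as sets rather than lists. A minimal caller-side sketch of the new style (hypothetical usage, not part of this diff):

```python
from bleach.linkifier import Linker

# bleach 5.0.1 documented lists: Linker(skip_tags=["pre"])
# bleach 6.0.0 documents sets instead:
linker = Linker(skip_tags={"pre"})
html = linker.linkify("see <pre>http://example.com</pre> and http://example.com")
# expected: the URL inside <pre> stays plain text; the bare one becomes a link
```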
@@ -145,15 +146,18 @@ class Linker:
         # Create a parser/tokenizer that allows all HTML tags and escapes
         # anything not in that list.
         self.parser = html5lib_shim.BleachHTMLParser(
-            tags=recognized_tags,
+            tags=frozenset(recognized_tags),
             strip=False,
-            consume_entities=True,
+            consume_entities=False,
             namespaceHTMLElements=False,
         )
         self.walker = html5lib_shim.getTreeWalker("etree")
         self.serializer = html5lib_shim.BleachHTMLSerializer(
             quote_attr_values="always",
             omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
             # linkify does not sanitize
             sanitize=False,
             # linkify preserves attr order
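Together, ``consume_entities=False`` on the parser and ``resolve_entities=False`` on the serializer mean pre-existing entities pass through linkification untouched instead of being resolved during tokenizing and re-escaped on output. A quick sketch of the intended round-trip behavior (expected output inferred from these settings, not a captured test run):

```python
import bleach

# Entities in the input should survive linkify() unchanged.
print(bleach.linkify("tom &amp; jerry"))  # expected: tom &amp; jerry
print(bleach.linkify("x &lt; y"))         # expected: x &lt; y
```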
@@ -218,8 +222,8 @@ class LinkifyFilter(html5lib_shim.Filter):
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
             linkifying contents of ``pre`` tags
 
         :arg bool parse_email: whether or not to linkify email addresses
@@ -232,7 +236,7 @@ class LinkifyFilter(html5lib_shim.Filter):
         super().__init__(source)
 
         self.callbacks = callbacks or []
-        self.skip_tags = skip_tags or []
+        self.skip_tags = skip_tags or {}
         self.parse_email = parse_email
 
         self.url_re = url_re
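One subtlety in this hunk: the new default ``{}`` is an empty dict, not an empty set, but the filter only ever performs ``in`` membership tests against ``skip_tags``, and those behave identically for both. A tiny illustration of why the distinction is harmless:

```python
skip_tags = None or {}       # what the constructor does when skip_tags is None
print("pre" in skip_tags)    # False -- membership test on an empty dict
print("pre" in {"pre"})      # True  -- same test on a real set
```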
@@ -510,6 +514,62 @@ class LinkifyFilter(html5lib_shim.Filter):
                 yield {"type": "Characters", "data": str(new_text)}
 
             yield token_buffer[-1]
 
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerfilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
     def __iter__(self):
         in_a = False
         in_skip_tag = None
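Reading the new generator above, a Characters token containing entities should come out split into Entity and Characters tokens, with ``&amp;`` special-cased down to a bare ``&``. A sketch of the expected split, calling the method directly with a hand-built token (output inferred from the code, not a verified run):

```python
from bleach import linkifier

# LinkifyFilter only needs a token source at construction time; an empty
# iterator is enough to exercise extract_entities by hand.
f = linkifier.LinkifyFilter(source=iter(()))
tokens = list(f.extract_entities({"type": "Characters", "data": "a &lt; b &amp; c"}))
# expected:
# [{'type': 'Characters', 'data': 'a '},
#  {'type': 'Entity', 'name': 'lt'},
#  {'type': 'Characters', 'data': ' b '},
#  {'type': 'Characters', 'data': '&'},   # the &amp; special case
#  {'type': 'Characters', 'data': ' c'}]
print(tokens)
```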
@@ -564,8 +624,8 @@ class LinkifyFilter(html5lib_shim.Filter):
                     new_stream = self.handle_links(new_stream)
 
-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
 
                 # We've already yielded this token, so continue
                 continue
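This final hunk routes every linkified stream through ``extract_entities``, which is what keeps ``&amp;`` in querystrings from breaking URL matching or getting double-escaped. An end-to-end sketch of the intended result (expected output inferred from the changes above, not a captured run):

```python
import bleach

html = bleach.linkify("http://example.com/?a=1&amp;b=2")
# expected: the whole querystring, &amp; included, ends up in both the href
# and the link text:
# <a href="http://example.com/?a=1&amp;b=2" rel="nofollow">http://example.com/?a=1&amp;b=2</a>
print(html)
```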