Bump bleach from 5.0.1 to 6.0.0 (#1979)

* Bump bleach from 5.0.1 to 6.0.0

Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.1 to 6.0.0.
- [Release notes](https://github.com/mozilla/bleach/releases)
- [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES)
- [Commits](https://github.com/mozilla/bleach/compare/v5.0.1...v6.0.0)

---
updated-dependencies:
- dependency-name: bleach
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update bleach==6.0.0

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
commit 1466a391d1 (parent 6b1b6d0f32)
dependabot[bot] authored 2023-03-02 20:55:01 -08:00, committed by GitHub
5 changed files with 291 additions and 198 deletions

@@ -120,9 +120,10 @@ class Linker:
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags; ``None`` means you don't
+            want linkify to skip any tags
 
         :arg bool parse_email: whether or not to linkify email addresses
 
@@ -130,7 +131,7 @@ class Linker:
         :arg email_re: email matching regex
 
-        :arg list recognized_tags: the list of tags that linkify knows about;
+        :arg set recognized_tags: the set of tags that linkify knows about;
             everything else gets escaped
 
         :returns: linkified text as unicode
 
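The docstring updates in these two hunks track the visible API change in bleach 6.0.0: ``skip_tags`` and ``recognized_tags`` are now documented as sets rather than lists. A minimal caller-side sketch of the new style (hypothetical usage, not part of this diff):

```python
from bleach.linkifier import Linker

# bleach 5.0.1 documented lists: Linker(skip_tags=["pre"])
# bleach 6.0.0 documents sets instead:
linker = Linker(skip_tags={"pre"})
html = linker.linkify("see <pre>http://example.com</pre> and http://example.com")
# expected: the URL inside <pre> stays plain text; the bare one becomes a link
```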
@@ -145,15 +146,18 @@ class Linker:
         # Create a parser/tokenizer that allows all HTML tags and escapes
         # anything not in that list.
         self.parser = html5lib_shim.BleachHTMLParser(
-            tags=recognized_tags,
+            tags=frozenset(recognized_tags),
             strip=False,
-            consume_entities=True,
+            consume_entities=False,
             namespaceHTMLElements=False,
         )
         self.walker = html5lib_shim.getTreeWalker("etree")
         self.serializer = html5lib_shim.BleachHTMLSerializer(
             quote_attr_values="always",
             omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
             # linkify does not sanitize
             sanitize=False,
             # linkify preserves attr order
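Together, ``consume_entities=False`` on the parser and ``resolve_entities=False`` on the serializer mean pre-existing entities pass through linkification untouched instead of being resolved during tokenizing and re-escaped on output. A quick sketch of the intended round-trip behavior (expected output inferred from these settings, not a captured test run):

```python
import bleach

# Entities in the input should survive linkify() unchanged.
print(bleach.linkify("tom &amp; jerry"))  # expected: tom &amp; jerry
print(bleach.linkify("x &lt; y"))         # expected: x &lt; y
```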
@@ -218,8 +222,8 @@ class LinkifyFilter(html5lib_shim.Filter):
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
             defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
 
-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
             linkifying contents of ``pre`` tags
 
         :arg bool parse_email: whether or not to linkify email addresses
@@ -232,7 +236,7 @@ class LinkifyFilter(html5lib_shim.Filter):
         super().__init__(source)
 
         self.callbacks = callbacks or []
-        self.skip_tags = skip_tags or []
+        self.skip_tags = skip_tags or {}
         self.parse_email = parse_email
 
         self.url_re = url_re
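One subtlety in this hunk: the new default ``{}`` is an empty dict, not an empty set, but the filter only ever performs ``in`` membership tests against ``skip_tags``, and those behave identically for both. A tiny illustration of why the distinction is harmless:

```python
skip_tags = None or {}       # what the constructor does when skip_tags is None
print("pre" in skip_tags)    # False -- membership test on an empty dict
print("pre" in {"pre"})      # True  -- same test on a real set
```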
@@ -510,6 +514,62 @@ class LinkifyFilter(html5lib_shim.Filter):
                 yield {"type": "Characters", "data": str(new_text)}
 
             yield token_buffer[-1]
 
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerfilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
     def __iter__(self):
         in_a = False
         in_skip_tag = None
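Reading the new generator above, a Characters token containing entities should come out split into Entity and Characters tokens, with ``&amp;`` special-cased down to a bare ``&``. A sketch of the expected split, calling the method directly with a hand-built token (output inferred from the code, not a verified run):

```python
from bleach import linkifier

# LinkifyFilter only needs a token source at construction time; an empty
# iterator is enough to exercise extract_entities by hand.
f = linkifier.LinkifyFilter(source=iter(()))
tokens = list(f.extract_entities({"type": "Characters", "data": "a &lt; b &amp; c"}))
# expected:
# [{'type': 'Characters', 'data': 'a '},
#  {'type': 'Entity', 'name': 'lt'},
#  {'type': 'Characters', 'data': ' b '},
#  {'type': 'Characters', 'data': '&'},   # the &amp; special case
#  {'type': 'Characters', 'data': ' c'}]
print(tokens)
```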
@@ -564,8 +624,8 @@ class LinkifyFilter(html5lib_shim.Filter):
                     new_stream = self.handle_links(new_stream)
 
-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
 
                 # We've already yielded this token, so continue
                 continue
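This final hunk routes every linkified stream through ``extract_entities``, which is what keeps ``&amp;`` in querystrings from breaking URL matching or getting double-escaped. An end-to-end sketch of the intended result (expected output inferred from the changes above, not a captured run):

```python
import bleach

html = bleach.linkify("http://example.com/?a=1&amp;b=2")
# expected: the whole querystring, &amp; included, ends up in both the href
# and the link text:
# <a href="http://example.com/?a=1&amp;b=2" rel="nofollow">http://example.com/?a=1&amp;b=2</a>
print(html)
```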