mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-07-06 13:11:15 -07:00)

Update bleach-4.1.0

commit a4130d6c56 (parent 4086529906)
51 changed files with 17071 additions and 568 deletions
lib/bleach/__init__.py
@@ -1,401 +1,131 @@
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-
-
-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b  # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
-        |\\[\001-011\013\014\016-\177])*"  # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.?  # domain
-    """,
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-
-NODE_TEXT = 4  # The numeric ID of a text node in simpletree.
-
-ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
-# a simple routine that returns the tag name with the namespace prefix
-# as returned by etree's Element.tag attribute
-
-DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
-
-
-def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
-          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
-          strip_comments=True):
-    """Clean an HTML fragment and return it
-
-    :arg text: the text to clean
-    :arg tags: whitelist of allowed tags; defaults to
-        ``bleach.ALLOWED_TAGS``
-    :arg attributes: whitelist of allowed attributes; defaults to
-        ``bleach.ALLOWED_ATTRIBUTES``
-    :arg styles: whitelist of allowed css; defaults to
-        ``bleach.ALLOWED_STYLES``
-    :arg protocols: whitelist of allowed protocols for links; defaults
-        to ``bleach.ALLOWED_PROTOCOLS``
-    :arg strip: whether or not to strip disallowed elements
-    :arg strip_comments: whether or not to strip HTML comments
-
-    """
-    if not text:
-        return ''
-
-    text = force_unicode(text)
-
-    class s(BleachSanitizer):
-        allowed_elements = tags
-        allowed_attributes = attributes
-        allowed_css_properties = styles
-        allowed_protocols = protocols
-        strip_disallowed_elements = strip
-        strip_html_comments = strip_comments
-
-    parser = html5lib.HTMLParser(tokenizer=s)
-
-    return _render(parser.parseFragment(text))
-
-
-def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
-            parse_email=False, tokenizer=HTMLSanitizer):
-    """Convert URL-like strings in an HTML fragment to links.
-
-    linkify() converts strings that look like URLs or domain names in a
-    blob of text that may be an HTML fragment to links, while preserving
-    (a) links already in the string, (b) urls found in attributes, and
-    (c) email addresses.
-    """
-    text = force_unicode(text)
-
-    if not text:
-        return ''
-
-    parser = html5lib.HTMLParser(tokenizer=tokenizer)
-
-    forest = parser.parseFragment(text)
-    _seen = set([])
-
-    def replace_nodes(tree, new_frag, node, index=0):
-        """
-        Doesn't really replace nodes, but inserts the nodes contained in
-        new_frag into the treee at position index and returns the number
-        of nodes inserted.
-        If node is passed in, it is removed from the tree
-        """
-        count = 0
-        new_tree = parser.parseFragment(new_frag)
-        # capture any non-tag text at the start of the fragment
-        if new_tree.text:
-            if index == 0:
-                tree.text = tree.text or ''
-                tree.text += new_tree.text
-            else:
-                tree[index - 1].tail = tree[index - 1].tail or ''
-                tree[index - 1].tail += new_tree.text
-        # the put in the tagged elements into the old tree
-        for n in new_tree:
-            if n.tag == ETREE_TAG('a'):
-                _seen.add(n)
-            tree.insert(index + count, n)
-            count += 1
-        # if we got a node to remove...
-        if node is not None:
-            tree.remove(node)
-        return count
-
-    def strip_wrapping_parentheses(fragment):
-        """Strips wrapping parentheses.
-
-        Returns a tuple of the following format::
-
-            (string stripped from wrapping parentheses,
-             count of stripped opening parentheses,
-             count of stripped closing parentheses)
-        """
-        opening_parentheses = closing_parentheses = 0
-        # Count consecutive opening parentheses
-        # at the beginning of the fragment (string).
-        for char in fragment:
-            if char == '(':
-                opening_parentheses += 1
-            else:
-                break
-
-        if opening_parentheses:
-            newer_frag = ''
-            # Cut the consecutive opening brackets from the fragment.
-            fragment = fragment[opening_parentheses:]
-            # Reverse the fragment for easier detection of parentheses
-            # inside the URL.
-            reverse_fragment = fragment[::-1]
-            skip = False
-            for char in reverse_fragment:
-                # Remove the closing parentheses if it has a matching
-                # opening parentheses (they are balanced).
-                if (char == ')' and
-                        closing_parentheses < opening_parentheses and
-                        not skip):
-                    closing_parentheses += 1
-                    continue
-                # Do not remove ')' from the URL itself.
-                elif char != ')':
-                    skip = True
-                newer_frag += char
-            fragment = newer_frag[::-1]
-
-        return fragment, opening_parentheses, closing_parentheses
-
-    def apply_callbacks(attrs, new):
-        for cb in callbacks:
-            attrs = cb(attrs, new)
-            if attrs is None:
-                return None
-        return attrs
-
-    def _render_inner(node):
-        out = ['' if node.text is None else node.text]
-        for subnode in node:
-            out.append(_render(subnode))
-            if subnode.tail:
-                out.append(subnode.tail)
-        return ''.join(out)
-
-    def linkify_nodes(tree, parse_text=True):
-        children = len(tree)
-        current_child = -1
-        # start at -1 to process the parent first
-        while current_child < len(tree):
-            if current_child < 0:
-                node = tree
-                if parse_text and node.text:
-                    new_txt = old_txt = node.text
-                    if parse_email:
-                        new_txt = re.sub(email_re, email_repl, node.text)
-                        if new_txt and new_txt != node.text:
-                            node.text = ''
-                            adj = replace_nodes(tree, new_txt, None, 0)
-                            children += adj
-                            current_child += adj
-                            linkify_nodes(tree, True)
-                            continue
-
-                    new_txt = re.sub(url_re, link_repl, new_txt)
-                    if new_txt != old_txt:
-                        node.text = ''
-                        adj = replace_nodes(tree, new_txt, None, 0)
-                        children += adj
-                        current_child += adj
-                        continue
-            else:
-                node = tree[current_child]
-
-            if parse_text and node.tail:
-                new_tail = old_tail = node.tail
-                if parse_email:
-                    new_tail = re.sub(email_re, email_repl, new_tail)
-                    if new_tail != node.tail:
-                        node.tail = ''
-                        adj = replace_nodes(tree, new_tail, None,
-                                            current_child + 1)
-                        # Insert the new nodes made from my tail into
-                        # the tree right after me. current_child+1
-                        children += adj
-                        continue
-
-                new_tail = re.sub(url_re, link_repl, new_tail)
-                if new_tail != old_tail:
-                    node.tail = ''
-                    adj = replace_nodes(tree, new_tail, None,
-                                        current_child + 1)
-                    children += adj
-
-            if node.tag == ETREE_TAG('a') and not (node in _seen):
-                if not node.get('href', None) is None:
-                    attrs = dict(node.items())
-
-                    _text = attrs['_text'] = _render_inner(node)
-
-                    attrs = apply_callbacks(attrs, False)
-
-                    if attrs is None:
-                        # <a> tag replaced by the text within it
-                        adj = replace_nodes(tree, _text, node,
-                                            current_child)
-                        current_child -= 1
-                        # pull back current_child by 1 to scan the
-                        # new nodes again.
-                    else:
-                        text = force_unicode(attrs.pop('_text'))
-                        for attr_key, attr_val in attrs.items():
-                            node.set(attr_key, attr_val)
-
-                        for n in reversed(list(node)):
-                            node.remove(n)
-                        text = parser.parseFragment(text)
-                        node.text = text.text
-                        for n in text:
-                            node.append(n)
-                        _seen.add(node)
-
-            elif current_child >= 0:
-                if node.tag == ETREE_TAG('pre') and skip_pre:
-                    linkify_nodes(node, False)
-                elif not (node in _seen):
-                    linkify_nodes(node, True)
-
-            current_child += 1
-
-    def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
-        link = {
-            '_text': addr,
-            'href': 'mailto:{0!s}'.format(addr),
-        }
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return addr
-
-        _href = link.pop('href')
-        _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-        return repl.format(_href, attribs, _text)
-
-    def link_repl(match):
-        url = match.group(0)
-        open_brackets = close_brackets = 0
-        if url.startswith('('):
-            _wrapping = strip_wrapping_parentheses(url)
-            url, open_brackets, close_brackets = _wrapping
-        end = ''
-        m = re.search(punct_re, url)
-        if m:
-            end = m.group(0)
-            url = url[0:m.start()]
-        if re.search(proto_re, url):
-            href = url
-        else:
-            href = ''.join(['http://', url])
-
-        link = {
-            '_text': url,
-            'href': href,
-        }
-
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return '(' * open_brackets + url + ')' * close_brackets
-
-        _text = link.pop('_text')
-        _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
-        return repl.format('(' * open_brackets,
-                           _href, attribs, _text, end,
-                           ')' * close_brackets)
-
-    try:
-        linkify_nodes(forest)
-    except RuntimeError as e:
-        # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: {0!r}'.format(e))
-
-    return _render(forest)
-
-
-def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('etree')
-    stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
-                                alphabetical_attributes=True,
-                                omit_optional_tags=False)
-    return serializer.render(stream)
+import packaging.version
+
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    Cleaner,
+)
+
+
+# yyyymmdd
+__releasedate__ = "20210825"
+
+# x.y.z or x.y.z.dev0 -- semver
+__version__ = "4.1.0"
+VERSION = packaging.version.Version(__version__)
+
+
+__all__ = ["clean", "linkify"]
+
+
+def clean(
+    text,
+    tags=ALLOWED_TAGS,
+    attributes=ALLOWED_ATTRIBUTES,
+    styles=ALLOWED_STYLES,
+    protocols=ALLOWED_PROTOCOLS,
+    strip=False,
+    strip_comments=True,
+):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to be used to transform content for use in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode
+
+    """
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode
+
+    """
+    linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email)
+    return linker.linkify(text)
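Taken together, this hunk strips out bleach's hand-rolled html5lib tokenizer glue: ``clean()`` is now a thin wrapper over ``bleach.sanitizer.Cleaner`` and ``linkify()`` over ``bleach.linkifier.Linker``. A minimal usage sketch of the 4.1.0 API, based only on the signatures and docstrings above (the sample strings are illustrative)::

    import bleach
    from bleach.linkifier import Linker
    from bleach.sanitizer import Cleaner

    # One-off calls go through the module-level wrappers; disallowed tags
    # are escaped by default and removed when strip=True is passed.
    print(bleach.clean('<script>evil()</script><b>bold</b>'))
    print(bleach.linkify('the docs live at https://example.com'))

    # For repeated use with the same settings, the new docstrings recommend
    # building the underlying objects once instead of calling the wrappers.
    cleaner = Cleaner(tags=['b', 'i'], strip=True)
    linker = Linker(skip_tags=['pre'])
    print(cleaner.clean('<b>kept</b> <u>stripped</u>'))
    print(linker.linkify('see example.com for details'))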
lib/bleach/_vendor/README.rst (new file, 61 lines)
@@ -0,0 +1,61 @@
=======================
Vendored library policy
=======================

To simplify Bleach development, we're now vendoring certain libraries that
we use.

Vendored libraries must follow these rules:

1. Vendored libraries must be pure Python--no compiling.
2. Source code for the library is included in this directory.
3. License must be included in this repo and in the Bleach distribution.
4. Requirements of the library become requirements of Bleach.
5. No modifications to the library may be made.


Adding/Updating a vendored library
==================================

Way to vendor a library or update a version:

1. Update ``vendor.txt`` with the library, version, and hash. You can use
   `hashin <https://pypi.org/project/hashin/>`_.
2. Remove all old files and directories of the old version.
3. Run ``pip_install_vendor.sh`` and check in everything it produced,
   including the ``.dist-info`` directory and contents.
4. Update the bleach minor version in the next release.


Reviewing a change involving a vendored library
===============================================

Way to verify a vendored library addition/update:

1. Pull down the branch.
2. Delete all the old files and directories of the old version.
3. Run ``pip_install_vendor.sh``.
4. Run ``git diff`` and verify there are no changes.

NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be
necessary to reproduce the dist-info.


Removing/Unvendoring a vendored library
=======================================

A vendored library might be removed for any of the following reasons:

* it violates the vendoring policy (e.g. an incompatible license change)
* a suitable replacement is found
* bleach has the resources to test and QA new bleach releases against
  multiple versions of the previously vendored library

To unvendor a library:

1. Remove the library and its hashes from ``vendor.txt``.
2. Remove library files and directories from this directory.
3. Run ``install_vendor.sh`` and check that the previously vendored library,
   including the ``.dist-info`` directory and contents, is not installed.
4. Update the bleach minor version in the next release.
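A note on how the vendored tree is reached at runtime (an inference from the package layout this commit adds, not something the README states): code in bleach imports the pinned html5lib through the ``bleach._vendor`` package rather than a top-level ``import html5lib``, so the vendored 1.1 copy is used even when a different html5lib is installed system-wide. A minimal sketch::

    # Sketch only: import the vendored parser through bleach._vendor,
    # using names the vendored html5lib/__init__.py re-exports.
    from bleach._vendor.html5lib import HTMLParser, serialize

    parser = HTMLParser()
    fragment = parser.parseFragment("<p>parsed by the vendored html5lib")
    print(serialize(fragment, tree="etree"))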
lib/bleach/_vendor/__init__.py (new empty file)
lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst (new file, 66 lines)
@@ -0,0 +1,66 @@
Credits
=======

``html5lib`` is written and maintained by:

- James Graham
- Sam Sneddon
- Łukasz Langa
- Will Kahn-Greene


Patches and suggestions
-----------------------
(In chronological order, by first commit:)

- Anne van Kesteren
- Lachlan Hunt
- lantis63
- Sam Ruby
- Thomas Broyer
- Tim Fletcher
- Mark Pilgrim
- Ryan King
- Philip Taylor
- Edward Z. Yang
- fantasai
- Philip Jägenstedt
- Ms2ger
- Mohammad Taha Jahangir
- Andy Wingo
- Andreas Madsack
- Karim Valiev
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Simon Sapin
- Michael[tm] Smith
- Ritwik Gupta
- Marc Abramowitz
- Tony Lopes
- lilbludevil
- Kevin
- Drew Hubl
- Austin Kumbera
- Jim Baker
- Jon Dufresne
- Donald Stufft
- Alex Gaynor
- Nik Nyby
- Jakub Wilk
- Sigmund Cherem
- Gabi Davar
- Florian Mounier
- neumond
- Vitalik Verhovodov
- Kovid Goyal
- Adam Chainz
- John Vandenberg
- Eric Amorde
- Benedikt Morbach
- Jonathan Vanasco
- Tom Most
- Ville Skyttä
- Hugo van Kemenade
- Mark Vasilkov
lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER (new file, 1 line)
@@ -0,0 +1 @@
pip
lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA (new file, 552 lines)
@@ -0,0 +1,552 @@
Metadata-Version: 2.1
Name: html5lib
Version: 1.1
Summary: HTML parser based on the WHATWG HTML specification
Home-page: https://github.com/html5lib/html5lib-python
Maintainer: James Graham
Maintainer-email: james@hoppipolla.co.uk
License: MIT License
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
Requires-Dist: six (>=1.9)
Requires-Dist: webencodings
Provides-Extra: all
Requires-Dist: genshi ; extra == 'all'
Requires-Dist: chardet (>=2.2) ; extra == 'all'
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all'
Provides-Extra: chardet
Requires-Dist: chardet (>=2.2) ; extra == 'chardet'
Provides-Extra: genshi
Requires-Dist: genshi ; extra == 'genshi'
Provides-Extra: lxml
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml'

html5lib
========

.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master
    :target: https://travis-ci.org/html5lib/html5lib-python

html5lib is a pure-python library for parsing HTML. It is designed to
conform to the WHATWG HTML specification, as is implemented by all major
web browsers.


Usage
-----

Simple usage follows this pattern:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      document = html5lib.parse(f)

or:

.. code-block:: python

  import html5lib
  document = html5lib.parse("<p>Hello World!")

By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).

Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
a treebuilder:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

When using with ``urllib2`` (Python 2), the charset from HTTP should be
passed into html5lib as follows:

.. code-block:: python

  from contextlib import closing
  from urllib2 import urlopen
  import html5lib

  with closing(urlopen("http://example.com/")) as f:
      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

When using with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:

.. code-block:: python

  from urllib.request import urlopen
  import html5lib

  with urlopen("http://example.com/") as f:
      document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())

To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      parser = html5lib.HTMLParser(strict=True)
      document = parser.parse(f)

When you're instantiating parser objects explicitly, pass a treebuilder
class as the ``tree`` keyword argument to use an alternative document
format:

.. code-block:: python

  import html5lib
  parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
  minidom_document = parser.parse("<p>Hello World!")

More documentation is available at https://html5lib.readthedocs.io/.


Installation
------------

html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:

.. code-block:: bash

  $ pip install html5lib

The goal is to support a (non-strict) superset of the versions that `pip
supports
<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.


Optional Dependencies
---------------------

The following third-party libraries may be used for additional
functionality:

- ``lxml`` is supported as a tree format (for both building and
  walking) under CPython (but *not* PyPy where it is known to cause
  segfaults);

- ``genshi`` has a treewalker (but not builder); and

- ``chardet`` can be used as a fallback when character encoding cannot
  be determined.


Bugs
----

Please report any bugs on the `issue tracker
<https://github.com/html5lib/html5lib-python/issues>`_.


Tests
-----

Unit tests require the ``pytest`` and ``mock`` libraries and can be
run using the ``py.test`` command in the root directory.

Test data are contained in a separate `html5lib-tests
<https://github.com/html5lib/html5lib-tests>`_ repository and included
as a submodule, thus for git checkouts they must be initialized::

  $ git submodule init
  $ git submodule update

If you have all compatible Python implementations available on your
system, you can run tests on all of them using the ``tox`` utility,
which can be found on PyPI.


Questions?
----------

There's a mailing list available for support on Google Groups,
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
though you may get a quicker response asking on IRC in `#whatwg on
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.


Change Log
----------

1.1
~~~

UNRELEASED

Breaking changes:

* Drop support for Python 3.3. (#358)
* Drop support for Python 3.4. (#421)

Deprecations:

* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
  ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
  <https://github.com/mozilla/bleach>`_. Please let us know if Bleach doesn't
  suffice for your use. (#443)

Other changes:

* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
  ``html5lib`` keeps working in future Python versions. (#403)
* Drop optional ``datrie`` dependency. (#442)


1.0.1
~~~~~

Released on December 7, 2017

Breaking changes:

* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)

Features:

* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
  Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage. (#323, #334) (Thank you, Jon
  Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.

Bug fixes:

* Add support for setuptools < 18.5 to support environment markers. (Thank you,
  John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
  you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
  Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
  Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
  Komal Dembla, Hugo!)


1.0
~~~

Released and unreleased on December 7, 2017. Badly packaged release.


0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~

Released on July 15, 2016

* Fix attribute order going to the tree builder to be document order
  instead of reverse document order(!).


0.99999999/1.0b9
~~~~~~~~~~~~~~~~

Released on July 14, 2016

* **Added ordereddict as a mandatory dependency on Python 2.6.**

* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
  extras that will do the right thing based on the specific
  interpreter implementation.

* Now requires the ``mock`` package for the testsuite.

* Cease supporting DATrie under PyPy.

* **Remove PullDOM support, as this hasn't ever been properly
  tested, doesn't entirely work, and as far as I can tell is
  completely unused by anyone.**

* Move testsuite to ``py.test``.

* **Fix #124: move to webencodings for decoding the input byte stream;
  this makes html5lib compliant with the Encoding Standard, and
  introduces a required dependency on webencodings.**

* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**

* **Fix comments containing double-dash with lxml 3.5 and above.**

* **Use scripting disabled by default (as we don't implement
  scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by serializer
  allowing attribute values to be escaped out of in old browser versions,
  changing the quote_attr_values option on serializer to take one of
  three values, "always" (the old True value), "legacy" (the new option,
  and the new default), and "spec" (the old False value, and the old
  default).**

* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
  (instead of the tokenizer); as such, this will require amending all
  callers of it to use it via the treewalker API.**

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
  with a set of keyword arguments: override_encoding, transport_encoding,
  same_origin_parent_encoding, likely_encoding, and default_encoding.**

* **Move filters._base, treebuilder._base, and treewalkers._base to .base
  to clarify their status as public.**

* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
  sanitizer.htmlsanitizer module and move that to sanitizer. This means
  anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
  code changes.**

* **Rename treewalkers.lxmletree to .etree_lxml and
  treewalkers.genshistream to .genshi to have a consistent API.**

* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
  utils) to be underscore prefixed to clarify their status as private.


0.9999999/1.0b8
~~~~~~~~~~~~~~~

Released on September 10, 2015

* Fix #195: fix the sanitizer to drop broken URLs (it threw an
  exception between 0.9999 and 0.999999).


0.999999/1.0b7
~~~~~~~~~~~~~~

Released on July 7, 2015

* Fix #189: fix the sanitizer to allow relative URLs again (as it did
  prior to 0.9999/1.0b5).


0.99999/1.0b6
~~~~~~~~~~~~~

Released on April 30, 2015

* Fix #188: fix the sanitizer to not throw an exception when sanitizing
  bogus data URLs.


0.9999/1.0b5
~~~~~~~~~~~~

Released on April 29, 2015

* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
  this sounds, this has no known security implications. No known version
  of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
  Chrome (1 to current), or Opera (12 to current) will run any script
  provided in these attributes.

* Pass error message to the ParseError exception in strict parsing mode.

* Allow data URIs in the sanitizer, with a whitelist of content-types.

* Add support for Python implementations that don't support lone
  surrogates (read: Jython). Fixes #2.

* Remove localization of error messages. This functionality was totally
  unused (and untested that everything was localizable), so we may as
  well follow numerous browsers in not supporting translating technical
  strings.

* Expose treewalkers.pprint as a public API.

* Add a documentEncoding property to HTML5Parser, fix #121.


0.999
~~~~~

Released on December 23, 2013

* Fix #127: add work-around for CPython issue #20007: .read(0) on
  http.client.HTTPResponse drops the rest of the content.

* Fix #115: lxml treewalker can now deal with fragments containing, at
  their root level, text nodes with non-ASCII characters on Python 2.


0.99
~~~~

Released on September 10, 2013

* No library changes from 1.0b3; released as 0.99 as pip has changed
  behaviour from 1.4 to avoid installing pre-release versions per
  PEP 440.


1.0b3
~~~~~

Released on July 24, 2013

* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
  implementation using it should be moved to
  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
  for years.

* Fix #67 so that ``BufferedStream`` correctly returns a bytes
  object, thereby fixing any case where html5lib is passed a
  non-seekable RawIOBase-like object.


1.0b2
~~~~~

Released on June 27, 2013

* Removed reordering of attributes within the serializer. There is now
  an ``alphabetical_attributes`` option which preserves the previous
  behaviour through a new filter. This allows attribute order to be
  preserved through html5lib if the tree builder preserves order.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
  ``treeadapters.sax.to_sax`` which is generic and supports any
  treewalker; it also resolves all known bugs with ``dom2sax``.

* Fix treewalker assertions on hitting bytes strings on
  Python 2. Previous to 1.0b1, treewalkers coped with mixed
  bytes/unicode data on Python 2; this reintroduces this prior
  behaviour on Python 2. Behaviour is unchanged on Python 3.


1.0b1
~~~~~

Released on May 17, 2013

* Implementation updated to implement the `HTML specification
  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).

* Python 3.2+ supported in a single codebase using the ``six`` library.

* Removed support for Python 2.5 and older.

* Removed the deprecated Beautiful Soup 3 treebuilder.
  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
  since it doesn't support namespaces, foreign content like SVG and
  MathML is parsed incorrectly.

* Removed ``simpletree`` from the package. The default tree builder is
  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
  available, and ``xml.etree.ElementTree`` otherwise).

* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
  output was well-formed XML, and hence provided little of use.

* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
  return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Optional heuristic character encoding detection now based on
  ``charade`` for Python 2.6 - 3.3 compatibility.

* Optional ``Genshi`` treewalker support fixed.

* Many bugfixes, including:

  * #33: null in attribute value breaks XML AttValue;

  * #4: nested, indirect descendant, <button> causes infinite loop;

  * `Google Code 215
    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
    detect seekable streams;

  * `Google Code 206
    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
    support for <video preload=...>, <audio preload=...>;

  * `Google Code 205
    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
    support for <video poster=...>;

  * `Google Code 202
    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
    file breaks InputStream.

* Source code is now mostly PEP 8 compliant.

* Test harness has been improved and now depends on ``nose``.

* Documentation updated and moved to https://html5lib.readthedocs.io/.


0.95
~~~~

Released on February 11, 2012


0.90
~~~~

Released on January 17, 2010


0.11.1
~~~~~~

Released on June 12, 2008


0.11
~~~~

Released on June 10, 2008


0.10
~~~~

Released on October 7, 2007


0.9
~~~

Released on March 11, 2007


0.2
~~~

Released on January 8, 2007
lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD (new file, 41 lines)
@@ -0,0 +1,41 @@
html5lib-1.1.dist-info/AUTHORS.rst,sha256=DrNAMifoDpuQyJn-KW-H6K8Tt2a5rKnV2UF4-DRrGUI,983
html5lib-1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
html5lib-1.1.dist-info/LICENSE,sha256=FqOZkWGekvGGgJMtoqkZn999ld8-yu3FLqBiGKq6_W8,1084
html5lib-1.1.dist-info/METADATA,sha256=Y3w-nd_22HQnQRy3yypVsV_ke2FF94uUD4-vGpc2DnI,16076
html5lib-1.1.dist-info/RECORD,,
html5lib-1.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib-1.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
html5lib-1.1.dist-info/top_level.txt,sha256=XEX6CHpskSmvjJB4tP6m4Q5NYXhIf_0ceMc0PNbzJPQ,9
html5lib/__init__.py,sha256=pWnYcfZ69wNLrdQL7bpr49FUi8O8w0KhKCOHsyRgYGQ,1143
html5lib/_ihatexml.py,sha256=ifOwF7pXqmyThIXc3boWc96s4MDezqRrRVp7FwDYUFs,16728
html5lib/_inputstream.py,sha256=IKuMiY8rzb7pqIGCpbvTqsxysLEpgEHWYvYEFu4LUAI,32300
html5lib/_tokenizer.py,sha256=WvJQa2Mli4NtTmhLXkX8Jy5FcWttqCaiDTiKyaw8D-k,77028
html5lib/_trie/__init__.py,sha256=nqfgO910329BEVJ5T4psVwQtjd2iJyEXQ2-X8c1YxwU,109
html5lib/_trie/_base.py,sha256=CaybYyMro8uERQYjby2tTeSUatnWDfWroUN9N7ety5w,1013
html5lib/_trie/py.py,sha256=zg7RZSHxJ8mLmuI_7VEIV8AomISrgkvqCP477AgXaG0,1763
html5lib/_utils.py,sha256=AxAJSG15eyarCgKMnlUwzs1X6jFHXqEvhlYEOxAFmis,4919
html5lib/constants.py,sha256=Ll-yzLU_jcjyAI_h57zkqZ7aQWE5t5xA4y_jQgoUUhw,83464
html5lib/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib/filters/alphabeticalattributes.py,sha256=lViZc2JMCclXi_5gduvmdzrRxtO5Xo9ONnbHBVCsykU,919
html5lib/filters/base.py,sha256=z-IU9ZAYjpsVsqmVt7kuWC63jR11hDMr6CVrvuao8W0,286
html5lib/filters/inject_meta_charset.py,sha256=egDXUEHXmAG9504xz0K6ALDgYkvUrC2q15YUVeNlVQg,2945
html5lib/filters/lint.py,sha256=upXATs6By7cot7o0bnNqR15sPq2Fn6Vnjvoy3gyO_rY,3631
html5lib/filters/optionaltags.py,sha256=8lWT75J0aBOHmPgfmqTHSfPpPMp01T84NKu0CRedxcE,10588
html5lib/filters/sanitizer.py,sha256=XGNSdzIqDTaHot1V-rRj1V_XOolApJ7n95tHP9JcgNU,26885
html5lib/filters/whitespace.py,sha256=8eWqZxd4UC4zlFGW6iyY6f-2uuT8pOCSALc3IZt7_t4,1214
html5lib/html5parser.py,sha256=w5hZJh0cvD3g4CS196DiTmuGpSKCMYe1GS46-yf_WZQ,117174
html5lib/serializer.py,sha256=K2kfoLyMPMFPfdusfR30SrxNkf0mJB92-P5_RntyaaI,15747
html5lib/treeadapters/__init__.py,sha256=18hyI-at2aBsdKzpwRwa5lGF1ipgctaTYXoU9En2ZQg,650
html5lib/treeadapters/genshi.py,sha256=CH27pAsDKmu4ZGkAUrwty7u0KauGLCZRLPMzaO3M5vo,1715
html5lib/treeadapters/sax.py,sha256=BKS8woQTnKiqeffHsxChUqL4q2ZR_wb5fc9MJ3zQC8s,1776
html5lib/treebuilders/__init__.py,sha256=AysSJyvPfikCMMsTVvaxwkgDieELD5dfR8FJIAuq7hY,3592
html5lib/treebuilders/base.py,sha256=oeZNGEB-kt90YJGVH05gb5a8E7ids2AbYwGRsVCieWk,14553
html5lib/treebuilders/dom.py,sha256=22whb0C71zXIsai5mamg6qzBEiigcBIvaDy4Asw3at0,8925
html5lib/treebuilders/etree.py,sha256=EbmHx-wQ-11MVucTPtF7Ul92-mQGN3Udu_KfDn-Ifhk,12824
html5lib/treebuilders/etree_lxml.py,sha256=OazDHZGO_q4FnVs4Dhs4hzzn2JwGAOs-rfV8LAlUGW4,14754
html5lib/treewalkers/__init__.py,sha256=OBPtc1TU5mGyy18QDMxKEyYEz0wxFUUNj5v0-XgmYhY,5719
html5lib/treewalkers/base.py,sha256=ouiOsuSzvI0KgzdWP8PlxIaSNs9falhbiinAEc_UIJY,7476
html5lib/treewalkers/dom.py,sha256=EHyFR8D8lYNnyDU9lx_IKigVJRyecUGua0mOi7HBukc,1413
html5lib/treewalkers/etree.py,sha256=gkD4tfEfRWPsEGvgHHJxZmKZXUvBzVVGz3v5C_MIiOE,4539
html5lib/treewalkers/etree_lxml.py,sha256=eLedbn6nPjlpebibsWVijey7WEpzDwxU3ubwUoudBuA,6345
html5lib/treewalkers/genshi.py,sha256=4D2PECZ5n3ZN3qu3jMl9yY7B81jnQApBQSVlfaIuYbA,2309
lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL (new file, 6 lines)
@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.34.2)
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any
lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt (new file, 1 line)
@@ -0,0 +1 @@
html5lib
lib/bleach/_vendor/html5lib/__init__.py (new file, 35 lines)
@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage::

    import html5lib
    with open("my_document.html", "rb") as f:
        tree = html5lib.parse(f)

For convenience, this module re-exports the following names:

* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""

from __future__ import absolute_import, division, unicode_literals

from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]

# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
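As a quick illustration of the re-exported names above, a fragment can be parsed and serialized back in two calls (a sketch; the exact output depends on serializer options)::

    import html5lib

    # Parse into the default xml.etree-based tree, then serialize it back.
    fragment = html5lib.parseFragment("<p>Hello <b>world</b>")
    print(html5lib.serialize(fragment, tree="etree", omit_optional_tags=False))
    # Expected output resembles: <p>Hello <b>world</b></p>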
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
|
@ -0,0 +1,289 @@
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from .constants import DataLossWarning
|
||||||
|
|
||||||
|
baseChar = """
|
||||||
|
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
||||||
|
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
||||||
|
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
||||||
|
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
||||||
|
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
||||||
|
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
||||||
|
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
||||||
|
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
||||||
|
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
||||||
|
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""

ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""

combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""

digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""

extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""


letter = " | ".join([baseChar, ideographic])

# All name characters: letters and digits plus ".", "-", "_", combining
# characters and extenders; nameFirst below applies the stricter
# first-character rule (letters and "_" only).
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
nameFirst = " | ".join([letter, "_"])

reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")


def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
        if not foundMatch:
            assert len(item) == 1
            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv


def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
        assert item[1] >= item[0]
    rv = []
    i = 0
    while i < len(charList):
        j = 1
        rv.append(charList[i])
        while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
            rv[-1][1] = charList[i + j][1]
            j += 1
        i += j
    return rv


# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    rv = []
    # Guard the gap before the first range; charList items are [start, end]
    # pairs, so compare the first range's start (the original compared the
    # whole pair against 0, which is always unequal).
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv


def listToRegexpStr(charList):
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(escapeRegexp(chr(item[0])))
        else:
            rv.append(escapeRegexp(chr(item[0])) + "-" +
                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)


def hexToInt(hex_str):
    return int(hex_str, 16)


def escapeRegexp(string):
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)

    return string
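
The helpers above turn these W3C-style character-class strings into inverted regular expressions: charStringToList parses the "#xNNNN" notation into merged [start, end] codepoint ranges, missingRanges inverts them over the BMP, and listToRegexpStr renders a character class. A minimal sketch of that pipeline, using a hypothetical one-line class string rather than the real multi-line ones:

# Illustrative only: `sample` is a made-up class in the same "#xNNNN" notation.
sample = "[#x0041-#x005A] | [#x0061-#x007A] | #x005F"
ranges = charStringToList(sample)       # [[0x41, 0x5A], [0x5F, 0x5F], [0x61, 0x7A]]
gaps = missingRanges(ranges)            # every BMP codepoint outside the class
non_word = re.compile(listToRegexpStr(gaps))
assert non_word.match("A") is None      # inside the class
assert non_word.match("!") is not None  # outside the class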

# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa

# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        self.preventSingleQuotePubid = preventSingleQuotePubid

        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name):
        return self.toXmlName(name)

    def coerceComment(self, data):
        if self.preventDoubleDashComments:
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
            if data.endswith("-"):
                warnings.warn("Comments cannot end in a dash", DataLossWarning)
                data += " "
        return data

    def coerceCharacters(self, data):
        if self.replaceFormFeedCharacters:
            for _ in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data

    def coercePubid(self, data):
        dataOutput = data
        for char in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            dataOutput = dataOutput.replace(char, replacement)
        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
        return dataOutput

    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst

        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput

    def getReplacementCharacter(self, char):
        if char in self.replaceCache:
            replacement = self.replaceCache[char]
        else:
            replacement = self.escapeChar(char)
        return replacement

    def fromXmlName(self, name):
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        return chr(int(charcode[1:], 16))
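
A quick sketch of the coercion round-trip InfosetFilter implements, assuming the surrounding module's names (warnings, DataLossWarning, the regexps above) are in scope; "1" is not a legal first character of an XML name, so it gets escaped with the U%05X scheme and recovered with fromXmlName:

import warnings

f = InfosetFilter()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")        # suppress the DataLossWarning
    coerced = f.coerceElement("1bad")
assert coerced == "U00031bad"              # U+0031 ("1") escaped as "U00031"
assert f.fromXmlName(coerced) == "1bad"    # the escaping is reversible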
918
lib/bleach/_vendor/html5lib/_inputstream.py
Normal file
@ -0,0 +1,918 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type
from six.moves import http_client, urllib

import codecs
import re
from io import BytesIO, StringIO

import webencodings

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                              0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                              0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                              0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                              0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                              0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                              0x10FFFE, 0x10FFFF}

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)

    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)


def HTMLInputStream(source, **kwargs):
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source, http_client.HTTPResponse) or
        # Also check for addinfourl wrapping HTTPResponse
        (isinstance(source, urllib.response.addbase) and
         isinstance(source.fp, http_client.HTTPResponse))):
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        encodings = [x for x in kwargs if x.endswith("_encoding")]
        if encodings:
            raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

        return HTMLUnicodeInputStream(source, **kwargs)
    else:
        return HTMLBinaryInputStream(source, **kwargs)
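
HTMLInputStream itself is only a dispatcher over the two classes defined below: decoded text bypasses encoding detection entirely, while bytes go through it. A minimal sketch (runnable once both classes are defined):

# Text input is already decoded; encoding keyword arguments are rejected.
assert isinstance(HTMLInputStream("<p>hi</p>"), HTMLUnicodeInputStream)

# Byte input runs the detection machinery; with no BOM, <meta> or override,
# the result is only a tentative guess.
stream = HTMLInputStream(b"<p>hi</p>")
assert isinstance(stream, HTMLBinaryInputStream)
assert stream.charEncoding[1] == "tentative"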


class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """

        if not _utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
                # If the whole remainder of the chunk matched,
                # use it all and read the next chunk
                rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not EOF:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
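
A minimal sketch of the chunked character API above, fed from a plain string; note the newline normalisation and the 1-based line tracking:

stream = HTMLUnicodeInputStream("<p>\r\nhi</p>")
assert stream.char() == "<"
assert stream.charsUntil(">") == "p"   # stop just before the next ">"
assert stream.char() == ">"
assert stream.char() == "\n"           # "\r\n" was normalised to "\n"
assert stream.position() == (2, 0)     # (line, column) tracking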
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when detecting encoding with chardet
        self.numBytesChardet = 100
        # Things from args
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Determine encoding
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            stream = BufferedStream(stream)

        return stream

    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
            return lookupEncoding(encoding)
        else:
            self.rawStream.seek(0)
            return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")

        return encoding
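
determineEncoding tries its sources in a fixed precedence: BOM, override, transport, <meta> pre-scan, same-origin parent, likely guess, chardet, default. A small sketch of the two ends of the confidence scale:

import codecs

# A UTF-8 BOM wins outright and is marked "certain".
bom_stream = HTMLBinaryInputStream(codecs.BOM_UTF8 + b"<p>hi</p>")
assert bom_stream.charEncoding == (lookupEncoding("utf-8"), "certain")

# Without a BOM, a <meta charset> declaration is only "tentative".
meta_stream = HTMLBinaryInputStream(b'<meta charset="utf-8"><p>hi</p>')
assert meta_stream.charEncoding == (lookupEncoding("utf-8"), "tentative")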
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        rv = self.startswith(bytes, self.position)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        try:
            self._position = self.index(bytes, self.position) + len(bytes) - 1
        except ValueError:
            raise StopIteration
        return True
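
EncodingBytes is the lowercasing byte cursor the pre-parser walks; a minimal sketch of its cursor operations:

eb = EncodingBytes(b"<META Charset=UTF-8>")  # note: lowercased on creation
assert bytes(eb) == b"<meta charset=utf-8>"
eb.jumpTo(b"<")                              # position the cursor on "<"
assert eb.matchBytes(b"<meta")               # consume "<meta"...
assert eb.skip() == b"c"                     # ...and skip spaces to "charset"
eb.jumpTo(b"=")
assert eb.currentByte == b"="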
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        if b"<meta" not in self.data:
            return None

        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            try:
                self.data.jumpTo(b"<")
            except StopIteration:
                break
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # <meta is not followed by a space character, so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
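
Putting the pre-parser together: detectEncodingMeta feeds the first numBytesMeta bytes to EncodingParser. A minimal sketch on a raw byte prefix; note the WHATWG label "iso-8859-1" canonicalises to the windows-1252 codec:

prefix = (b'<!-- comment --><meta http-equiv="Content-Type" '
          b'content="text/html; charset=ISO-8859-1">')
parser = EncodingParser(prefix)
encoding = parser.getEncoding()
assert encoding.name == "windows-1252"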
class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is not None:
        try:
            return webencodings.lookup(encoding)
        except AttributeError:
            return None
    else:
        return None
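
lookupEncoding resolves WHATWG encoding labels via webencodings, so spelling variants collapse onto one canonical codec; a small sketch:

assert lookupEncoding("UTF8").name == "utf-8"
assert lookupEncoding(b"latin1").name == "windows-1252"  # WHATWG alias
assert lookupEncoding("no-such-encoding") is None        # unknown label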
1735
lib/bleach/_vendor/html5lib/_tokenizer.py
Normal file
File diff suppressed because it is too large
5
lib/bleach/_vendor/html5lib/_trie/__init__.py
Normal file
@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals

from .py import Trie

__all__ = ["Trie"]
40
lib/bleach/_vendor/html5lib/_trie/_base.py
Normal file
@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals

try:
    from collections.abc import Mapping
except ImportError:  # Python 2.7
    from collections import Mapping


class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
        # pylint:disable=arguments-differ
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        return {x for x in keys if x.startswith(prefix)}

    def has_keys_with_prefix(self, prefix):
        for key in self.keys():
            if key.startswith(prefix):
                return True

        return False

    def longest_prefix(self, prefix):
        if prefix in self:
            return prefix

        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
67
lib/bleach/_vendor/html5lib/_trie/py.py
Normal file
@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from bisect import bisect_left

from ._base import Trie as ABCTrie


class Trie(ABCTrie):
    def __init__(self, data):
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        if start == len(self._keys):
            return keys

        while self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        return self._keys[i].startswith(prefix)
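
The tokenizer uses this Trie for named character reference lookup, where the longest matching entity name wins; a minimal sketch with a hypothetical three-entry trie:

t = Trie({u"amp": u"&", u"ampersand": u"&", u"lt": u"<"})
assert t.has_keys_with_prefix(u"am")
assert t.keys(u"amp") == {u"amp", u"ampersand"}
assert t.longest_prefix(u"ampers") == u"amp"   # longest entity match wins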
159
lib/bleach/_vendor/html5lib/_utils.py
Normal file
@ -0,0 +1,159 @@
from __future__ import absolute_import, division, unicode_literals

from types import ModuleType

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

from six import text_type, PY3

if PY3:
    import xml.etree.ElementTree as default_etree
else:
    try:
        import xml.etree.cElementTree as default_etree
    except ImportError:
        import xml.etree.ElementTree as default_etree


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
           "surrogatePairToCodepoint", "moduleFactoryFactory",
           "supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
    _x = eval('"\\uD800"')  # pylint:disable=eval-used
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
        assert isinstance(_x, text_type)
except Exception:
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True


class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        _dictEntries = []
        for name, value in items:
            if isinstance(name, (list, tuple, frozenset, set)):
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        assert len(self) == len(_dictEntries)
        self.default = None

    def __getitem__(self, key):
        return dict.get(self, key, self.default)

    def __get__(self, instance, owner=None):
        return BoundMethodDispatcher(instance, self)


class BoundMethodDispatcher(Mapping):
    """Wraps a MethodDispatcher, binding its return values to `instance`"""
    def __init__(self, instance, dispatcher):
        self.instance = instance
        self.dispatcher = dispatcher

    def __getitem__(self, key):
        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
        # on a function, __get__ is used to bind a function to an instance as a bound method
        return self.dispatcher[key].__get__(self.instance)

    def get(self, key, default):
        if key in self.dispatcher:
            return self[key]
        else:
            return default

    def __iter__(self):
        return iter(self.dispatcher)

    def __len__(self):
        return len(self.dispatcher)

    def __contains__(self, key):
        return key in self.dispatcher
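
MethodDispatcher is how the tokenizer maps many input characters onto one handler, and the __get__ hook means handlers come back already bound. A minimal sketch with a hypothetical Tokenizer class (the names are illustrative, not html5lib's):

class Tokenizer(object):
    def handleSpace(self, char):
        return "space:%r" % char

    def handleOther(self, char):
        return "other:%r" % char

    # Tuple keys fan out to one handler; unknown keys fall back to `default`.
    dispatch = MethodDispatcher([((" ", "\t", "\n"), handleSpace)])
    dispatch.default = handleOther


tok = Tokenizer()
# Attribute access triggers __get__, so looked-up handlers arrive bound.
assert tok.dispatch[" "](" ") == "space:' '"
assert tok.dispatch["x"]("x") == "other:'x'"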
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds

def isSurrogatePair(data):
    return (len(data) == 2 and
            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)


def surrogatePairToCodepoint(data):
    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
                (ord(data[1]) - 0xDC00))
    return char_val


# Module Factory Factory (no, this isn't Java, I know)
|
||||||
|
# Here to stop this being duplicated all over the place.
|
||||||
|
|
||||||
|
|
||||||
|
def moduleFactoryFactory(factory):
|
||||||
|
moduleCache = {}
|
||||||
|
|
||||||
|
def moduleFactory(baseModule, *args, **kwargs):
|
||||||
|
if isinstance(ModuleType.__name__, type("")):
|
||||||
|
name = "_%s_factory" % baseModule.__name__
|
||||||
|
else:
|
||||||
|
name = b"_%s_factory" % baseModule.__name__
|
||||||
|
|
||||||
|
kwargs_tuple = tuple(kwargs.items())
|
||||||
|
|
||||||
|
try:
|
||||||
|
return moduleCache[name][args][kwargs_tuple]
|
||||||
|
except KeyError:
|
||||||
|
mod = ModuleType(name)
|
||||||
|
objs = factory(baseModule, *args, **kwargs)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
if "name" not in moduleCache:
|
||||||
|
moduleCache[name] = {}
|
||||||
|
if "args" not in moduleCache[name]:
|
||||||
|
moduleCache[name][args] = {}
|
||||||
|
if "kwargs" not in moduleCache[name][args]:
|
||||||
|
moduleCache[name][args][kwargs_tuple] = {}
|
||||||
|
moduleCache[name][args][kwargs_tuple] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
|
return moduleFactory
|
||||||
|
|
||||||
|
|
||||||
|
def memoize(func):
|
||||||
|
cache = {}
|
||||||
|
|
||||||
|
def wrapped(*args, **kwargs):
|
||||||
|
key = (tuple(args), tuple(kwargs.items()))
|
||||||
|
if key not in cache:
|
||||||
|
cache[key] = func(*args, **kwargs)
|
||||||
|
return cache[key]
|
||||||
|
|
||||||
|
return wrapped
|
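
The dispatch helper and the surrogate utilities above are easiest to see in action together. A minimal sketch, assuming the vendored package is importable as bleach._vendor.html5lib (matching the paths in this diff); the handler values are hypothetical:

    from bleach._vendor.html5lib._utils import MethodDispatcher, surrogatePairToCodepoint

    # Tuple keys fan out on construction, so one value serves several tag
    # names, and missing keys fall back to the .default attribute (None here).
    handlers = MethodDispatcher([(("b", "strong"), "bold"), ("em", "italic")])
    print(handlers["strong"])   # bold
    print(handlers["video"])    # None

    # 0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDE00 - 0xDC00) == 0x1F600
    print(hex(surrogatePairToCodepoint("\ud83d\ude00")))  # 0x1f600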
lib/bleach/_vendor/html5lib/constants.py (new file, 2946 lines)
File diff suppressed because it is too large
lib/bleach/_vendor/html5lib/filters/__init__.py (new file, 0 lines)
lib/bleach/_vendor/html5lib/filters/alphabeticalattributes.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from __future__ import absolute_import, division, unicode_literals

from . import base

from collections import OrderedDict


def _attr_key(attr):
    """Return an appropriate key for an attribute for sorting

    Attributes have a namespace that can be either ``None`` or a string. We
    can't compare the two because they're different types, so we convert
    ``None`` to an empty string first.

    """
    return (attr[0][0] or ''), attr[0][1]


class Filter(base.Filter):
    """Alphabetizes attributes for elements"""
    def __iter__(self):
        for token in base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                attrs = OrderedDict()
                for name, value in sorted(token["data"].items(),
                                          key=_attr_key):
                    attrs[name] = value
                token["data"] = attrs
            yield token
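
A quick sketch of what this filter does to a start-tag token, assuming the vendored import path from this diff; the token dict is hand-built for illustration:

    from bleach._vendor.html5lib.filters.alphabeticalattributes import Filter

    tokens = [{"type": "StartTag", "name": "a",
               "data": {(None, "title"): "t", (None, "href"): "h"}}]
    # Attributes come back sorted by (namespace or "", name): href before title.
    print(list(Filter(tokens))[0]["data"])
    # OrderedDict([((None, 'href'), 'h'), ((None, 'title'), 't')])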
lib/bleach/_vendor/html5lib/filters/base.py (new file, 12 lines)
@@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals


class Filter(object):
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)

    def __getattr__(self, name):
        return getattr(self.source, name)
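
This base class just proxies iteration and attribute access to its source, so concrete filters subclass it and override __iter__. A minimal sketch of such a subclass (the Upper filter here is hypothetical; vendored import path assumed):

    from bleach._vendor.html5lib.filters import base

    class Upper(base.Filter):
        def __iter__(self):
            for token in base.Filter.__iter__(self):
                if token["type"] == "Characters":
                    # Copy the token rather than mutating the source's dict.
                    token = dict(token, data=token["data"].upper())
                yield token

    print([t["data"] for t in Upper([{"type": "Characters", "data": "hi"}])])  # ['HI']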
lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py (new file, 73 lines)
@@ -0,0 +1,73 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
    def __init__(self, source, encoding):
        """Creates a Filter

        :arg source: the source token stream

        :arg encoding: the encoding to set

        """
        base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
        state = "pre_head"
        meta_found = (self.encoding is None)
        pending = []

        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    for (namespace, name), value in token["data"].items():
                        if namespace is not None:
                            continue
                        elif name.lower() == 'charset':
                            token["data"][(namespace, name)] = self.encoding
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                    else:
                        if has_http_equiv_content_type and (None, "content") in token["data"]:
                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
                            meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": {(None, "charset"): self.encoding}}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": {(None, "charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
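
Fed a head with no meta, the filter synthesizes one before replaying the buffered head contents. A minimal sketch with a hand-built token stream (vendored import path assumed):

    from bleach._vendor.html5lib.filters.inject_meta_charset import Filter

    tokens = [
        {"type": "StartTag", "name": "head", "data": {}},
        {"type": "EndTag", "name": "head"},
    ]
    for tok in Filter(tokens, "utf-8"):
        print(tok["type"], tok["name"])
    # StartTag head
    # EmptyTag meta   (with {(None, "charset"): "utf-8"})
    # EndTag head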
lib/bleach/_vendor/html5lib/filters/lint.py (new file, 93 lines)
@@ -0,0 +1,93 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type

from . import base
from ..constants import namespaces, voidElements

from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)


class Filter(base.Filter):
    """Lints the token stream for errors

    If it finds any errors, it'll raise an ``AssertionError``.

    """
    def __init__(self, source, require_matching_tags=True):
        """Creates a Filter

        :arg source: the source token stream

        :arg require_matching_tags: whether or not to require matching tags

        """
        super(Filter, self).__init__(source)
        self.require_matching_tags = require_matching_tags

    def __iter__(self):
        open_elements = []
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                assert isinstance(token["data"], dict)
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert type == "EmptyTag"
                else:
                    assert type == "StartTag"
                if type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (namespace, name), value in token["data"].items():
                    assert namespace is None or isinstance(namespace, text_type)
                    assert namespace != ""
                    assert isinstance(name, text_type)
                    assert name != ""
                    assert isinstance(value, text_type)

            elif type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
                elif self.require_matching_tags:
                    start = open_elements.pop()
                    assert start == (namespace, name)

            elif type == "Comment":
                data = token["data"]
                assert isinstance(data, text_type)

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data, text_type)
                assert data != ""
                if type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif type == "Doctype":
                name = token["name"]
                assert name is None or isinstance(name, text_type)
                assert token["publicId"] is None or isinstance(name, text_type)
                assert token["systemId"] is None or isinstance(name, text_type)

            elif type == "Entity":
                assert isinstance(token["name"], text_type)

            elif type == "SerializerError":
                assert isinstance(token["data"], text_type)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": type}

            yield token
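
A sketch of the lint filter tripping on an invalid stream, here a void element reported as an end tag (vendored import path assumed):

    from bleach._vendor.html5lib.filters.lint import Filter

    tokens = [{"type": "EndTag", "name": "br", "namespace": None}]
    try:
        list(Filter(tokens))  # consuming the generator runs the assertions
    except AssertionError as err:
        print(err)  # Void element reported as EndTag token: br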
lib/bleach/_vendor/html5lib/filters/optionaltags.py (new file, 207 lines)
@@ -0,0 +1,207 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Removes optional tags from the token stream"""
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                if (token["data"] or
                        not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if type in ("StartTag", "EmptyTag"):
                return True
            elif type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol',
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return type == "EndTag" or type is None
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False
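
The effect is easiest to see on a table row, whose td and tr end tags are omittable; a sketch with hand-built tokens (vendored import path assumed):

    from bleach._vendor.html5lib.filters.optionaltags import Filter

    tokens = [
        {"type": "StartTag", "name": "tr", "data": {}},
        {"type": "StartTag", "name": "td", "data": {}},
        {"type": "Characters", "data": "x"},
        {"type": "EndTag", "name": "td"},
        {"type": "EndTag", "name": "tr"},
    ]
    # Both end tags are dropped: </td> precedes </tr>, and </tr> ends the stream.
    print([t["name"] for t in Filter(tokens) if "name" in t])  # ['tr', 'td']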
lib/bleach/_vendor/html5lib/filters/sanitizer.py (new file, 916 lines)
@@ -0,0 +1,916 @@
"""Deprecated from html5lib 1.1.

See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.

"""
from __future__ import absolute_import, division, unicode_literals

import re
import warnings
from xml.sax.saxutils import escape, unescape

from six.moves import urllib_parse as urlparse

from . import base
from ..constants import namespaces, prefixes

__all__ = ["Filter"]


_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see " +
    "https://github.com/html5lib/html5lib-python/issues/443 and please let " +
    "us know if Bleach is unsuitable for your needs"
)

warnings.warn(_deprecation_msg, DeprecationWarning)

allowed_elements = frozenset((
    (namespaces['html'], 'a'),
    (namespaces['html'], 'abbr'),
    (namespaces['html'], 'acronym'),
    (namespaces['html'], 'address'),
    (namespaces['html'], 'area'),
    (namespaces['html'], 'article'),
    (namespaces['html'], 'aside'),
    (namespaces['html'], 'audio'),
    (namespaces['html'], 'b'),
    (namespaces['html'], 'big'),
    (namespaces['html'], 'blockquote'),
    (namespaces['html'], 'br'),
    (namespaces['html'], 'button'),
    (namespaces['html'], 'canvas'),
    (namespaces['html'], 'caption'),
    (namespaces['html'], 'center'),
    (namespaces['html'], 'cite'),
    (namespaces['html'], 'code'),
    (namespaces['html'], 'col'),
    (namespaces['html'], 'colgroup'),
    (namespaces['html'], 'command'),
    (namespaces['html'], 'datagrid'),
    (namespaces['html'], 'datalist'),
    (namespaces['html'], 'dd'),
    (namespaces['html'], 'del'),
    (namespaces['html'], 'details'),
    (namespaces['html'], 'dfn'),
    (namespaces['html'], 'dialog'),
    (namespaces['html'], 'dir'),
    (namespaces['html'], 'div'),
    (namespaces['html'], 'dl'),
    (namespaces['html'], 'dt'),
    (namespaces['html'], 'em'),
    (namespaces['html'], 'event-source'),
    (namespaces['html'], 'fieldset'),
    (namespaces['html'], 'figcaption'),
    (namespaces['html'], 'figure'),
    (namespaces['html'], 'footer'),
    (namespaces['html'], 'font'),
    (namespaces['html'], 'form'),
    (namespaces['html'], 'header'),
    (namespaces['html'], 'h1'),
    (namespaces['html'], 'h2'),
    (namespaces['html'], 'h3'),
    (namespaces['html'], 'h4'),
    (namespaces['html'], 'h5'),
    (namespaces['html'], 'h6'),
    (namespaces['html'], 'hr'),
    (namespaces['html'], 'i'),
    (namespaces['html'], 'img'),
    (namespaces['html'], 'input'),
    (namespaces['html'], 'ins'),
    (namespaces['html'], 'keygen'),
    (namespaces['html'], 'kbd'),
    (namespaces['html'], 'label'),
    (namespaces['html'], 'legend'),
    (namespaces['html'], 'li'),
    (namespaces['html'], 'm'),
    (namespaces['html'], 'map'),
    (namespaces['html'], 'menu'),
    (namespaces['html'], 'meter'),
    (namespaces['html'], 'multicol'),
    (namespaces['html'], 'nav'),
    (namespaces['html'], 'nextid'),
    (namespaces['html'], 'ol'),
    (namespaces['html'], 'output'),
    (namespaces['html'], 'optgroup'),
    (namespaces['html'], 'option'),
    (namespaces['html'], 'p'),
    (namespaces['html'], 'pre'),
    (namespaces['html'], 'progress'),
    (namespaces['html'], 'q'),
    (namespaces['html'], 's'),
    (namespaces['html'], 'samp'),
    (namespaces['html'], 'section'),
    (namespaces['html'], 'select'),
    (namespaces['html'], 'small'),
    (namespaces['html'], 'sound'),
    (namespaces['html'], 'source'),
    (namespaces['html'], 'spacer'),
    (namespaces['html'], 'span'),
    (namespaces['html'], 'strike'),
    (namespaces['html'], 'strong'),
    (namespaces['html'], 'sub'),
    (namespaces['html'], 'sup'),
    (namespaces['html'], 'table'),
    (namespaces['html'], 'tbody'),
    (namespaces['html'], 'td'),
    (namespaces['html'], 'textarea'),
    (namespaces['html'], 'time'),
    (namespaces['html'], 'tfoot'),
    (namespaces['html'], 'th'),
    (namespaces['html'], 'thead'),
    (namespaces['html'], 'tr'),
    (namespaces['html'], 'tt'),
    (namespaces['html'], 'u'),
    (namespaces['html'], 'ul'),
    (namespaces['html'], 'var'),
    (namespaces['html'], 'video'),
    (namespaces['mathml'], 'maction'),
    (namespaces['mathml'], 'math'),
    (namespaces['mathml'], 'merror'),
    (namespaces['mathml'], 'mfrac'),
    (namespaces['mathml'], 'mi'),
    (namespaces['mathml'], 'mmultiscripts'),
    (namespaces['mathml'], 'mn'),
    (namespaces['mathml'], 'mo'),
    (namespaces['mathml'], 'mover'),
    (namespaces['mathml'], 'mpadded'),
    (namespaces['mathml'], 'mphantom'),
    (namespaces['mathml'], 'mprescripts'),
    (namespaces['mathml'], 'mroot'),
    (namespaces['mathml'], 'mrow'),
    (namespaces['mathml'], 'mspace'),
    (namespaces['mathml'], 'msqrt'),
    (namespaces['mathml'], 'mstyle'),
    (namespaces['mathml'], 'msub'),
    (namespaces['mathml'], 'msubsup'),
    (namespaces['mathml'], 'msup'),
    (namespaces['mathml'], 'mtable'),
    (namespaces['mathml'], 'mtd'),
    (namespaces['mathml'], 'mtext'),
    (namespaces['mathml'], 'mtr'),
    (namespaces['mathml'], 'munder'),
    (namespaces['mathml'], 'munderover'),
    (namespaces['mathml'], 'none'),
    (namespaces['svg'], 'a'),
    (namespaces['svg'], 'animate'),
    (namespaces['svg'], 'animateColor'),
    (namespaces['svg'], 'animateMotion'),
    (namespaces['svg'], 'animateTransform'),
    (namespaces['svg'], 'clipPath'),
    (namespaces['svg'], 'circle'),
    (namespaces['svg'], 'defs'),
    (namespaces['svg'], 'desc'),
    (namespaces['svg'], 'ellipse'),
    (namespaces['svg'], 'font-face'),
    (namespaces['svg'], 'font-face-name'),
    (namespaces['svg'], 'font-face-src'),
    (namespaces['svg'], 'g'),
    (namespaces['svg'], 'glyph'),
    (namespaces['svg'], 'hkern'),
    (namespaces['svg'], 'linearGradient'),
    (namespaces['svg'], 'line'),
    (namespaces['svg'], 'marker'),
    (namespaces['svg'], 'metadata'),
    (namespaces['svg'], 'missing-glyph'),
    (namespaces['svg'], 'mpath'),
    (namespaces['svg'], 'path'),
    (namespaces['svg'], 'polygon'),
    (namespaces['svg'], 'polyline'),
    (namespaces['svg'], 'radialGradient'),
    (namespaces['svg'], 'rect'),
    (namespaces['svg'], 'set'),
    (namespaces['svg'], 'stop'),
    (namespaces['svg'], 'svg'),
    (namespaces['svg'], 'switch'),
    (namespaces['svg'], 'text'),
    (namespaces['svg'], 'title'),
    (namespaces['svg'], 'tspan'),
    (namespaces['svg'], 'use'),
))

allowed_attributes = frozenset((
    # HTML attributes
    (None, 'abbr'),
    (None, 'accept'),
    (None, 'accept-charset'),
    (None, 'accesskey'),
    (None, 'action'),
    (None, 'align'),
    (None, 'alt'),
    (None, 'autocomplete'),
    (None, 'autofocus'),
    (None, 'axis'),
    (None, 'background'),
    (None, 'balance'),
    (None, 'bgcolor'),
    (None, 'bgproperties'),
    (None, 'border'),
    (None, 'bordercolor'),
    (None, 'bordercolordark'),
    (None, 'bordercolorlight'),
    (None, 'bottompadding'),
    (None, 'cellpadding'),
    (None, 'cellspacing'),
    (None, 'ch'),
    (None, 'challenge'),
    (None, 'char'),
    (None, 'charoff'),
    (None, 'choff'),
    (None, 'charset'),
    (None, 'checked'),
    (None, 'cite'),
    (None, 'class'),
    (None, 'clear'),
    (None, 'color'),
    (None, 'cols'),
    (None, 'colspan'),
    (None, 'compact'),
    (None, 'contenteditable'),
    (None, 'controls'),
    (None, 'coords'),
    (None, 'data'),
    (None, 'datafld'),
    (None, 'datapagesize'),
    (None, 'datasrc'),
    (None, 'datetime'),
    (None, 'default'),
    (None, 'delay'),
    (None, 'dir'),
    (None, 'disabled'),
    (None, 'draggable'),
    (None, 'dynsrc'),
    (None, 'enctype'),
    (None, 'end'),
    (None, 'face'),
    (None, 'for'),
    (None, 'form'),
    (None, 'frame'),
    (None, 'galleryimg'),
    (None, 'gutter'),
    (None, 'headers'),
    (None, 'height'),
    (None, 'hidefocus'),
    (None, 'hidden'),
    (None, 'high'),
    (None, 'href'),
    (None, 'hreflang'),
    (None, 'hspace'),
    (None, 'icon'),
    (None, 'id'),
    (None, 'inputmode'),
    (None, 'ismap'),
    (None, 'keytype'),
    (None, 'label'),
    (None, 'leftspacing'),
    (None, 'lang'),
    (None, 'list'),
    (None, 'longdesc'),
    (None, 'loop'),
    (None, 'loopcount'),
    (None, 'loopend'),
    (None, 'loopstart'),
    (None, 'low'),
    (None, 'lowsrc'),
    (None, 'max'),
    (None, 'maxlength'),
    (None, 'media'),
    (None, 'method'),
    (None, 'min'),
    (None, 'multiple'),
    (None, 'name'),
    (None, 'nohref'),
    (None, 'noshade'),
    (None, 'nowrap'),
    (None, 'open'),
    (None, 'optimum'),
    (None, 'pattern'),
    (None, 'ping'),
    (None, 'point-size'),
    (None, 'poster'),
    (None, 'pqg'),
    (None, 'preload'),
    (None, 'prompt'),
    (None, 'radiogroup'),
    (None, 'readonly'),
    (None, 'rel'),
    (None, 'repeat-max'),
    (None, 'repeat-min'),
    (None, 'replace'),
    (None, 'required'),
    (None, 'rev'),
    (None, 'rightspacing'),
    (None, 'rows'),
    (None, 'rowspan'),
    (None, 'rules'),
    (None, 'scope'),
    (None, 'selected'),
    (None, 'shape'),
    (None, 'size'),
    (None, 'span'),
    (None, 'src'),
    (None, 'start'),
    (None, 'step'),
    (None, 'style'),
    (None, 'summary'),
    (None, 'suppress'),
    (None, 'tabindex'),
    (None, 'target'),
    (None, 'template'),
    (None, 'title'),
    (None, 'toppadding'),
    (None, 'type'),
    (None, 'unselectable'),
    (None, 'usemap'),
    (None, 'urn'),
    (None, 'valign'),
    (None, 'value'),
    (None, 'variable'),
    (None, 'volume'),
    (None, 'vspace'),
    (None, 'vrml'),
    (None, 'width'),
    (None, 'wrap'),
    (namespaces['xml'], 'lang'),
    # MathML attributes
    (None, 'actiontype'),
    (None, 'align'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnlines'),
    (None, 'columnspacing'),
    (None, 'columnspan'),
    (None, 'depth'),
    (None, 'display'),
    (None, 'displaystyle'),
    (None, 'equalcolumns'),
    (None, 'equalrows'),
    (None, 'fence'),
    (None, 'fontstyle'),
    (None, 'fontweight'),
    (None, 'frame'),
    (None, 'height'),
    (None, 'linethickness'),
    (None, 'lspace'),
    (None, 'mathbackground'),
    (None, 'mathcolor'),
    (None, 'mathvariant'),
    (None, 'mathvariant'),
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowlines'),
    (None, 'rowspacing'),
    (None, 'rowspan'),
    (None, 'rspace'),
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'stretchy'),
    (None, 'width'),
    (None, 'width'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'type'),
    # SVG attributes
    (None, 'accent-height'),
    (None, 'accumulate'),
    (None, 'additive'),
    (None, 'alphabetic'),
    (None, 'arabic-form'),
    (None, 'ascent'),
    (None, 'attributeName'),
    (None, 'attributeType'),
    (None, 'baseProfile'),
    (None, 'bbox'),
    (None, 'begin'),
    (None, 'by'),
    (None, 'calcMode'),
    (None, 'cap-height'),
    (None, 'class'),
    (None, 'clip-path'),
    (None, 'color'),
    (None, 'color-rendering'),
    (None, 'content'),
    (None, 'cx'),
    (None, 'cy'),
    (None, 'd'),
    (None, 'dx'),
    (None, 'dy'),
    (None, 'descent'),
    (None, 'display'),
    (None, 'dur'),
    (None, 'end'),
    (None, 'fill'),
    (None, 'fill-opacity'),
    (None, 'fill-rule'),
    (None, 'font-family'),
    (None, 'font-size'),
    (None, 'font-stretch'),
    (None, 'font-style'),
    (None, 'font-variant'),
    (None, 'font-weight'),
    (None, 'from'),
    (None, 'fx'),
    (None, 'fy'),
    (None, 'g1'),
    (None, 'g2'),
    (None, 'glyph-name'),
    (None, 'gradientUnits'),
    (None, 'hanging'),
    (None, 'height'),
    (None, 'horiz-adv-x'),
    (None, 'horiz-origin-x'),
    (None, 'id'),
    (None, 'ideographic'),
    (None, 'k'),
    (None, 'keyPoints'),
    (None, 'keySplines'),
    (None, 'keyTimes'),
    (None, 'lang'),
    (None, 'marker-end'),
    (None, 'marker-mid'),
    (None, 'marker-start'),
    (None, 'markerHeight'),
    (None, 'markerUnits'),
    (None, 'markerWidth'),
    (None, 'mathematical'),
    (None, 'max'),
    (None, 'min'),
    (None, 'name'),
    (None, 'offset'),
    (None, 'opacity'),
    (None, 'orient'),
    (None, 'origin'),
    (None, 'overline-position'),
    (None, 'overline-thickness'),
    (None, 'panose-1'),
    (None, 'path'),
    (None, 'pathLength'),
    (None, 'points'),
    (None, 'preserveAspectRatio'),
    (None, 'r'),
    (None, 'refX'),
    (None, 'refY'),
    (None, 'repeatCount'),
    (None, 'repeatDur'),
    (None, 'requiredExtensions'),
    (None, 'requiredFeatures'),
    (None, 'restart'),
    (None, 'rotate'),
    (None, 'rx'),
    (None, 'ry'),
    (None, 'slope'),
    (None, 'stemh'),
    (None, 'stemv'),
    (None, 'stop-color'),
    (None, 'stop-opacity'),
    (None, 'strikethrough-position'),
    (None, 'strikethrough-thickness'),
    (None, 'stroke'),
    (None, 'stroke-dasharray'),
    (None, 'stroke-dashoffset'),
    (None, 'stroke-linecap'),
    (None, 'stroke-linejoin'),
    (None, 'stroke-miterlimit'),
    (None, 'stroke-opacity'),
    (None, 'stroke-width'),
    (None, 'systemLanguage'),
    (None, 'target'),
    (None, 'text-anchor'),
    (None, 'to'),
    (None, 'transform'),
    (None, 'type'),
    (None, 'u1'),
    (None, 'u2'),
    (None, 'underline-position'),
    (None, 'underline-thickness'),
    (None, 'unicode'),
    (None, 'unicode-range'),
    (None, 'units-per-em'),
    (None, 'values'),
    (None, 'version'),
    (None, 'viewBox'),
    (None, 'visibility'),
    (None, 'width'),
    (None, 'widths'),
    (None, 'x'),
    (None, 'x-height'),
    (None, 'x1'),
    (None, 'x2'),
    (namespaces['xlink'], 'actuate'),
    (namespaces['xlink'], 'arcrole'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'role'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'title'),
    (namespaces['xlink'], 'type'),
    (namespaces['xml'], 'base'),
    (namespaces['xml'], 'lang'),
    (namespaces['xml'], 'space'),
    (None, 'y'),
    (None, 'y1'),
    (None, 'y2'),
    (None, 'zoomAndPan'),
))

attr_val_is_uri = frozenset((
    (None, 'href'),
    (None, 'src'),
    (None, 'cite'),
    (None, 'action'),
    (None, 'longdesc'),
    (None, 'poster'),
    (None, 'background'),
    (None, 'datasrc'),
    (None, 'dynsrc'),
    (None, 'lowsrc'),
    (None, 'ping'),
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
))

svg_attr_val_allows_ref = frozenset((
    (None, 'clip-path'),
    (None, 'color-profile'),
    (None, 'cursor'),
    (None, 'fill'),
    (None, 'filter'),
    (None, 'marker'),
    (None, 'marker-start'),
    (None, 'marker-mid'),
    (None, 'marker-end'),
    (None, 'mask'),
    (None, 'stroke'),
))

svg_allow_local_href = frozenset((
    (None, 'altGlyph'),
    (None, 'animate'),
    (None, 'animateColor'),
    (None, 'animateMotion'),
    (None, 'animateTransform'),
    (None, 'cursor'),
    (None, 'feImage'),
    (None, 'filter'),
    (None, 'linearGradient'),
    (None, 'pattern'),
    (None, 'radialGradient'),
    (None, 'textpath'),
    (None, 'tref'),
    (None, 'set'),
    (None, 'use')
))

allowed_css_properties = frozenset((
    'azimuth',
    'background-color',
    'border-bottom-color',
    'border-collapse',
    'border-color',
    'border-left-color',
    'border-right-color',
    'border-top-color',
    'clear',
    'color',
    'cursor',
    'direction',
    'display',
    'elevation',
    'float',
    'font',
    'font-family',
    'font-size',
    'font-style',
    'font-variant',
    'font-weight',
    'height',
    'letter-spacing',
    'line-height',
    'overflow',
    'pause',
    'pause-after',
    'pause-before',
    'pitch',
    'pitch-range',
    'richness',
    'speak',
    'speak-header',
    'speak-numeral',
    'speak-punctuation',
    'speech-rate',
    'stress',
    'text-align',
    'text-decoration',
    'text-indent',
    'unicode-bidi',
    'vertical-align',
    'voice-family',
    'volume',
    'white-space',
    'width',
))

allowed_css_keywords = frozenset((
    'auto',
    'aqua',
    'black',
    'block',
    'blue',
    'bold',
    'both',
    'bottom',
    'brown',
    'center',
    'collapse',
    'dashed',
    'dotted',
    'fuchsia',
    'gray',
    'green',
    '!important',
    'italic',
    'left',
    'lime',
    'maroon',
    'medium',
    'none',
    'navy',
    'normal',
    'nowrap',
    'olive',
    'pointer',
    'purple',
    'red',
    'right',
    'solid',
    'silver',
    'teal',
    'top',
    'transparent',
    'underline',
    'white',
    'yellow',
))

allowed_svg_properties = frozenset((
    'fill',
    'fill-opacity',
    'fill-rule',
    'stroke',
    'stroke-width',
    'stroke-linecap',
    'stroke-linejoin',
    'stroke-opacity',
))

allowed_protocols = frozenset((
    'ed2k',
    'ftp',
    'http',
    'https',
    'irc',
    'mailto',
    'news',
    'gopher',
    'nntp',
    'telnet',
    'webcal',
    'xmpp',
    'callto',
    'feed',
    'urn',
    'aim',
    'rsync',
    'tag',
    'ssh',
    'sftp',
    'rtsp',
    'afs',
    'data',
))

allowed_content_types = frozenset((
    'image/png',
    'image/jpeg',
    'image/gif',
    'image/webp',
    'image/bmp',
    'text/plain',
))


data_content_type = re.compile(r'''
                               ^
                               # Match a content type <application>/<type>
                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                               # Match any character set and encoding
                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                               # Assume the rest is data
                               ,.*
                               $
                               ''',
                               re.VERBOSE)


class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs--these are removed

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):

        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            pass
        else:
            return token

    def allowed_token(self, token):
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    if uri.scheme == 'data':
                        m = data_content_type.match(uri.path)
                        if not m:
                            del attrs[attr]
                        elif m.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
                                                                         attrs[(namespaces['xlink'], 'href')])):
                del attrs[(namespaces['xlink'], 'href')]
            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
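
sanitize_css keeps only whitelisted properties and drops the rest; a minimal sketch (vendored import path assumed; note that importing the module already emits the DeprecationWarning above):

    from bleach._vendor.html5lib.filters.sanitizer import Filter

    f = Filter([])  # empty token source; we only call the CSS helper
    print(f.sanitize_css("color: red; position: fixed"))  # color: red;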
lib/bleach/_vendor/html5lib/filters/whitespace.py (new file, 38 lines)
@@ -0,0 +1,38 @@
from __future__ import absolute_import, division, unicode_literals

import re

from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)

SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


class Filter(base.Filter):
    """Collapses whitespace except in pre, textarea, and script elements"""
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
                    and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = " "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token


def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
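
A sketch of the collapsing behavior on a Characters token (vendored import path assumed):

    from bleach._vendor.html5lib.filters.whitespace import Filter

    tokens = [{"type": "Characters", "data": "a   b\n\nc"}]
    # Runs of spaces, tabs, and newlines collapse to a single space.
    print(next(iter(Filter(tokens)))["data"])  # a b c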
lib/bleach/_vendor/html5lib/html5parser.py (new file, 2795 lines)
File diff suppressed because it is too large
lib/bleach/_vendor/html5lib/serializer.py (new file, 409 lines)
|
@ -0,0 +1,409 @@
|
from __future__ import absolute_import, division, unicode_literals
from six import text_type

import re

from codecs import register_error, xmlcharrefreplace_errors

from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape

_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")


_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)
        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k


def htmlentityreplace_errors(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                skip = False
                continue
            index = i + exc.start
            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        for cp in codepoints:
            e = _encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)


register_error("htmlentityreplace", htmlentityreplace_errors)
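The error handler just registered is what HTMLSerializer.encode() relies on: once this module has been imported, any str.encode() call can name the "htmlentityreplace" handler, and characters that do not fit the target encoding come back as named or numeric character references. A minimal sketch (output shown is approximate):

# Importing this module runs register_error() above, installing the
# "htmlentityreplace" handler process-wide.
text = "caf\u00e9 \u2603"  # 'café' plus a snowman
print(text.encode("ascii", "htmlentityreplace"))
# b'caf&eacute; &#x2603;' -- U+00E9 has a named entity, U+2603 does not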

def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if len(unexpected_args) > 0:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
            return "".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass
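Most of the options documented above can be seen working together in one place; a short sketch, assuming the vendored import path lib/bleach/_vendor (upstream, the package is importable as plain html5lib):

from bleach._vendor import html5lib
from bleach._vendor.html5lib.serializer import HTMLSerializer

doc = html5lib.parse('<img src="logo.png" alt="a \'logo\'">')
walker = html5lib.getTreeWalker("etree")
s = HTMLSerializer(quote_attr_values="always",  # quote even "safe" values
                   use_trailing_solidus=True)   # emit <img ... /> for void elements
print(s.render(walker(doc)))
# roughly: <img src="logo.png" alt="a 'logo'" /> (optional tags omitted by default)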
30 lib/bleach/_vendor/html5lib/treeadapters/__init__.py Normal file
@@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another

Example:

.. code-block:: python

   import html5lib
   from html5lib.treeadapters import genshi

   doc = '<html><body>Hi!</body></html>'
   treebuilder = html5lib.getTreeBuilder('etree')
   parser = html5lib.HTMLParser(tree=treebuilder)
   tree = parser.parse(doc)
   TreeWalker = html5lib.getTreeWalker('etree')

   genshi_tree = genshi.to_genshi(TreeWalker(tree))

"""
from __future__ import absolute_import, division, unicode_literals

from . import sax

__all__ = ["sax"]

try:
    from . import genshi  # noqa
except ImportError:
    pass
else:
    __all__.append("genshi")
54 lib/bleach/_vendor/html5lib/treeadapters/genshi.py Normal file
@@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE


def to_genshi(walker):
    """Convert a tree to a genshi tree

    :arg walker: the treewalker to use to walk the tree to convert it

    :returns: generator of genshi nodes

    """
    text = []
    for token in walker:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            text.append(token["data"])
        elif text:
            yield TEXT, "".join(text), (None, -1, -1)
            text = []

        if type in ("StartTag", "EmptyTag"):
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                           for attr, value in token["data"].items()])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if type == "EmptyTag":
                type = "EndTag"

        if type == "EndTag":
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]

            yield END, QName(name), (None, -1, -1)

        elif type == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)

        elif type == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)

        else:
            pass  # FIXME: What to do?

    if text:
        yield TEXT, "".join(text), (None, -1, -1)
50 lib/bleach/_vendor/html5lib/treeadapters/sax.py Normal file
@@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is not None:
        prefix_mapping[prefix] = namespace


def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    :arg walker: the treewalker to use to walk the tree to convert it

    :arg handler: SAX handler to use

    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        type = token["type"]
        if type == "Doctype":
            continue
        elif type in ("StartTag", "EmptyTag"):
            attrs = AttributesNSImpl(token["data"],
                                     unadjustForeignAttributes)
            handler.startElementNS((token["namespace"], token["name"]),
                                   token["name"],
                                   attrs)
            if type == "EmptyTag":
                handler.endElementNS((token["namespace"], token["name"]),
                                     token["name"])
        elif type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif type == "Comment":
            pass
        else:
            assert False, "Unknown token type"

    for prefix, namespace in prefix_mapping.items():
        handler.endPrefixMapping(prefix)
    handler.endDocument()
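to_sax() drives any SAX2 content handler; xml.sax.handler.ContentHandler is a no-op base class, so a toy handler that counts start tags is enough to sketch the flow (vendored import path assumed):

from xml.sax.handler import ContentHandler

from bleach._vendor import html5lib
from bleach._vendor.html5lib.treeadapters.sax import to_sax


class TagCounter(ContentHandler):
    # Toy handler: count start tags as the walker replays the tree.
    def __init__(self):
        ContentHandler.__init__(self)
        self.count = 0

    def startElementNS(self, name, qname, attrs):
        self.count += 1


tree = html5lib.parse("<p>one</p><p>two</p>")
walker = html5lib.getTreeWalker("etree")
handler = TagCounter()
to_sax(walker(tree), handler)
print(handler.count)  # 5: html, head, body, and the two p elements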
88 lib/bleach/_vendor/html5lib/treebuilders/__init__.py Normal file
@@ -0,0 +1,88 @@
"""A collection of modules for building different kinds of trees from HTML
documents.

To create a treebuilder for a new type of tree, you need to
implement several things:

1. A set of classes for various types of elements: Document, Doctype, Comment,
   Element. These must implement the interface of ``base.treebuilders.Node``
   (although comment nodes have a different signature for their constructor,
   see ``treebuilders.etree.Comment``). Textual content may also be implemented
   as another node type, or not, as your tree implementation requires.

2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
   from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:

   * ``documentClass`` - the class to use for the bottommost node of a document
   * ``elementClass`` - the class to use for HTML Elements
   * ``commentClass`` - the class to use for comments
   * ``doctypeClass`` - the class to use for doctypes

   It also has one required method:

   * ``getDocument`` - Returns the root node of the complete document tree

3. If you wish to run the unit tests, you must also create a ``testSerializer``
   method on your treebuilder which accepts a node and returns a string
   containing the node and its children serialized according to the format used
   in the unittests

"""

from __future__ import absolute_import, division, unicode_literals

from .._utils import default_etree

treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of trees with built-in support

    :arg treeType: the name of the tree type required (case-insensitive). Supported
        values are:

        * "dom" - A generic builder for DOM implementations, defaulting to a
          xml.dom.minidom based implementation.
        * "etree" - A generic builder for tree implementations exposing an
          ElementTree-like interface, defaulting to xml.etree.cElementTree if
          available and xml.etree.ElementTree if not.
        * "lxml" - An etree-based builder for lxml.etree, handling limitations
          of lxml's implementation.

    :arg implementation: (Currently applies to the "etree" and "dom" tree
        types). A module implementing the tree type e.g. xml.etree.ElementTree
        or xml.etree.cElementTree.

    :arg kwargs: Any additional options to pass to the TreeBuilder when
        creating it.

    Example:

    >>> from html5lib.treebuilders import getTreeBuilder
    >>> builder = getTreeBuilder('etree')

    """

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
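getTreeBuilder's implementation argument is easy to overlook; a brief sketch pinning the etree builder to a specific stdlib module (vendored import path assumed):

import xml.etree.ElementTree as ET

from bleach._vendor import html5lib
from bleach._vendor.html5lib.treebuilders import getTreeBuilder

# Ask for an etree-backed builder explicitly bound to the stdlib module.
TreeBuilder = getTreeBuilder("etree", implementation=ET)
parser = html5lib.HTMLParser(tree=TreeBuilder)
root = parser.parse("<p>Hi!</p>")
print(root)  # an xml.etree Element wrapping the parsed <html> node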
417 lib/bleach/_vendor/html5lib/treebuilders/base.py Normal file
@@ -0,0 +1,417 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from ..constants import scopingElements, tableInsertModeElements, namespaces

# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None

listElementsMap = {
    None: (frozenset(scopingElements), False),
    "button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
    "list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
                                          (namespaces["html"], "ul")}), False),
    "table": (frozenset([(namespaces["html"], "html"),
                         (namespaces["html"], "table")]), False),
    "select": (frozenset([(namespaces["html"], "optgroup"),
                          (namespaces["html"], "option")]), True)
}


class Node(object):
    """Represents an item in the tree"""
    def __init__(self, name):
        """Creates a Node

        :arg name: The tag name associated with the node

        """
        # The tag name associated with the node
        self.name = name
        # The parent of the current node (or None for the document node)
        self.parent = None
        # The value of the current node (applies to text nodes and comments)
        self.value = None
        # A dict holding name -> value pairs for attributes of the node
        self.attributes = {}
        # A list of child nodes of the current node. This must include all
        # elements but not necessarily other node types.
        self.childNodes = []
        # A list of miscellaneous flags that can be set on the node.
        self._flags = []

    def __str__(self):
        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
                                  for name, value in
                                  self.attributes.items()])
        if attributesStr:
            return "<%s %s>" % (self.name, attributesStr)
        else:
            return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s>" % (self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node

        :arg node: the node to insert

        """
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text.

        :arg data: the data to insert

        :arg insertBefore: True if you want to insert the text before the node
            and False if you want to insert it after the node

        """
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in the
        list of child nodes. Raises ValueError if refNode is not a child of
        the current node

        :arg node: the node to insert

        :arg refNode: the child node to insert the node before

        """
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node

        :arg node: the child node to remove

        """
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.
        This is needed so that trees that don't store text as nodes move the
        text in the correct way

        :arg newParent: the node to move all this node's children to

        """
        # XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the same
        name and attributes but with no parent or child nodes
        """
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text, false otherwise
        """
        raise NotImplementedError


class ActiveFormattingElements(list):
    def append(self, node):
        equalCount = 0
        if node != Marker:
            for element in self[::-1]:
                if element == Marker:
                    break
                if self.nodesEqual(element, node):
                    equalCount += 1
                if equalCount == 3:
                    self.remove(element)
                    break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        if not node1.nameTuple == node2.nameTuple:
            return False

        if not node1.attributes == node2.attributes:
            return False

        return True


class TreeBuilder(object):
    """Base treebuilder implementation

    * documentClass - the class to use for the bottommost node of a document
    * elementClass - the class to use for HTML Elements
    * commentClass - the class to use for comments
    * doctypeClass - the class to use for doctypes

    """
    # pylint:disable=not-callable

    # Document class
    documentClass = None

    # The class to use for creating a node
    elementClass = None

    # The class to use for creating comments
    commentClass = None

    # The class to use for creating doctypes
    doctypeClass = None

    # Fragment class
    fragmentClass = None

    def __init__(self, namespaceHTMLElements):
        """Create a TreeBuilder

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        """
        if namespaceHTMLElements:
            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
        else:
            self.defaultNamespace = None
        self.reset()

    def reset(self):
        self.openElements = []
        self.activeFormattingElements = ActiveFormattingElements()

        # XXX - rename these to headElement, formElement
        self.headPointer = None
        self.formPointer = None

        self.insertFromTable = False

        self.document = self.documentClass()

    def elementInScope(self, target, variant=None):

        # If we pass a node in we match that. If we pass a string
        # match any node with that name
        exactNode = hasattr(target, "nameTuple")
        if not exactNode:
            if isinstance(target, text_type):
                target = (namespaces["html"], target)
            assert isinstance(target, tuple)

        listElements, invert = listElementsMap[variant]

        for node in reversed(self.openElements):
            if exactNode and node == target:
                return True
            elif not exactNode and node.nameTuple == target:
                return True
            elif (invert ^ (node.nameTuple in listElements)):
                return False

        assert False  # We should never reach this point

    def reconstructActiveFormattingElements(self):
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.activeFormattingElements:
            return

        # Step 2 and step 3: we start with the last element. So i is -1.
        i = len(self.activeFormattingElements) - 1
        entry = self.activeFormattingElements[i]
        if entry == Marker or entry in self.openElements:
            return

        # Step 6
        while entry != Marker and entry not in self.openElements:
            if i == 0:
                # This will be reset to 0 below
                i = -1
                break
            i -= 1
            # Step 5: let entry be one earlier in the list.
            entry = self.activeFormattingElements[i]

        while True:
            # Step 7
            i += 1

            # Step 8
            entry = self.activeFormattingElements[i]
            clone = entry.cloneNode()  # Mainly to get a new copy of the attributes

            # Step 9
            element = self.insertElement({"type": "StartTag",
                                          "name": clone.name,
                                          "namespace": clone.namespace,
                                          "data": clone.attributes})

            # Step 10
            self.activeFormattingElements[i] = element

            # Step 11
            if element == self.activeFormattingElements[-1]:
                break

    def clearActiveFormattingElements(self):
        entry = self.activeFormattingElements.pop()
        while self.activeFormattingElements and entry != Marker:
            entry = self.activeFormattingElements.pop()

    def elementInActiveFormattingElements(self, name):
        """Check if an element exists between the end of the active
        formatting elements and the last marker. If it does, return it, else
        return false"""

        for item in self.activeFormattingElements[::-1]:
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item == Marker:
                break
            elif item.name == name:
                return item
        return False

    def insertRoot(self, token):
        element = self.createElement(token)
        self.openElements.append(element)
        self.document.appendChild(element)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = self.doctypeClass(name, publicId, systemId)
        self.document.appendChild(doctype)

    def insertComment(self, token, parent=None):
        if parent is None:
            parent = self.openElements[-1]
        parent.appendChild(self.commentClass(token["data"]))

    def createElement(self, token):
        """Create an element but don't insert it anywhere"""
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        return element

    def _getInsertFromTable(self):
        return self._insertFromTable

    def _setInsertFromTable(self, value):
        """Switch the function used to insert an element from the
        normal one to the misnested table one and back again"""
        self._insertFromTable = value
        if value:
            self.insertElement = self.insertElementTable
        else:
            self.insertElement = self.insertElementNormal

    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

    def insertElementNormal(self, token):
        name = token["name"]
        assert isinstance(name, text_type), "Element %s not unicode" % name
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        self.openElements[-1].appendChild(element)
        self.openElements.append(element)
        return element

    def insertElementTable(self, token):
        """Create an element and insert it into the tree"""
        element = self.createElement(token)
        if self.openElements[-1].name not in tableInsertModeElements:
            return self.insertElementNormal(token)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            if insertBefore is None:
                parent.appendChild(element)
            else:
                parent.insertBefore(element, insertBefore)
            self.openElements.append(element)
        return element

    def insertText(self, data, parent=None):
        """Insert text data."""
        if parent is None:
            parent = self.openElements[-1]

        if (not self.insertFromTable or (self.insertFromTable and
                                         self.openElements[-1].name
                                         not in tableInsertModeElements)):
            parent.insertText(data)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            parent.insertText(data, insertBefore)

    def getTableMisnestedNodePosition(self):
        """Get the foster parent element, and sibling to insert before
        (or None) when inserting a misnested table node"""
        # The foster parent element is the one which comes before the most
        # recently opened table element
        # XXX - this is really inelegant
        lastTable = None
        fosterParent = None
        insertBefore = None
        for elm in self.openElements[::-1]:
            if elm.name == "table":
                lastTable = elm
                break
        if lastTable:
            # XXX - we should really check that this parent is actually a
            # node here
            if lastTable.parent:
                fosterParent = lastTable.parent
                insertBefore = lastTable
            else:
                fosterParent = self.openElements[
                    self.openElements.index(lastTable) - 1]
        else:
            fosterParent = self.openElements[0]
        return fosterParent, insertBefore

    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
                name != exclude):
            self.openElements.pop()
            # XXX This is not entirely what the specification says. We should
            # investigate it more closely.
            self.generateImpliedEndTags(exclude)

    def getDocument(self):
        """Return the final tree"""
        return self.document

    def getFragment(self):
        """Return the final fragment"""
        # assert self.innerHTML
        fragment = self.fragmentClass()
        self.openElements[0].reparentChildren(fragment)
        return fragment

    def testSerializer(self, node):
        """Serialize the subtree of node in the format required by unit tests

        :arg node: the node from which to start serializing

        """
        raise NotImplementedError
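ActiveFormattingElements.append() above implements the spec's "Noah's Ark" clause: a fourth entry equal to three already-listed ones evicts the earliest match. A tiny demonstration; FakeNode is hypothetical, just enough to satisfy nodesEqual():

from bleach._vendor.html5lib.treebuilders.base import ActiveFormattingElements


class FakeNode(object):
    # Hypothetical stand-in exposing only what nodesEqual() compares.
    def __init__(self):
        self.nameTuple = ("http://www.w3.org/1999/xhtml", "b")
        self.attributes = {}


afe = ActiveFormattingElements()
for _ in range(4):
    afe.append(FakeNode())
print(len(afe))  # 3 -- the oldest of the three equal entries was dropped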
239 lib/bleach/_vendor/html5lib/treebuilders/dom.py Normal file
@@ -0,0 +1,239 @@
from __future__ import absolute_import, division, unicode_literals


try:
    from collections.abc import MutableMapping
except ImportError:  # Python 2.7
    from collections import MutableMapping
from xml.dom import minidom, Node
import weakref

from . import base
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory


def getDomBuilder(DomImplementation):
    Dom = DomImplementation

    class AttrList(MutableMapping):
        def __init__(self, element):
            self.element = element

        def __iter__(self):
            return iter(self.element.attributes.keys())

        def __setitem__(self, name, value):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                attr = self.element.ownerDocument.createAttribute(name)
                attr.value = value
                self.element.attributes[name] = attr

        def __len__(self):
            return len(self.element.attributes)

        def items(self):
            return list(self.element.attributes.items())

        def values(self):
            return list(self.element.attributes.values())

        def __getitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                return self.element.attributes[name].value

        def __delitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                del self.element.attributes[name]

    class NodeBuilder(base.Node):
        def __init__(self, element):
            base.Node.__init__(self, element.nodeName)
            self.element = element

        namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
                             self.element.namespaceURI or None)

        def appendChild(self, node):
            node.parent = self
            self.element.appendChild(node.element)

        def insertText(self, data, insertBefore=None):
            text = self.element.ownerDocument.createTextNode(data)
            if insertBefore:
                self.element.insertBefore(text, insertBefore.element)
            else:
                self.element.appendChild(text)

        def insertBefore(self, node, refNode):
            self.element.insertBefore(node.element, refNode.element)
            node.parent = self

        def removeChild(self, node):
            if node.element.parentNode == self.element:
                self.element.removeChild(node.element)
            node.parent = None

        def reparentChildren(self, newParent):
            while self.element.hasChildNodes():
                child = self.element.firstChild
                self.element.removeChild(child)
                newParent.element.appendChild(child)
            self.childNodes = []

        def getAttributes(self):
            return AttrList(self.element)

        def setAttributes(self, attributes):
            if attributes:
                for name, value in list(attributes.items()):
                    if isinstance(name, tuple):
                        if name[0] is not None:
                            qualifiedName = (name[0] + ":" + name[1])
                        else:
                            qualifiedName = name[1]
                        self.element.setAttributeNS(name[2], qualifiedName,
                                                    value)
                    else:
                        self.element.setAttribute(
                            name, value)
        attributes = property(getAttributes, setAttributes)

        def cloneNode(self):
            return NodeBuilder(self.element.cloneNode(False))

        def hasContent(self):
            return self.element.hasChildNodes()

        def getNameTuple(self):
            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name

        nameTuple = property(getNameTuple)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        def documentClass(self):
            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
            return weakref.proxy(self)

        def insertDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]

            domimpl = Dom.getDOMImplementation()
            doctype = domimpl.createDocumentType(name, publicId, systemId)
            self.document.appendChild(NodeBuilder(doctype))
            if Dom == minidom:
                doctype.ownerDocument = self.dom

        def elementClass(self, name, namespace=None):
            if namespace is None and self.defaultNamespace is None:
                node = self.dom.createElement(name)
            else:
                node = self.dom.createElementNS(namespace, name)

            return NodeBuilder(node)

        def commentClass(self, data):
            return NodeBuilder(self.dom.createComment(data))

        def fragmentClass(self):
            return NodeBuilder(self.dom.createDocumentFragment())

        def appendChild(self, node):
            self.dom.appendChild(node.element)

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            return self.dom

        def getFragment(self):
            return base.TreeBuilder.getFragment(self).element

        def insertText(self, data, parent=None):
            if parent != self:
                base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
                    # pylint:disable=protected-access
                    if Node.TEXT_NODE not in self.dom._child_node_types:
                        self.dom._child_node_types = list(self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))

        implementation = DomImplementation
        name = None

    def testSerializer(element):
        element.normalize()
        rv = []

        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif element.nodeType == Node.DOCUMENT_NODE:
                rv.append("#document")
            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
                rv.append("#document-fragment")
            elif element.nodeType == Node.COMMENT_NODE:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
            elif element.nodeType == Node.TEXT_NODE:
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
                        element.namespaceURI is not None):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
                    attributes = []
                    for i in range(len(element.attributes)):
                        attr = element.attributes.item(i)
                        name = attr.nodeName
                        value = attr.value
                        ns = attr.namespaceURI
                        if ns:
                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
                        else:
                            name = attr.nodeName
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
                serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)

    return locals()


# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)
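End to end, the dom builder produces a standard minidom document; a quick sketch (vendored import path assumed):

from bleach._vendor import html5lib

# Parse straight into xml.dom.minidom via the "dom" treebuilder.
document = html5lib.parse("<p class='x'>Hi!</p>", treebuilder="dom")
print(document.toxml())  # the full minidom API is available on the result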
343 lib/bleach/_vendor/html5lib/treebuilders/etree.py Normal file
@@ -0,0 +1,343 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

from six import text_type

import re

from copy import copy

from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class Element(base.Node):
        def __init__(self, name, namespace=None):
            self._name = name
            self._namespace = namespace
            self._element = ElementTree.Element(self._getETreeTag(name,
                                                                  namespace))
            if namespace is None:
                self.nameTuple = namespaces["html"], self._name
            else:
                self.nameTuple = self._namespace, self._name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getETreeTag(self, name, namespace):
            if namespace is None:
                etree_tag = name
            else:
                etree_tag = "{%s}%s" % (namespace, name)
            return etree_tag

        def _setName(self, name):
            self._name = name
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _setNamespace(self, namespace):
            self._namespace = namespace
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getNamespace(self):
            return self._namespace

        namespace = property(_getNamespace, _setNamespace)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            el_attrib = self._element.attrib
            el_attrib.clear()
            if attributes:
                # calling .items _always_ allocates, and the above truthy check is cheaper than the
                # allocation on average
                for key, value in attributes.items():
                    if isinstance(key, tuple):
                        name = "{%s}%s" % (key[2], key[1])
                    else:
                        name = key
                    el_attrib[name] = value

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes

        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
            for element in value:
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or len(self._element))

        def appendChild(self, node):
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self

        def removeChild(self, node):
            self._childNodes.remove(node)
            self._element.remove(node._element)
            node.parent = None

        def insertText(self, data, insertBefore=None):
            if not len(self._element):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
            elif insertBefore is None:
                # Insert the text as the tail of the last child element
                if not self._element[-1].tail:
                    self._element[-1].tail = ""
                self._element[-1].tail += data
            else:
                # Insert the text before the specified node
                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index - 1].tail:
                        self._element[index - 1].tail = ""
                    self._element[index - 1].tail += data
                else:
                    if not self._element.text:
                        self._element.text = ""
                    self._element.text += data

        def cloneNode(self):
            element = type(self)(self.name, self.namespace)
            if self._element.attrib:
                element._element.attrib = copy(self._element.attrib)
            return element

        def reparentChildren(self, newParent):
            if newParent.childNodes:
                newParent.childNodes[-1]._element.tail += self._element.text
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if self._element.text is not None:
                    newParent._element.text += self._element.text
            self._element.text = ""
            base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        def __init__(self, data):
            # Use the superclass constructor to set all properties on the
            # wrapper element
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        def __init__(self, name, publicId, systemId):
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name
            self.publicId = publicId
            self.systemId = systemId

        def _getPublicId(self):
            return self._element.get("publicId", "")

        def _setPublicId(self, value):
            if value is not None:
                self._element.set("publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get("systemId", "")

        def _setSystemId(self, value):
            if value is not None:
                self._element.set("systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_ROOT")

    class DocumentFragment(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_FRAGMENT")

    def testSerializer(element):
        rv = []

        def serializeElement(element, indent=0):
            if not hasattr(element, "tag"):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                rv.append("#document")
                if element.text is not None:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")
            elif element.tag == ElementTreeCommentType:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert isinstance(element.tag, text_type), \
                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
                nsmatch = tag_regexp.match(element.tag)

                if nsmatch is None:
                    name = element.tag
                else:
                    ns, name = nsmatch.groups()
                    prefix = constants.prefixes[ns]
                    name = "%s %s" % (prefix, name)
                rv.append("|%s<%s>" % (' ' * indent, name))

                if hasattr(element, "attrib"):
                    attributes = []
                    for name, value in element.attrib.items():
                        nsmatch = tag_regexp.match(name)
                        if nsmatch is not None:
                            ns, name = nsmatch.groups()
                            prefix = constants.prefixes[ns]
                            attr_string = "%s %s" % (prefix, name)
                        else:
                            attr_string = name
                        attributes.append((attr_string, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
        serializeElement(element, 0)

        return "\n".join(rv)

    def tostring(element):  # pylint:disable=unused-variable
        """Serialize an element and its child nodes to a string"""
        rv = []
        filter = _ihatexml.InfosetFilter()

        def serializeElement(element):
            if isinstance(element, ElementTree.ElementTree):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                if element.text is not None:
                    rv.append(element.text)
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")

                for child in element:
                    serializeElement(child)

            elif element.tag == ElementTreeCommentType:
                rv.append("<!--%s-->" % (element.text,))
            else:
                # This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>" % (filter.fromXmlName(element.tag),))
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        filter.fromXmlName(name), value)
                        for name, value in element.attrib.items()])
                    rv.append("<%s %s>" % (element.tag, attr))
                if element.text:
                    rv.append(element.text)

                for child in element:
                    serializeElement(child)

                rv.append("</%s>" % (element.tag,))

            if element.tail:
                rv.append(element.tail)

        serializeElement(element)

        return "".join(rv)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment
        implementation = ElementTreeImplementation
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
if fullTree:
|
||||||
|
return self.document._element
|
||||||
|
else:
|
||||||
|
if self.defaultNamespace is not None:
|
||||||
|
return self.document._element.find(
|
||||||
|
"{%s}html" % self.defaultNamespace)
|
||||||
|
else:
|
||||||
|
return self.document._element.find("html")
|
||||||
|
|
||||||
|
def getFragment(self):
|
||||||
|
return base.TreeBuilder.getFragment(self)._element
|
||||||
|
|
||||||
|
return locals()
|
||||||
|
|
||||||
|
|
||||||
|
getETreeModule = moduleFactoryFactory(getETreeBuilder)
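For orientation, a minimal sketch of how this tree builder is normally selected, assuming the standalone html5lib 1.1 API; in this repository the same modules are vendored under bleach._vendor.html5lib.

```python
import html5lib

# treebuilder="etree" routes through the TreeBuilder assembled by
# getETreeBuilder() above.
document = html5lib.parse("<p>Hello <b>world</b></p>", treebuilder="etree")
print(document)  # an xml.etree.ElementTree element for the parsed document
```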
392 lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py Normal file
@@ -0,0 +1,392 @@
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:

Text or comments as siblings of the root element
Doctypes with no name

When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

import warnings
import re
import sys

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml

import lxml.etree as etree
from six import PY3, binary_type


fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")

comment_type = etree.Comment("asd").tag


class DocumentType(object):
    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId


class Document(object):
    def __init__(self):
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        last = self._elementTree.getroot()
        for last in self._elementTree.getroot().itersiblings():
            pass

        last.addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)


def testSerializer(element):
    rv = []
    infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info[0] == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
    serializeElement(element, 0)

    return "\n".join(rv)


def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)


class TreeBuilder(base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
        self.namespaceHTMLElements = namespaceHTMLElements

        class Attributes(MutableMapping):
            def __init__(self, element):
                self._element = element

            def _coerceKey(self, key):
                if isinstance(key, tuple):
                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
                else:
                    name = infosetFilter.coerceAttribute(key)
                return name

            def __getitem__(self, key):
                value = self._element._element.attrib[self._coerceKey(key)]
                if not PY3 and isinstance(value, binary_type):
                    value = value.decode("ascii")
                return value

            def __setitem__(self, key, value):
                self._element._element.attrib[self._coerceKey(key)] = value

            def __delitem__(self, key):
                del self._element._element.attrib[self._coerceKey(key)]

            def __iter__(self):
                return iter(self._element._element.attrib)

            def __len__(self):
                return len(self._element._element.attrib)

            def clear(self):
                return self._element._element.attrib.clear()

        class Element(builder.Element):
            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, value):
                attributes = self.attributes
                attributes.clear()
                attributes.update(value)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def cloneNode(self):
                element = type(self)(self.name, self.namespace)
                if self._element.attrib:
                    element._element.attrib.update(self._element.attrib)
                return element

        class Comment(builder.Comment):
            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        assert parent is None or parent is self.document
        assert self.document._elementTree is None
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments:
        for comment_token in self.initial_comments:
            comment = self.commentClass(comment_token["data"])
            root.addprevious(comment._element)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain
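A minimal sketch of the data-loss behavior described in the module docstring, assuming the standalone html5lib package plus an installed lxml; a nameless doctype is one of the trees lxml cannot represent, so a DataLossWarning should be emitted:

```python
import warnings
import html5lib

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # treebuilder="lxml" selects the TreeBuilder defined above
    html5lib.parse("<!DOCTYPE><p>hi</p>", treebuilder="lxml")

# expect something like "lxml cannot represent empty doctype"
print([str(w.message) for w in caught])
```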
154 lib/bleach/_vendor/html5lib/treewalkers/__init__.py Normal file
@@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""

from __future__ import absolute_import, division, unicode_literals

from .. import constants
from .._utils import default_etree

__all__ = ["getTreeWalker", "pprint"]

treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    :arg str treeType: the name of the tree type required (case-insensitive).
        Supported values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: A module implementing the tree type e.g.
        xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
        tree type only).

    :arg kwargs: keyword arguments passed to the etree walker--for other
        walkers, this has no effect

    :returns: a TreeWalker class

    """
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType == "dom":
            from . import dom
            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
            from . import genshi
            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
            from . import etree_lxml
            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)


def concatenateCharacterTokens(tokens):
    pendingCharacters = []
    for token in tokens:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
        else:
            if pendingCharacters:
                yield {"type": "Characters", "data": "".join(pendingCharacters)}
                pendingCharacters = []
            yield token
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        type = token["type"]
        if type in ("StartTag", "EmptyTag"):
            # tag name
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items()):
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing
            if type == "EmptyTag":
                indent -= 2

        elif type == "EndTag":
            indent -= 2

        elif type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif type == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif type == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % type)

    return "\n".join(output)
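As a quick orientation to the walker API documented above, a minimal sketch assuming the standalone html5lib package; the vendored copy is imported as bleach._vendor.html5lib instead:

```python
import html5lib
from html5lib.treewalkers import getTreeWalker, pprint

# parse() defaults to the etree tree builder
dom = html5lib.parse("<p class='greeting'>Hello <em>world</em></p>")
TreeWalker = getTreeWalker("etree")  # returns a class, not an instance
# pprint() renders the StartTag/Characters/EndTag token stream, indented
print(pprint(TreeWalker(dom)))
```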
252 lib/bleach/_vendor/html5lib/treewalkers/base.py Normal file
@@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

spaceCharacters = "".join(spaceCharacters)


class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializationError because
            this tag shouldn't have children

        :returns: EmptyTag token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text(' '))
            [{u'data': ' ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        data = data
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name:

        :arg publicId:

        :arg systemId:

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types"""
        return self.error("Unknown node type: " + nodeType)


class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == ENTITY:
                yield self.entity(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)
43 lib/bleach/_vendor/html5lib/treewalkers/dom.py Normal file
@@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node

from . import base


class TreeWalker(base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                if attr.namespaceURI:
                    attrs[(attr.namespaceURI, attr.localName)] = attr.value
                else:
                    attrs[(None, attr.name)] = attr.value
            return (base.ELEMENT, node.namespaceURI, node.nodeName,
                    attrs, node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (base.DOCUMENT,)

        else:
            return base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
131 lib/bleach/_vendor/html5lib/treewalkers/etree.py Normal file
@@ -0,0 +1,131 @@
from __future__ import absolute_import, division, unicode_literals

from collections import OrderedDict
import re

from six import string_types

from . import base
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element

        2. The index of the element relative to its parent

        3. A stack of ancestor elements

        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple):  # It might be the root Element
                elt, _, _, flag = node
                if flag in ("text", "tail"):
                    return base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
                return (base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))

            elif node.tag == ElementTreeCommentType:
                return base.COMMENT, node.text

            else:
                assert isinstance(node.tag, string_types), type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                attrs = OrderedDict()
                for name, value in list(node.attrib.items()):
                    match = tag_regexp.match(name)
                    if match:
                        attrs[(match.group(1), match.group(2))] = value
                    else:
                        attrs[(None, name)] = value
                return (base.ELEMENT, namespace, tag,
                        attrs, len(node) or node.text)

        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                return None
            else:
                if element.text:
                    return element, key, parents, "text"
                elif len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None

        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                if element.tail and flag != "tail":
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key + 1], key + 1, parents, None
                else:
                    return None

        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    return parent
                else:
                    assert list(parents[-1]).count(parent) == 1
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()


getETreeModule = moduleFactoryFactory(getETreeBuilder)
215 lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py Normal file
@@ -0,0 +1,215 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from collections import OrderedDict

from lxml import etree
from ..treebuilders.etree import tag_regexp

from . import base

from .. import _ihatexml


def ensure_str(s):
    if s is None:
        return None
    elif isinstance(s, text_type):
        return s
    else:
        return s.decode("ascii", "strict")


class Root(object):
    def __init__(self, et):
        self.elementtree = et
        self.children = []

        try:
            if et.docinfo.internalDTD:
                self.children.append(Doctype(self,
                                             ensure_str(et.docinfo.root_name),
                                             ensure_str(et.docinfo.public_id),
                                             ensure_str(et.docinfo.system_url)))
        except AttributeError:
            pass

        try:
            node = et.getroot()
        except AttributeError:
            node = et

        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()

        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        return None

    def __len__(self):
        return 1


class Doctype(object):
    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name = name
        self.public_id = public_id
        self.system_id = system_id

        self.text = None
        self.tail = None

    def getnext(self):
        return self.root_node.children[1]


class FragmentRoot(Root):
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
        self.text = self.tail = None

    def getnext(self):
        return None


class FragmentWrapper(object):
    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
            self.text = ensure_str(self.obj.text)
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
            self.tail = ensure_str(self.obj.tail)
        else:
            self.tail = None

    def __getattr__(self, name):
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        else:
            return None

    def __getitem__(self, key):
        return self.obj[key]

    def __bool__(self):
        return bool(self.obj)

    def getparent(self):
        return None

    def __str__(self):
        return str(self.obj)

    def __unicode__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)


class TreeWalker(base.NonRecursiveTreeWalker):
    def __init__(self, tree):
        # pylint:disable=redefined-variable-type
        if isinstance(tree, list):
            self.fragmentChildren = set(tree)
            tree = FragmentRoot(tree)
        else:
            self.fragmentChildren = set()
            tree = Root(tree)
        base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = _ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            return base.TEXT, ensure_str(getattr(node, key))

        elif isinstance(node, Root):
            return (base.DOCUMENT,)

        elif isinstance(node, Doctype):
            return base.DOCTYPE, node.name, node.public_id, node.system_id

        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
            return base.TEXT, ensure_str(node.obj)

        elif node.tag == etree.Comment:
            return base.COMMENT, ensure_str(node.text)

        elif node.tag == etree.Entity:
            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

        else:
            # This is assumed to be an ordinary element
            match = tag_regexp.match(ensure_str(node.tag))
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = ensure_str(node.tag)
            attrs = OrderedDict()
            for name, value in list(node.attrib.items()):
                name = ensure_str(name)
                value = ensure_str(value)
                match = tag_regexp.match(name)
                if match:
                    attrs[(match.group(1), match.group(2))] = value
                else:
                    attrs[(None, name)] = value
            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), "Text nodes have no children"

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else:  # tail
                return node.getnext()

        return (node, "tail") if node.tail else node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                return node
            # else: fallback to "normal" processing
        elif node in self.fragmentChildren:
            return None

        return node.getparent()
69 lib/bleach/_vendor/html5lib/treewalkers/genshi.py Normal file
@@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT

from . import base

from ..constants import voidElements, namespaces


class TreeWalker(base.TreeWalker):
    def __iter__(self):
        # Buffer the events so we can pass in the following one
        previous = None
        for event in self.tree:
            if previous is not None:
                for token in self.tokens(previous, event):
                    yield token
            previous = event

        # Don't forget the final event!
        if previous is not None:
            for token in self.tokens(previous, None):
                yield token

    def tokens(self, event, next):
        kind, data, _ = event
        if kind == START:
            tag, attribs = data
            name = tag.localname
            namespace = tag.namespace
            converted_attribs = {}
            for k, v in attribs:
                if isinstance(k, QName):
                    converted_attribs[(k.namespace, k.localname)] = v
                else:
                    converted_attribs[(None, k)] = v

            if namespace == namespaces["html"] and name in voidElements:
                for token in self.emptyTag(namespace, name, converted_attribs,
                                           not next or next[0] != END or
                                           next[1] != tag):
                    yield token
            else:
                yield self.startTag(namespace, name, converted_attribs)

        elif kind == END:
            name = data.localname
            namespace = data.namespace
            if namespace != namespaces["html"] or name not in voidElements:
                yield self.endTag(namespace, name)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(*data)

        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
                      START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
1078 lib/bleach/_vendor/parse.py Normal file
File diff suppressed because it is too large
3 lib/bleach/_vendor/vendor.txt Normal file
@@ -0,0 +1,3 @@
html5lib==1.1 \
    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
14 lib/bleach/_vendor/vendor_install.sh Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

set -e
set -u
set -o pipefail

BLEACH_VENDOR_DIR=${BLEACH_VENDOR_DIR:-"."}
DEST=${DEST:-"."}

pip install --no-binary all --no-compile --no-deps -r "${BLEACH_VENDOR_DIR}/vendor.txt" --target "${DEST}"

# install Python 3.6.14 urllib.urlparse for #536
curl --proto '=https' --tlsv1.2 -o "${DEST}/parse.py" https://raw.githubusercontent.com/python/cpython/v3.6.14/Lib/urllib/parse.py
(cd "${DEST}" && sha256sum parse.py > parse.py.SHA256SUM)
@@ -1,20 +1,32 @@
 """A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
 
 
 def nofollow(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    href_key = (None, "href")
+
+    if href_key not in attrs:
+        return attrs
+
+    if attrs[href_key].startswith("mailto:"):
         return attrs
-    rel = [x for x in attrs.get('rel', '').split(' ') if x]
-    if 'nofollow' not in [x.lower() for x in rel]:
-        rel.append('nofollow')
-    attrs['rel'] = ' '.join(rel)
+
+    rel_key = (None, "rel")
+    rel_values = [val for val in attrs.get(rel_key, "").split(" ") if val]
+    if "nofollow" not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append("nofollow")
+    attrs[rel_key] = " ".join(rel_values)
+
     return attrs
 
 
 def target_blank(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
+    href_key = (None, "href")
+
+    if href_key not in attrs:
         return attrs
-    attrs['target'] = '_blank'
+
+    if attrs[href_key].startswith("mailto:"):
+        return attrs
+
+    attrs[(None, "target")] = "_blank"
     return attrs
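The rewrite above tracks the attribute model of modern bleach, where linkify callbacks receive attributes keyed by (namespace, name) tuples rather than bare strings. A minimal sketch of the new callbacks in use, assuming the public bleach API:

```python
import bleach
from bleach.callbacks import nofollow, target_blank

# linkify() runs each callback over the attribute dict of every link it
# creates or touches; nofollow adds rel="nofollow" and target_blank adds
# target="_blank" (both skip mailto: links).
html = bleach.linkify("see https://example.com",
                      callbacks=[nofollow, target_blank])
print(html)
# roughly: see <a href="https://example.com" rel="nofollow"
#               target="_blank">https://example.com</a>
```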
@@ -1,62 +0,0 @@
import datetime
from decimal import Decimal
import types
import six


def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).
    """
    return isinstance(obj, (
        six.integer_types +
        (types.NoneType,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal))
    )


def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                          errors) for arg in s])
    return s
665 lib/bleach/html5lib_shim.py Normal file
@@ -0,0 +1,665 @@
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if part:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
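# --- Editor's note (illustration; not part of the upstream diff) ---
# A quick sketch of the two entity helpers above, doctest-style:
#
#     >>> convert_entity("#38"), convert_entity("#x26"), convert_entity("amp")
#     ('&', '&', '&')
#     >>> convert_entity("notanentity") is None
#     True
#     >>> convert_entities("AT&amp;T &#169; 2021") == "AT&T \xa9 2021"
#     True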
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
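# --- Editor's note (illustration; not part of the upstream diff) ---
# How the two helpers above cooperate, doctest-style: next_possible_entity()
# splits text at every "&", and match_entity() decides whether a part names
# an unambiguous entity:
#
#     >>> list(next_possible_entity("a &amp; b & c"))
#     ['a ', '&amp; b ', '& c']
#     >>> match_entity("&amp; b")
#     'amp'
#     >>> match_entity("& c") is None
#     True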
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
574	lib/bleach/linkifier.py	Normal file
@@ -0,0 +1,574 @@
import re

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
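# --- Editor's note (illustration; not part of the upstream diff) ---
# Sketch of building a custom url regex; the tld and protocol lists here are
# hypothetical:
#
#     >>> my_url_re = build_url_re(tlds=["dev", "io"], protocols=["https"])
#     >>> bool(my_url_re.search("see https://example.dev/docs"))
#     True
#     >>> bool(my_url_re.search("see http://example.com"))
#     False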
URL_RE = build_url_re()


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_url_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()


class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
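# --- Editor's note (illustration; not part of the upstream diff) ---
# Typical use of the Linker class above, with the default nofollow callback:
#
#     >>> Linker().linkify("see example.com")
#     'see <a href="http://example.com" rel="nofollow">example.com</a>'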
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
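    # --- Editor's note (illustration; not part of the upstream diff) ---
    # Sketch of the callback contract that apply_callbacks() enforces: each
    # callback takes (attrs, is_new) and returns an adjusted attrs dict, or
    # None to drop the link. These two callbacks are hypothetical:
    #
    #     def force_title(attrs, new=False):
    #         attrs[(None, "title")] = "external link"
    #         return attrs
    #
    #     def drop_new_links(attrs, new=False):
    #         return None if new else attrs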
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            # "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
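    # --- Editor's note (illustration; not part of the upstream diff) ---
    # strip_non_url_bits() peels balanced parens and trailing punctuation off
    # an over-eager regex match, doctest-style:
    #
    #     >>> LinkifyFilter(source=iter(())).strip_non_url_bits("(http://example.com),")
    #     ('http://example.com', '(', '),')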
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens, if parse_email=True and if a mail was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
@@ -1,148 +1,645 @@
-from __future__ import unicode_literals
+from itertools import chain
 import re
-from xml.sax.saxutils import escape, unescape
+import warnings
 
-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
+from bleach._vendor.parse import urlparse
+from xml.sax.saxutils import unescape
+
+from bleach import html5lib_shim
+from bleach.utils import alphabetize_attributes
 
 
-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
+#: List of allowed tags
+ALLOWED_TAGS = [
+    "a",
+    "abbr",
+    "acronym",
+    "b",
+    "blockquote",
+    "code",
+    "em",
+    "i",
+    "li",
+    "ol",
+    "strong",
+    "ul",
+]
 
 
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
+#: Map of allowed attributes by tag
+ALLOWED_ATTRIBUTES = {
+    "a": ["href", "title"],
+    "abbr": ["title"],
+    "acronym": ["title"],
+}
 
-    allowed_svg_properties = []
+#: List of allowed styles
+ALLOWED_STYLES = []
+
+#: List of allowed protocols
+ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+
+#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
+INVISIBLE_CHARACTERS = "".join(
+    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
+)
+
+#: Regexp for characters that are invisible
+INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
+
+#: String to replace invisible characters with. This can be a character, a
+#: string, or even a function that takes a Python re matchobj
+INVISIBLE_REPLACEMENT_CHAR = "?"
+
+
+class Cleaner:
+    """Cleaner for cleaning HTML fragments of malicious content
+
+    This cleaner is a security-focused function whose sole purpose is to remove
+    malicious content from a string such that it can be displayed as content in
+    a web page.
+
+    To use::
+
+        from bleach.sanitizer import Cleaner
+
+        cleaner = Cleaner()
+
+        for text in all_the_yucky_things:
+            sanitized = cleaner.clean(text)
+
+    .. Note::
+
+       This cleaner is not designed to use to transform content to be used in
+       non-web-page contexts.
+
+    .. Warning::
+
+       This cleaner is not thread-safe--the html parser has internal state.
+       Create a separate cleaner per thread!
+
+
+    """
+
+    def __init__(
+        self,
+        tags=ALLOWED_TAGS,
+        attributes=ALLOWED_ATTRIBUTES,
+        styles=ALLOWED_STYLES,
+        protocols=ALLOWED_PROTOCOLS,
+        strip=False,
+        strip_comments=True,
+        filters=None,
+    ):
+        """Initializes a Cleaner
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip: whether or not to strip disallowed elements
+
+        :arg bool strip_comments: whether or not to strip HTML comments
+
+        :arg list filters: list of html5lib Filter classes to pass streamed content through
+
+            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+            .. Warning::
+
+               Using filters changes the output of ``bleach.Cleaner.clean``.
+               Make sure the way the filters change the output are secure.
+
+        """
+        self.tags = tags
+        self.attributes = attributes
+        self.styles = styles
+        self.protocols = protocols
+        self.strip = strip
+        self.strip_comments = strip_comments
+        self.filters = filters or []
+
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=self.tags,
+            strip=self.strip,
+            consume_entities=False,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            escape_lt_in_attrs=True,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
+            # Bleach has its own sanitizer, so don't use the html5lib one
+            sanitize=False,
+            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            alphabetical_attributes=False,
+        )
+
+    def clean(self, text):
+        """Cleans text and returns sanitized result as unicode
+
+        :arg str text: text to be cleaned
+
+        :returns: sanitized text as unicode
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, str):
+            message = (
+                "argument cannot be of '{name}' type, must be of text type".format(
+                    name=text.__class__.__name__
+                )
+            )
+            raise TypeError(message)
+
+        if not text:
+            return ""
+
+        dom = self.parser.parseFragment(text)
+        filtered = BleachSanitizerFilter(
+            source=self.walker(dom),
+            # Bleach-sanitizer-specific things
+            attributes=self.attributes,
+            strip_disallowed_elements=self.strip,
+            strip_html_comments=self.strip_comments,
+            # html5lib-sanitizer things
+            allowed_elements=self.tags,
+            allowed_css_properties=self.styles,
+            allowed_protocols=self.protocols,
+            allowed_svg_properties=[],
+        )
+
+        # Apply any filters after the BleachSanitizerFilter
+        for filter_class in self.filters:
+            filtered = filter_class(source=filtered)
+
+        return self.serializer.render(filtered)
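# --- Editor's note (illustration; not part of the upstream diff) ---
# Typical use of the Cleaner added above, with the default allow lists:
#
#     >>> cleaner = Cleaner()
#     >>> cleaner.clean('<script>alert("xss")</script><b>bold</b>')
#     '&lt;script&gt;alert("xss")&lt;/script&gt;<b>bold</b>'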
+
+
+def attribute_filter_factory(attributes):
+    """Generates attribute filter function for the given attributes value
+
+    The attributes value can take one of several shapes. This returns a filter
+    function appropriate to the attributes value. One nice thing about this is
+    that there's less if/then shenanigans in the ``allow_token`` method.
+
+    """
+    if callable(attributes):
+        return attributes
+
+    if isinstance(attributes, dict):
+
+        def _attr_filter(tag, attr, value):
+            if tag in attributes:
+                attr_val = attributes[tag]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                if attr in attr_val:
+                    return True
+
+            if "*" in attributes:
+                attr_val = attributes["*"]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                return attr in attr_val
+
+            return False
+
+        return _attr_filter
+
+    if isinstance(attributes, list):
+
+        def _attr_filter(tag, attr, value):
+            return attr in attributes
+
+        return _attr_filter
+
+    raise ValueError("attributes needs to be a callable, a list or a dict")
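# --- Editor's note (illustration; not part of the upstream diff) ---
# The factory above normalizes the three accepted attribute shapes into one
# predicate, doctest-style:
#
#     >>> allow = attribute_filter_factory({"a": ["href"], "*": ["title"]})
#     >>> allow("a", "href", "http://example.com")
#     True
#     >>> allow("a", "title", "x"), allow("img", "src", "x")
#     (True, False)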
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
|
||||||
|
"""html5lib Filter that sanitizes text
|
||||||
|
|
||||||
|
This filter can be used anywhere html5lib filters can be used.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
source,
|
||||||
|
attributes=ALLOWED_ATTRIBUTES,
|
||||||
|
strip_disallowed_elements=False,
|
||||||
|
strip_html_comments=True,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
"""Creates a BleachSanitizerFilter instance
|
||||||
|
|
||||||
|
:arg Treewalker source: stream
|
||||||
|
|
||||||
|
:arg list tags: allowed list of tags; defaults to
|
||||||
|
``bleach.sanitizer.ALLOWED_TAGS``
|
||||||
|
|
||||||
|
:arg dict attributes: allowed attributes; can be a callable, list or dict;
|
||||||
|
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
|
||||||
|
|
||||||
|
:arg list styles: allowed list of css styles; defaults to
|
||||||
|
``bleach.sanitizer.ALLOWED_STYLES``
|
||||||
|
|
||||||
|
:arg list protocols: allowed list of protocols for links; defaults
|
||||||
|
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
|
||||||
|
|
||||||
|
:arg bool strip_disallowed_elements: whether or not to strip disallowed
|
||||||
|
elements
|
||||||
|
|
||||||
|
:arg bool strip_html_comments: whether or not to strip HTML comments
|
||||||
|
|
||||||
|
"""
|
||||||
|
self.attr_filter = attribute_filter_factory(attributes)
|
||||||
|
self.strip_disallowed_elements = strip_disallowed_elements
|
||||||
|
self.strip_html_comments = strip_html_comments
|
||||||
|
|
||||||
|
# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore",
|
||||||
|
message="html5lib's sanitizer is deprecated",
|
||||||
|
category=DeprecationWarning,
|
||||||
|
module="bleach._vendor.html5lib",
|
||||||
|
)
|
||||||
|
return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
|
||||||
|
|
||||||
|
def sanitize_stream(self, token_iterator):
|
||||||
|
for token in token_iterator:
|
||||||
|
ret = self.sanitize_token(token)
|
||||||
|
|
||||||
|
if not ret:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if isinstance(ret, list):
|
||||||
|
for subtoken in ret:
|
||||||
|
yield subtoken
|
||||||
|
else:
|
||||||
|
yield ret
|
||||||
|
|
||||||
|
def merge_characters(self, token_iterator):
|
||||||
|
"""Merge consecutive Characters tokens in a stream"""
|
||||||
|
characters_buffer = []
|
||||||
|
|
||||||
|
for token in token_iterator:
|
||||||
|
if characters_buffer:
|
||||||
|
if token["type"] == "Characters":
|
||||||
|
characters_buffer.append(token)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Merge all the characters tokens together into one and then
|
||||||
|
# operate on it.
|
||||||
|
new_token = {
|
||||||
|
"data": "".join(
|
||||||
|
[char_token["data"] for char_token in characters_buffer]
|
||||||
|
),
|
||||||
|
"type": "Characters",
|
||||||
|
}
|
||||||
|
characters_buffer = []
|
||||||
|
yield new_token
|
||||||
|
|
||||||
|
elif token["type"] == "Characters":
|
||||||
|
characters_buffer.append(token)
|
||||||
|
continue
|
||||||
|
|
||||||
|
yield token
|
||||||
|
|
||||||
|
new_token = {
|
||||||
|
"data": "".join([char_token["data"] for char_token in characters_buffer]),
|
||||||
|
"type": "Characters",
|
||||||
|
}
|
||||||
|
yield new_token
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self.merge_characters(
|
||||||
|
self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
|
||||||
|
)
|
||||||
|
|
||||||
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, the attribute name,
        and the attribute value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # call xml.sax.saxutils to escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token
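    # A minimal illustrative sketch (not part of bleach): an allowed_attributes
    # dict of the shape described in the docstring above, mixing a list and a
    # callable. The allow_safe_href function is hypothetical.
    #
    #     >>> def allow_safe_href(tag, name, value):
    #     ...     return name == "href" and not value.startswith("javascript:")
    #     >>> attributes = {"a": allow_safe_href, "abbr": ["title"]}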
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens
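    # A minimal illustrative sketch (not part of bleach) of the re-tokenizing
    # described above: for a Characters token with data "x &amp; y &copy;",
    # sanitize_characters returns roughly
    #
    #     [{"type": "Characters", "data": "x "},
    #      {"type": "Characters", "data": "&"},
    #      {"type": "Characters", "data": " y "},
    #      {"type": "Entity", "name": "copy"}]
    #
    # with &amp; special-cased to a bare "&" as the comment above explains.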
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)

        # Remove REPLACEMENT characters
        new_value = new_value.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if "http" in allowed_protocols:
                return value

        return None
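    # A minimal illustrative sketch (not part of bleach), assuming the default
    # allowed protocols ["http", "https", "mailto"]:
    #
    #     sanitize_uri_value("https://example.com", ["http", "https", "mailto"])
    #         -> "https://example.com"
    #     sanitize_uri_value("jav&#x09;ascript:alert(1)", ["http", "https", "mailto"])
    #         -> None  (entities are converted and control characters stripped
    #                   first, so the scheme resolves to "javascript", which
    #                   isn't in the allowed list)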
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = alphabetize_attributes(attrs)

        return token
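    # A minimal illustrative sketch (not part of this module): allow_token is
    # what ultimately filters attributes when cleaning through the public
    # Cleaner API, e.g.
    #
    #     >>> from bleach.sanitizer import Cleaner
    #     >>> cleaner = Cleaner(tags=["a"], attributes={"a": ["href"]})
    #     >>> cleaner.clean('<a href="http://example.com" onclick="evil()">x</a>')
    #     '<a href="http://example.com">x</a>'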
    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
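    # A minimal illustrative sketch (not part of this module): with the default
    # strip=False, a disallowed tag is re-emitted by disallowed_token as escaped
    # text instead of being dropped, e.g.
    #
    #     >>> import bleach
    #     >>> bleach.clean("<script>alert(1)</script>", tags=["b"])
    #     '&lt;script&gt;alert(1)&lt;/script&gt;'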
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
[/:,#%!.\s\w]       # a non-newline character
|\w-\w              # 3 characters in the form \w-\w
|'[\s\w]+'\s*       # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+"          # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\)    # a parenthesized string of one or more digits, commas, periods, ...
)*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)
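    # A minimal illustrative sketch (not part of bleach), assuming
    # allowed_css_properties includes only "color": url() values are dropped
    # up front and disallowed properties are filtered out, so
    #
    #     "color: red; background: url(evil.png); font-size: 20px"
    #
    # sanitizes to "color: red;".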
lib/bleach/utils.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from collections import OrderedDict


def _attr_key(attr):
    """Returns appropriate key for sorting attribute names

    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
    ``None`` or a string. These can't be compared in Python 3, so we convert the
    ``None`` to an empty string.

    """
    key = (attr[0][0] or ""), attr[0][1]
    return key


def alphabetize_attributes(attrs):
    """Takes a dict of attributes (or None) and returns them alphabetized"""
    if not attrs:
        return attrs

    return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])
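# A minimal illustrative sketch (not part of bleach): attributes are keyed by
# (namespace, name) tuples, and None namespaces sort as empty strings, so
#
#     >>> alphabetize_attributes({(None, "title"): "t", (None, "href"): "h"})
#     OrderedDict([((None, 'href'), 'h'), ((None, 'title'), 't')])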