mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 05:01:14 -07:00
Update bleach-4.1.0
This commit is contained in:
parent
4086529906
commit
a4130d6c56
51 changed files with 17071 additions and 568 deletions
|
@ -1,401 +1,131 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import logging
|
||||
import re
|
||||
import packaging.version
|
||||
|
||||
import html5lib
|
||||
from html5lib.sanitizer import HTMLSanitizer
|
||||
from html5lib.serializer.htmlserializer import HTMLSerializer
|
||||
|
||||
from . import callbacks as linkify_callbacks
|
||||
from .encoding import force_unicode
|
||||
from .sanitizer import BleachSanitizer
|
||||
from bleach.linkifier import (
|
||||
DEFAULT_CALLBACKS,
|
||||
Linker,
|
||||
)
|
||||
from bleach.sanitizer import (
|
||||
ALLOWED_ATTRIBUTES,
|
||||
ALLOWED_PROTOCOLS,
|
||||
ALLOWED_STYLES,
|
||||
ALLOWED_TAGS,
|
||||
Cleaner,
|
||||
)
|
||||
|
||||
|
||||
VERSION = (1, 4, 2)
|
||||
__version__ = '.'.join([str(n) for n in VERSION])
|
||||
|
||||
__all__ = ['clean', 'linkify']
|
||||
|
||||
log = logging.getLogger('bleach')
|
||||
|
||||
ALLOWED_TAGS = [
|
||||
'a',
|
||||
'abbr',
|
||||
'acronym',
|
||||
'b',
|
||||
'blockquote',
|
||||
'code',
|
||||
'em',
|
||||
'i',
|
||||
'li',
|
||||
'ol',
|
||||
'strong',
|
||||
'ul',
|
||||
]
|
||||
|
||||
ALLOWED_ATTRIBUTES = {
|
||||
'a': ['href', 'title'],
|
||||
'abbr': ['title'],
|
||||
'acronym': ['title'],
|
||||
}
|
||||
|
||||
ALLOWED_STYLES = []
|
||||
|
||||
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
|
||||
|
||||
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
|
||||
ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
|
||||
cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
|
||||
dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
|
||||
gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
|
||||
im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
|
||||
kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
|
||||
ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
|
||||
net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
|
||||
pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
|
||||
sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
|
||||
tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
|
||||
xn xxx ye yt yu za zm zw""".split()
|
||||
|
||||
# Make sure that .com doesn't get matched by .co first
|
||||
TLDS.reverse()
|
||||
|
||||
PROTOCOLS = HTMLSanitizer.acceptable_protocols
|
||||
|
||||
url_re = re.compile(
|
||||
r"""\(* # Match any opening parentheses.
|
||||
\b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
|
||||
([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
||||
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
|
||||
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
||||
# except for # and ~, which happen in practice)
|
||||
""".format('|'.join(PROTOCOLS), '|'.join(TLDS)),
|
||||
re.IGNORECASE | re.VERBOSE | re.UNICODE)
|
||||
|
||||
proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
|
||||
|
||||
punct_re = re.compile(r'([\.,]+)$')
|
||||
|
||||
email_re = re.compile(
|
||||
r"""(?<!//)
|
||||
(([-!#$%&'*+/=?^_`{0!s}|~0-9A-Z]+
|
||||
(\.[-!#$%&'*+/=?^_`{1!s}|~0-9A-Z]+)* # dot-atom
|
||||
|^"([\001-\010\013\014\016-\037!#-\[\]-\177]
|
||||
|\\[\001-011\013\014\016-\177])*" # quoted-string
|
||||
)@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})\.? # domain
|
||||
""",
|
||||
re.IGNORECASE | re.MULTILINE | re.VERBOSE)
|
||||
|
||||
NODE_TEXT = 4 # The numeric ID of a text node in simpletree.
|
||||
|
||||
ETREE_TAG = lambda x: "".join(['{http://www.w3.org/1999/xhtml}', x])
|
||||
# a simple routine that returns the tag name with the namespace prefix
|
||||
# as returned by etree's Element.tag attribute
|
||||
|
||||
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
|
||||
# yyyymmdd
|
||||
__releasedate__ = "20210825"
|
||||
# x.y.z or x.y.z.dev0 -- semver
|
||||
__version__ = "4.1.0"
|
||||
VERSION = packaging.version.Version(__version__)
|
||||
|
||||
|
||||
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
|
||||
styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
|
||||
strip_comments=True):
|
||||
"""Clean an HTML fragment and return it
|
||||
__all__ = ["clean", "linkify"]
|
||||
|
||||
:arg text: the text to clean
|
||||
:arg tags: whitelist of allowed tags; defaults to
|
||||
``bleach.ALLOWED_TAGS``
|
||||
:arg attributes: whitelist of allowed attributes; defaults to
|
||||
``bleach.ALLOWED_ATTRIBUTES``
|
||||
:arg styles: whitelist of allowed css; defaults to
|
||||
``bleach.ALLOWED_STYLES``
|
||||
:arg protocols: whitelist of allowed protocols for links; defaults
|
||||
to ``bleach.ALLOWED_PROTOCOLS``
|
||||
:arg strip: whether or not to strip disallowed elements
|
||||
:arg strip_comments: whether or not to strip HTML comments
|
||||
|
||||
def clean(
|
||||
text,
|
||||
tags=ALLOWED_TAGS,
|
||||
attributes=ALLOWED_ATTRIBUTES,
|
||||
styles=ALLOWED_STYLES,
|
||||
protocols=ALLOWED_PROTOCOLS,
|
||||
strip=False,
|
||||
strip_comments=True,
|
||||
):
|
||||
"""Clean an HTML fragment of malicious content and return it
|
||||
|
||||
This function is a security-focused function whose sole purpose is to
|
||||
remove malicious content from a string such that it can be displayed as
|
||||
content in a web page.
|
||||
|
||||
This function is not designed to use to transform content to be used in
|
||||
non-web-page contexts.
|
||||
|
||||
Example::
|
||||
|
||||
import bleach
|
||||
|
||||
better_text = bleach.clean(yucky_text)
|
||||
|
||||
|
||||
.. Note::
|
||||
|
||||
If you're cleaning a lot of text and passing the same argument values or
|
||||
you want more configurability, consider using a
|
||||
:py:class:`bleach.sanitizer.Cleaner` instance.
|
||||
|
||||
:arg str text: the text to clean
|
||||
|
||||
:arg list tags: allowed list of tags; defaults to
|
||||
``bleach.sanitizer.ALLOWED_TAGS``
|
||||
|
||||
:arg dict attributes: allowed attributes; can be a callable, list or dict;
|
||||
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
|
||||
|
||||
:arg list styles: allowed list of css styles; defaults to
|
||||
``bleach.sanitizer.ALLOWED_STYLES``
|
||||
|
||||
:arg list protocols: allowed list of protocols for links; defaults
|
||||
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
|
||||
|
||||
:arg bool strip: whether or not to strip disallowed elements
|
||||
|
||||
:arg bool strip_comments: whether or not to strip HTML comments
|
||||
|
||||
:returns: cleaned text as unicode
|
||||
|
||||
"""
|
||||
if not text:
|
||||
return ''
|
||||
|
||||
text = force_unicode(text)
|
||||
|
||||
class s(BleachSanitizer):
|
||||
allowed_elements = tags
|
||||
allowed_attributes = attributes
|
||||
allowed_css_properties = styles
|
||||
allowed_protocols = protocols
|
||||
strip_disallowed_elements = strip
|
||||
strip_html_comments = strip_comments
|
||||
|
||||
parser = html5lib.HTMLParser(tokenizer=s)
|
||||
|
||||
return _render(parser.parseFragment(text))
|
||||
cleaner = Cleaner(
|
||||
tags=tags,
|
||||
attributes=attributes,
|
||||
styles=styles,
|
||||
protocols=protocols,
|
||||
strip=strip,
|
||||
strip_comments=strip_comments,
|
||||
)
|
||||
return cleaner.clean(text)
|
||||
|
||||
|
||||
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
|
||||
parse_email=False, tokenizer=HTMLSanitizer):
|
||||
"""Convert URL-like strings in an HTML fragment to links.
|
||||
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
|
||||
"""Convert URL-like strings in an HTML fragment to links
|
||||
|
||||
This function converts strings that look like URLs, domain names and email
|
||||
addresses in text that may be an HTML fragment to links, while preserving:
|
||||
|
||||
1. links already in the string
|
||||
2. urls found in attributes
|
||||
3. email addresses
|
||||
|
||||
linkify does a best-effort approach and tries to recover from bad
|
||||
situations due to crazy text.
|
||||
|
||||
.. Note::
|
||||
|
||||
If you're linking a lot of text and passing the same argument values or
|
||||
you want more configurability, consider using a
|
||||
:py:class:`bleach.linkifier.Linker` instance.
|
||||
|
||||
.. Note::
|
||||
|
||||
If you have text that you want to clean and then linkify, consider using
|
||||
the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
|
||||
pass. That way you're not parsing the HTML twice.
|
||||
|
||||
:arg str text: the text to linkify
|
||||
|
||||
:arg list callbacks: list of callbacks to run when adjusting tag attributes;
|
||||
defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
|
||||
|
||||
:arg list skip_tags: list of tags that you don't want to linkify the
|
||||
contents of; for example, you could set this to ``['pre']`` to skip
|
||||
linkifying contents of ``pre`` tags
|
||||
|
||||
:arg bool parse_email: whether or not to linkify email addresses
|
||||
|
||||
:returns: linkified text as unicode
|
||||
|
||||
linkify() converts strings that look like URLs or domain names in a
|
||||
blob of text that may be an HTML fragment to links, while preserving
|
||||
(a) links already in the string, (b) urls found in attributes, and
|
||||
(c) email addresses.
|
||||
"""
|
||||
text = force_unicode(text)
|
||||
|
||||
if not text:
|
||||
return ''
|
||||
|
||||
parser = html5lib.HTMLParser(tokenizer=tokenizer)
|
||||
|
||||
forest = parser.parseFragment(text)
|
||||
_seen = set([])
|
||||
|
||||
def replace_nodes(tree, new_frag, node, index=0):
|
||||
"""
|
||||
Doesn't really replace nodes, but inserts the nodes contained in
|
||||
new_frag into the treee at position index and returns the number
|
||||
of nodes inserted.
|
||||
If node is passed in, it is removed from the tree
|
||||
"""
|
||||
count = 0
|
||||
new_tree = parser.parseFragment(new_frag)
|
||||
# capture any non-tag text at the start of the fragment
|
||||
if new_tree.text:
|
||||
if index == 0:
|
||||
tree.text = tree.text or ''
|
||||
tree.text += new_tree.text
|
||||
else:
|
||||
tree[index - 1].tail = tree[index - 1].tail or ''
|
||||
tree[index - 1].tail += new_tree.text
|
||||
# the put in the tagged elements into the old tree
|
||||
for n in new_tree:
|
||||
if n.tag == ETREE_TAG('a'):
|
||||
_seen.add(n)
|
||||
tree.insert(index + count, n)
|
||||
count += 1
|
||||
# if we got a node to remove...
|
||||
if node is not None:
|
||||
tree.remove(node)
|
||||
return count
|
||||
|
||||
def strip_wrapping_parentheses(fragment):
|
||||
"""Strips wrapping parentheses.
|
||||
|
||||
Returns a tuple of the following format::
|
||||
|
||||
(string stripped from wrapping parentheses,
|
||||
count of stripped opening parentheses,
|
||||
count of stripped closing parentheses)
|
||||
"""
|
||||
opening_parentheses = closing_parentheses = 0
|
||||
# Count consecutive opening parentheses
|
||||
# at the beginning of the fragment (string).
|
||||
for char in fragment:
|
||||
if char == '(':
|
||||
opening_parentheses += 1
|
||||
else:
|
||||
break
|
||||
|
||||
if opening_parentheses:
|
||||
newer_frag = ''
|
||||
# Cut the consecutive opening brackets from the fragment.
|
||||
fragment = fragment[opening_parentheses:]
|
||||
# Reverse the fragment for easier detection of parentheses
|
||||
# inside the URL.
|
||||
reverse_fragment = fragment[::-1]
|
||||
skip = False
|
||||
for char in reverse_fragment:
|
||||
# Remove the closing parentheses if it has a matching
|
||||
# opening parentheses (they are balanced).
|
||||
if (char == ')' and
|
||||
closing_parentheses < opening_parentheses and
|
||||
not skip):
|
||||
closing_parentheses += 1
|
||||
continue
|
||||
# Do not remove ')' from the URL itself.
|
||||
elif char != ')':
|
||||
skip = True
|
||||
newer_frag += char
|
||||
fragment = newer_frag[::-1]
|
||||
|
||||
return fragment, opening_parentheses, closing_parentheses
|
||||
|
||||
def apply_callbacks(attrs, new):
|
||||
for cb in callbacks:
|
||||
attrs = cb(attrs, new)
|
||||
if attrs is None:
|
||||
return None
|
||||
return attrs
|
||||
|
||||
def _render_inner(node):
|
||||
out = ['' if node.text is None else node.text]
|
||||
for subnode in node:
|
||||
out.append(_render(subnode))
|
||||
if subnode.tail:
|
||||
out.append(subnode.tail)
|
||||
return ''.join(out)
|
||||
|
||||
def linkify_nodes(tree, parse_text=True):
|
||||
children = len(tree)
|
||||
current_child = -1
|
||||
# start at -1 to process the parent first
|
||||
while current_child < len(tree):
|
||||
if current_child < 0:
|
||||
node = tree
|
||||
if parse_text and node.text:
|
||||
new_txt = old_txt = node.text
|
||||
if parse_email:
|
||||
new_txt = re.sub(email_re, email_repl, node.text)
|
||||
if new_txt and new_txt != node.text:
|
||||
node.text = ''
|
||||
adj = replace_nodes(tree, new_txt, None, 0)
|
||||
children += adj
|
||||
current_child += adj
|
||||
linkify_nodes(tree, True)
|
||||
continue
|
||||
|
||||
new_txt = re.sub(url_re, link_repl, new_txt)
|
||||
if new_txt != old_txt:
|
||||
node.text = ''
|
||||
adj = replace_nodes(tree, new_txt, None, 0)
|
||||
children += adj
|
||||
current_child += adj
|
||||
continue
|
||||
else:
|
||||
node = tree[current_child]
|
||||
|
||||
if parse_text and node.tail:
|
||||
new_tail = old_tail = node.tail
|
||||
if parse_email:
|
||||
new_tail = re.sub(email_re, email_repl, new_tail)
|
||||
if new_tail != node.tail:
|
||||
node.tail = ''
|
||||
adj = replace_nodes(tree, new_tail, None,
|
||||
current_child + 1)
|
||||
# Insert the new nodes made from my tail into
|
||||
# the tree right after me. current_child+1
|
||||
children += adj
|
||||
continue
|
||||
|
||||
new_tail = re.sub(url_re, link_repl, new_tail)
|
||||
if new_tail != old_tail:
|
||||
node.tail = ''
|
||||
adj = replace_nodes(tree, new_tail, None,
|
||||
current_child + 1)
|
||||
children += adj
|
||||
|
||||
if node.tag == ETREE_TAG('a') and not (node in _seen):
|
||||
if not node.get('href', None) is None:
|
||||
attrs = dict(node.items())
|
||||
|
||||
_text = attrs['_text'] = _render_inner(node)
|
||||
|
||||
attrs = apply_callbacks(attrs, False)
|
||||
|
||||
if attrs is None:
|
||||
# <a> tag replaced by the text within it
|
||||
adj = replace_nodes(tree, _text, node,
|
||||
current_child)
|
||||
current_child -= 1
|
||||
# pull back current_child by 1 to scan the
|
||||
# new nodes again.
|
||||
else:
|
||||
text = force_unicode(attrs.pop('_text'))
|
||||
for attr_key, attr_val in attrs.items():
|
||||
node.set(attr_key, attr_val)
|
||||
|
||||
for n in reversed(list(node)):
|
||||
node.remove(n)
|
||||
text = parser.parseFragment(text)
|
||||
node.text = text.text
|
||||
for n in text:
|
||||
node.append(n)
|
||||
_seen.add(node)
|
||||
|
||||
elif current_child >= 0:
|
||||
if node.tag == ETREE_TAG('pre') and skip_pre:
|
||||
linkify_nodes(node, False)
|
||||
elif not (node in _seen):
|
||||
linkify_nodes(node, True)
|
||||
|
||||
current_child += 1
|
||||
|
||||
def email_repl(match):
|
||||
addr = match.group(0).replace('"', '"')
|
||||
link = {
|
||||
'_text': addr,
|
||||
'href': 'mailto:{0!s}'.format(addr),
|
||||
}
|
||||
link = apply_callbacks(link, True)
|
||||
|
||||
if link is None:
|
||||
return addr
|
||||
|
||||
_href = link.pop('href')
|
||||
_text = link.pop('_text')
|
||||
|
||||
repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
|
||||
attr = '{0!s}="{1!s}"'
|
||||
attribs = ' '.join(attr.format(k, v) for k, v in link.items())
|
||||
return repl.format(_href, attribs, _text)
|
||||
|
||||
def link_repl(match):
|
||||
url = match.group(0)
|
||||
open_brackets = close_brackets = 0
|
||||
if url.startswith('('):
|
||||
_wrapping = strip_wrapping_parentheses(url)
|
||||
url, open_brackets, close_brackets = _wrapping
|
||||
end = ''
|
||||
m = re.search(punct_re, url)
|
||||
if m:
|
||||
end = m.group(0)
|
||||
url = url[0:m.start()]
|
||||
if re.search(proto_re, url):
|
||||
href = url
|
||||
else:
|
||||
href = ''.join(['http://', url])
|
||||
|
||||
link = {
|
||||
'_text': url,
|
||||
'href': href,
|
||||
}
|
||||
|
||||
link = apply_callbacks(link, True)
|
||||
|
||||
if link is None:
|
||||
return '(' * open_brackets + url + ')' * close_brackets
|
||||
|
||||
_text = link.pop('_text')
|
||||
_href = link.pop('href')
|
||||
|
||||
repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
|
||||
attr = '{0!s}="{1!s}"'
|
||||
attribs = ' '.join(attr.format(k, v) for k, v in link.items())
|
||||
|
||||
return repl.format('(' * open_brackets,
|
||||
_href, attribs, _text, end,
|
||||
')' * close_brackets)
|
||||
|
||||
try:
|
||||
linkify_nodes(forest)
|
||||
except RuntimeError as e:
|
||||
# If we hit the max recursion depth, just return what we've got.
|
||||
log.exception('Probable recursion error: {0!r}'.format(e))
|
||||
|
||||
return _render(forest)
|
||||
|
||||
|
||||
def _render(tree):
|
||||
"""Try rendering as HTML, then XML, then give up."""
|
||||
return force_unicode(_serialize(tree))
|
||||
|
||||
|
||||
def _serialize(domtree):
|
||||
walker = html5lib.treewalkers.getTreeWalker('etree')
|
||||
stream = walker(domtree)
|
||||
serializer = HTMLSerializer(quote_attr_values=True,
|
||||
alphabetical_attributes=True,
|
||||
omit_optional_tags=False)
|
||||
return serializer.render(stream)
|
||||
linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email)
|
||||
return linker.linkify(text)
|
||||
|
|
61
lib/bleach/_vendor/README.rst
Normal file
61
lib/bleach/_vendor/README.rst
Normal file
|
@ -0,0 +1,61 @@
|
|||
=======================
|
||||
Vendored library policy
|
||||
=======================
|
||||
|
||||
To simplify Bleach development, we're now vendoring certain libraries that
|
||||
we use.
|
||||
|
||||
Vendored libraries must follow these rules:
|
||||
|
||||
1. Vendored libraries must be pure Python--no compiling.
|
||||
2. Source code for the libary is included in this directory.
|
||||
3. License must be included in this repo and in the Bleach distribution.
|
||||
4. Requirements of the library become requirements of Bleach.
|
||||
5. No modifications to the library may be made.
|
||||
|
||||
|
||||
Adding/Updating a vendored library
|
||||
==================================
|
||||
|
||||
Way to vendor a library or update a version:
|
||||
|
||||
1. Update ``vendor.txt`` with the library, version, and hash. You can use
|
||||
`hashin <https://pypi.org/project/hashin/>`_.
|
||||
2. Remove all old files and directories of the old version.
|
||||
3. Run ``pip_install_vendor.sh`` and check everything it produced in including
|
||||
the ``.dist-info`` directory and contents.
|
||||
4. Update the bleach minor version in the next release.
|
||||
|
||||
|
||||
Reviewing a change involving a vendored library
|
||||
===============================================
|
||||
|
||||
Way to verify a vendored library addition/update:
|
||||
|
||||
1. Pull down the branch.
|
||||
2. Delete all the old files and directories of the old version.
|
||||
3. Run ``pip_install_vendor.sh``.
|
||||
4. Run ``git diff`` and verify there are no changes.
|
||||
|
||||
|
||||
NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be necessary to reproduce the dist-info
|
||||
|
||||
|
||||
Removing/Unvendoring a vendored library
|
||||
=======================================
|
||||
|
||||
A vendored library might be removed for any of the following reasons:
|
||||
|
||||
* it violates the vendoring policy (e.g. an incompatible license
|
||||
change)
|
||||
* a suitable replacement is found
|
||||
* bleach has the resources to test and QA new bleach releases against
|
||||
multiple versions of the previously vendored library
|
||||
|
||||
To unvendor a library:
|
||||
|
||||
1. Remove the library and its hashes from ``vendor.txt``.
|
||||
2. Remove library files and directories from this directory.
|
||||
3. Run ``install_vendor.sh`` and check the previously vendored library including
|
||||
the ``.dist-info`` directory and contents is not installed.
|
||||
4. Update the bleach minor version in the next release.
|
0
lib/bleach/_vendor/__init__.py
Normal file
0
lib/bleach/_vendor/__init__.py
Normal file
66
lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst
Normal file
66
lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst
Normal file
|
@ -0,0 +1,66 @@
|
|||
Credits
|
||||
=======
|
||||
|
||||
``html5lib`` is written and maintained by:
|
||||
|
||||
- James Graham
|
||||
- Sam Sneddon
|
||||
- Łukasz Langa
|
||||
- Will Kahn-Greene
|
||||
|
||||
|
||||
Patches and suggestions
|
||||
-----------------------
|
||||
(In chronological order, by first commit:)
|
||||
|
||||
- Anne van Kesteren
|
||||
- Lachlan Hunt
|
||||
- lantis63
|
||||
- Sam Ruby
|
||||
- Thomas Broyer
|
||||
- Tim Fletcher
|
||||
- Mark Pilgrim
|
||||
- Ryan King
|
||||
- Philip Taylor
|
||||
- Edward Z. Yang
|
||||
- fantasai
|
||||
- Philip Jägenstedt
|
||||
- Ms2ger
|
||||
- Mohammad Taha Jahangir
|
||||
- Andy Wingo
|
||||
- Andreas Madsack
|
||||
- Karim Valiev
|
||||
- Juan Carlos Garcia Segovia
|
||||
- Mike West
|
||||
- Marc DM
|
||||
- Simon Sapin
|
||||
- Michael[tm] Smith
|
||||
- Ritwik Gupta
|
||||
- Marc Abramowitz
|
||||
- Tony Lopes
|
||||
- lilbludevil
|
||||
- Kevin
|
||||
- Drew Hubl
|
||||
- Austin Kumbera
|
||||
- Jim Baker
|
||||
- Jon Dufresne
|
||||
- Donald Stufft
|
||||
- Alex Gaynor
|
||||
- Nik Nyby
|
||||
- Jakub Wilk
|
||||
- Sigmund Cherem
|
||||
- Gabi Davar
|
||||
- Florian Mounier
|
||||
- neumond
|
||||
- Vitalik Verhovodov
|
||||
- Kovid Goyal
|
||||
- Adam Chainz
|
||||
- John Vandenberg
|
||||
- Eric Amorde
|
||||
- Benedikt Morbach
|
||||
- Jonathan Vanasco
|
||||
- Tom Most
|
||||
- Ville Skyttä
|
||||
- Hugo van Kemenade
|
||||
- Mark Vasilkov
|
||||
|
1
lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER
Normal file
1
lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER
Normal file
|
@ -0,0 +1 @@
|
|||
pip
|
552
lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA
Normal file
552
lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA
Normal file
|
@ -0,0 +1,552 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: html5lib
|
||||
Version: 1.1
|
||||
Summary: HTML parser based on the WHATWG HTML specification
|
||||
Home-page: https://github.com/html5lib/html5lib-python
|
||||
Maintainer: James Graham
|
||||
Maintainer-email: james@hoppipolla.co.uk
|
||||
License: MIT License
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Classifier: Programming Language :: Python :: 2.7
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing :: Markup :: HTML
|
||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
|
||||
Requires-Dist: six (>=1.9)
|
||||
Requires-Dist: webencodings
|
||||
Provides-Extra: all
|
||||
Requires-Dist: genshi ; extra == 'all'
|
||||
Requires-Dist: chardet (>=2.2) ; extra == 'all'
|
||||
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all'
|
||||
Provides-Extra: chardet
|
||||
Requires-Dist: chardet (>=2.2) ; extra == 'chardet'
|
||||
Provides-Extra: genshi
|
||||
Requires-Dist: genshi ; extra == 'genshi'
|
||||
Provides-Extra: lxml
|
||||
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml'
|
||||
|
||||
html5lib
|
||||
========
|
||||
|
||||
.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master
|
||||
:target: https://travis-ci.org/html5lib/html5lib-python
|
||||
|
||||
|
||||
html5lib is a pure-python library for parsing HTML. It is designed to
|
||||
conform to the WHATWG HTML specification, as is implemented by all major
|
||||
web browsers.
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
Simple usage follows this pattern:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
with open("mydocument.html", "rb") as f:
|
||||
document = html5lib.parse(f)
|
||||
|
||||
or:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
document = html5lib.parse("<p>Hello World!")
|
||||
|
||||
By default, the ``document`` will be an ``xml.etree`` element instance.
|
||||
Whenever possible, html5lib chooses the accelerated ``ElementTree``
|
||||
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
|
||||
|
||||
Two other tree types are supported: ``xml.dom.minidom`` and
|
||||
``lxml.etree``. To use an alternative format, specify the name of
|
||||
a treebuilder:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
with open("mydocument.html", "rb") as f:
|
||||
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
|
||||
|
||||
When using with ``urllib2`` (Python 2), the charset from HTTP should be
|
||||
pass into html5lib as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from contextlib import closing
|
||||
from urllib2 import urlopen
|
||||
import html5lib
|
||||
|
||||
with closing(urlopen("http://example.com/")) as f:
|
||||
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
|
||||
|
||||
When using with ``urllib.request`` (Python 3), the charset from HTTP
|
||||
should be pass into html5lib as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from urllib.request import urlopen
|
||||
import html5lib
|
||||
|
||||
with urlopen("http://example.com/") as f:
|
||||
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
|
||||
|
||||
To have more control over the parser, create a parser object explicitly.
|
||||
For instance, to make the parser raise exceptions on parse errors, use:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
with open("mydocument.html", "rb") as f:
|
||||
parser = html5lib.HTMLParser(strict=True)
|
||||
document = parser.parse(f)
|
||||
|
||||
When you're instantiating parser objects explicitly, pass a treebuilder
|
||||
class as the ``tree`` keyword argument to use an alternative document
|
||||
format:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
|
||||
minidom_document = parser.parse("<p>Hello World!")
|
||||
|
||||
More documentation is available at https://html5lib.readthedocs.io/.
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install html5lib
|
||||
|
||||
The goal is to support a (non-strict) superset of the versions that `pip
|
||||
supports
|
||||
<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.
|
||||
|
||||
Optional Dependencies
|
||||
---------------------
|
||||
|
||||
The following third-party libraries may be used for additional
|
||||
functionality:
|
||||
|
||||
- ``lxml`` is supported as a tree format (for both building and
|
||||
walking) under CPython (but *not* PyPy where it is known to cause
|
||||
segfaults);
|
||||
|
||||
- ``genshi`` has a treewalker (but not builder); and
|
||||
|
||||
- ``chardet`` can be used as a fallback when character encoding cannot
|
||||
be determined.
|
||||
|
||||
|
||||
Bugs
|
||||
----
|
||||
|
||||
Please report any bugs on the `issue tracker
|
||||
<https://github.com/html5lib/html5lib-python/issues>`_.
|
||||
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
Unit tests require the ``pytest`` and ``mock`` libraries and can be
|
||||
run using the ``py.test`` command in the root directory.
|
||||
|
||||
Test data are contained in a separate `html5lib-tests
|
||||
<https://github.com/html5lib/html5lib-tests>`_ repository and included
|
||||
as a submodule, thus for git checkouts they must be initialized::
|
||||
|
||||
$ git submodule init
|
||||
$ git submodule update
|
||||
|
||||
If you have all compatible Python implementations available on your
|
||||
system, you can run tests on all of them using the ``tox`` utility,
|
||||
which can be found on PyPI.
|
||||
|
||||
|
||||
Questions?
|
||||
----------
|
||||
|
||||
There's a mailing list available for support on Google Groups,
|
||||
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
|
||||
though you may get a quicker response asking on IRC in `#whatwg on
|
||||
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
1.1
|
||||
~~~
|
||||
|
||||
UNRELEASED
|
||||
|
||||
Breaking changes:
|
||||
|
||||
* Drop support for Python 3.3. (#358)
|
||||
* Drop support for Python 3.4. (#421)
|
||||
|
||||
Deprecations:
|
||||
|
||||
* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
|
||||
``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
|
||||
<https://github.com/mozilla/bleach>`. Please let us know if Bleach doesn't suffice for your
|
||||
use. (#443)
|
||||
|
||||
Other changes:
|
||||
|
||||
* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
|
||||
``html5lib`` keeps working in future Python versions. (#403)
|
||||
* Drop optional ``datrie`` dependency. (#442)
|
||||
|
||||
|
||||
1.0.1
|
||||
~~~~~
|
||||
|
||||
Released on December 7, 2017
|
||||
|
||||
Breaking changes:
|
||||
|
||||
* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
|
||||
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)
|
||||
|
||||
Features:
|
||||
|
||||
* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
|
||||
Will Kahn-Greene!)
|
||||
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
|
||||
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
|
||||
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
|
||||
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
|
||||
* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
|
||||
Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
|
||||
* Semver-compliant version number.
|
||||
|
||||
Bug fixes:
|
||||
|
||||
* Add support for setuptools < 18.5 to support environment markers. (Thank you,
|
||||
John Vandenberg!)
|
||||
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
|
||||
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
|
||||
you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
|
||||
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
|
||||
Kahn-Greene!)
|
||||
* Include license file in generated wheel package. (#350) (Thank you, Jon
|
||||
Dufresne!)
|
||||
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
|
||||
* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you,
|
||||
Komal Dembla, Hugo!)
|
||||
|
||||
|
||||
1.0
|
||||
~~~
|
||||
|
||||
Released and unreleased on December 7, 2017. Badly packaged release.
|
||||
|
||||
|
||||
0.999999999/1.0b10
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Released on July 15, 2016
|
||||
|
||||
* Fix attribute order going to the tree builder to be document order
|
||||
instead of reverse document order(!).
|
||||
|
||||
|
||||
0.99999999/1.0b9
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Released on July 14, 2016
|
||||
|
||||
* **Added ordereddict as a mandatory dependency on Python 2.6.**
|
||||
|
||||
* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
|
||||
extras that will do the right thing based on the specific
|
||||
interpreter implementation.
|
||||
|
||||
* Now requires the ``mock`` package for the testsuite.
|
||||
|
||||
* Cease supporting DATrie under PyPy.
|
||||
|
||||
* **Remove PullDOM support, as this hasn't ever been properly
|
||||
tested, doesn't entirely work, and as far as I can tell is
|
||||
completely unused by anyone.**
|
||||
|
||||
* Move testsuite to ``py.test``.
|
||||
|
||||
* **Fix #124: move to webencodings for decoding the input byte stream;
|
||||
this makes html5lib compliant with the Encoding Standard, and
|
||||
introduces a required dependency on webencodings.**
|
||||
|
||||
* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**
|
||||
|
||||
* **Fix comments containing double-dash with lxml 3.5 and above.**
|
||||
|
||||
* **Use scripting disabled by default (as we don't implement
|
||||
scripting).**
|
||||
|
||||
* **Fix #11, avoiding the XSS bug potentially caused by serializer
|
||||
allowing attribute values to be escaped out of in old browser versions,
|
||||
changing the quote_attr_values option on serializer to take one of
|
||||
three values, "always" (the old True value), "legacy" (the new option,
|
||||
and the new default), and "spec" (the old False value, and the old
|
||||
default).**
|
||||
|
||||
* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
|
||||
(instead of the tokenizer); as such, this will require amending all
|
||||
callers of it to use it via the treewalker API.**
|
||||
|
||||
* **Drop support of charade, now that chardet is supported once more.**
|
||||
|
||||
* **Replace the charset keyword argument on parse and related methods
|
||||
with a set of keyword arguments: override_encoding, transport_encoding,
|
||||
same_origin_parent_encoding, likely_encoding, and default_encoding.**
|
||||
|
||||
* **Move filters._base, treebuilder._base, and treewalkers._base to .base
|
||||
to clarify their status as public.**
|
||||
|
||||
* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
|
||||
sanitizer.htmlsanitizer module and move that to sanitizer. This means
|
||||
anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
|
||||
code changes.**
|
||||
|
||||
* **Rename treewalkers.lxmletree to .etree_lxml and
|
||||
treewalkers.genshistream to .genshi to have a consistent API.**
|
||||
|
||||
* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
|
||||
utils) to be underscore prefixed to clarify their status as private.
|
||||
|
||||
|
||||
0.9999999/1.0b8
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Released on September 10, 2015
|
||||
|
||||
* Fix #195: fix the sanitizer to drop broken URLs (it threw an
|
||||
exception between 0.9999 and 0.999999).
|
||||
|
||||
|
||||
0.999999/1.0b7
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
Released on July 7, 2015
|
||||
|
||||
* Fix #189: fix the sanitizer to allow relative URLs again (as it did
|
||||
prior to 0.9999/1.0b5).
|
||||
|
||||
|
||||
0.99999/1.0b6
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
Released on April 30, 2015
|
||||
|
||||
* Fix #188: fix the sanitizer to not throw an exception when sanitizing
|
||||
bogus data URLs.
|
||||
|
||||
|
||||
0.9999/1.0b5
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Released on April 29, 2015
|
||||
|
||||
* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
|
||||
this sounds, this has no known security implications. No known version
|
||||
of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
|
||||
Chrome (1 to current), or Opera (12 to current) will run any script
|
||||
provided in these attributes.
|
||||
|
||||
* Pass error message to the ParseError exception in strict parsing mode.
|
||||
|
||||
* Allow data URIs in the sanitizer, with a whitelist of content-types.
|
||||
|
||||
* Add support for Python implementations that don't support lone
|
||||
surrogates (read: Jython). Fixes #2.
|
||||
|
||||
* Remove localization of error messages. This functionality was totally
|
||||
unused (and untested that everything was localizable), so we may as
|
||||
well follow numerous browsers in not supporting translating technical
|
||||
strings.
|
||||
|
||||
* Expose treewalkers.pprint as a public API.
|
||||
|
||||
* Add a documentEncoding property to HTML5Parser, fix #121.
|
||||
|
||||
|
||||
0.999
|
||||
~~~~~
|
||||
|
||||
Released on December 23, 2013
|
||||
|
||||
* Fix #127: add work-around for CPython issue #20007: .read(0) on
|
||||
http.client.HTTPResponse drops the rest of the content.
|
||||
|
||||
* Fix #115: lxml treewalker can now deal with fragments containing, at
|
||||
their root level, text nodes with non-ASCII characters on Python 2.
|
||||
|
||||
|
||||
0.99
|
||||
~~~~
|
||||
|
||||
Released on September 10, 2013
|
||||
|
||||
* No library changes from 1.0b3; released as 0.99 as pip has changed
|
||||
behaviour from 1.4 to avoid installing pre-release versions per
|
||||
PEP 440.
|
||||
|
||||
|
||||
1.0b3
|
||||
~~~~~
|
||||
|
||||
Released on July 24, 2013
|
||||
|
||||
* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
|
||||
implementation using it should be moved to
|
||||
``NonRecursiveTreeWalker``, as everything bundled with html5lib has
|
||||
for years.
|
||||
|
||||
* Fix #67 so that ``BufferedStream`` to correctly returns a bytes
|
||||
object, thereby fixing any case where html5lib is passed a
|
||||
non-seekable RawIOBase-like object.
|
||||
|
||||
|
||||
1.0b2
|
||||
~~~~~
|
||||
|
||||
Released on June 27, 2013
|
||||
|
||||
* Removed reordering of attributes within the serializer. There is now
|
||||
an ``alphabetical_attributes`` option which preserves the previous
|
||||
behaviour through a new filter. This allows attribute order to be
|
||||
preserved through html5lib if the tree builder preserves order.
|
||||
|
||||
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
|
||||
``treeadapters.sax.to_sax`` which is generic and supports any
|
||||
treewalker; it also resolves all known bugs with ``dom2sax``.
|
||||
|
||||
* Fix treewalker assertions on hitting bytes strings on
|
||||
Python 2. Previous to 1.0b1, treewalkers coped with mixed
|
||||
bytes/unicode data on Python 2; this reintroduces this prior
|
||||
behaviour on Python 2. Behaviour is unchanged on Python 3.
|
||||
|
||||
|
||||
1.0b1
|
||||
~~~~~
|
||||
|
||||
Released on May 17, 2013
|
||||
|
||||
* Implementation updated to implement the `HTML specification
|
||||
<http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
|
||||
2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
|
||||
|
||||
* Python 3.2+ supported in a single codebase using the ``six`` library.
|
||||
|
||||
* Removed support for Python 2.5 and older.
|
||||
|
||||
* Removed the deprecated Beautiful Soup 3 treebuilder.
|
||||
``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
|
||||
since it doesn't support namespaces, foreign content like SVG and
|
||||
MathML is parsed incorrectly.
|
||||
|
||||
* Removed ``simpletree`` from the package. The default tree builder is
|
||||
now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
|
||||
available, and ``xml.etree.ElementTree`` otherwise).
|
||||
|
||||
* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
|
||||
output was well-formed XML, and hence provided little of use.
|
||||
|
||||
* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
|
||||
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
|
||||
return the default DOM treebuilder, which uses ``xml.dom.minidom``.
|
||||
|
||||
* Optional heuristic character encoding detection now based on
|
||||
``charade`` for Python 2.6 - 3.3 compatibility.
|
||||
|
||||
* Optional ``Genshi`` treewalker support fixed.
|
||||
|
||||
* Many bugfixes, including:
|
||||
|
||||
* #33: null in attribute value breaks XML AttValue;
|
||||
|
||||
* #4: nested, indirect descendant, <button> causes infinite loop;
|
||||
|
||||
* `Google Code 215
|
||||
<http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
|
||||
detect seekable streams;
|
||||
|
||||
* `Google Code 206
|
||||
<http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
|
||||
support for <video preload=...>, <audio preload=...>;
|
||||
|
||||
* `Google Code 205
|
||||
<http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
|
||||
support for <video poster=...>;
|
||||
|
||||
* `Google Code 202
|
||||
<http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
|
||||
file breaks InputStream.
|
||||
|
||||
* Source code is now mostly PEP 8 compliant.
|
||||
|
||||
* Test harness has been improved and now depends on ``nose``.
|
||||
|
||||
* Documentation updated and moved to https://html5lib.readthedocs.io/.
|
||||
|
||||
|
||||
0.95
|
||||
~~~~
|
||||
|
||||
Released on February 11, 2012
|
||||
|
||||
|
||||
0.90
|
||||
~~~~
|
||||
|
||||
Released on January 17, 2010
|
||||
|
||||
|
||||
0.11.1
|
||||
~~~~~~
|
||||
|
||||
Released on June 12, 2008
|
||||
|
||||
|
||||
0.11
|
||||
~~~~
|
||||
|
||||
Released on June 10, 2008
|
||||
|
||||
|
||||
0.10
|
||||
~~~~
|
||||
|
||||
Released on October 7, 2007
|
||||
|
||||
|
||||
0.9
|
||||
~~~
|
||||
|
||||
Released on March 11, 2007
|
||||
|
||||
|
||||
0.2
|
||||
~~~
|
||||
|
||||
Released on January 8, 2007
|
||||
|
||||
|
41
lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD
Normal file
41
lib/bleach/_vendor/html5lib-1.1.dist-info/RECORD
Normal file
|
@ -0,0 +1,41 @@
|
|||
html5lib-1.1.dist-info/AUTHORS.rst,sha256=DrNAMifoDpuQyJn-KW-H6K8Tt2a5rKnV2UF4-DRrGUI,983
|
||||
html5lib-1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
html5lib-1.1.dist-info/LICENSE,sha256=FqOZkWGekvGGgJMtoqkZn999ld8-yu3FLqBiGKq6_W8,1084
|
||||
html5lib-1.1.dist-info/METADATA,sha256=Y3w-nd_22HQnQRy3yypVsV_ke2FF94uUD4-vGpc2DnI,16076
|
||||
html5lib-1.1.dist-info/RECORD,,
|
||||
html5lib-1.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
html5lib-1.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
|
||||
html5lib-1.1.dist-info/top_level.txt,sha256=XEX6CHpskSmvjJB4tP6m4Q5NYXhIf_0ceMc0PNbzJPQ,9
|
||||
html5lib/__init__.py,sha256=pWnYcfZ69wNLrdQL7bpr49FUi8O8w0KhKCOHsyRgYGQ,1143
|
||||
html5lib/_ihatexml.py,sha256=ifOwF7pXqmyThIXc3boWc96s4MDezqRrRVp7FwDYUFs,16728
|
||||
html5lib/_inputstream.py,sha256=IKuMiY8rzb7pqIGCpbvTqsxysLEpgEHWYvYEFu4LUAI,32300
|
||||
html5lib/_tokenizer.py,sha256=WvJQa2Mli4NtTmhLXkX8Jy5FcWttqCaiDTiKyaw8D-k,77028
|
||||
html5lib/_trie/__init__.py,sha256=nqfgO910329BEVJ5T4psVwQtjd2iJyEXQ2-X8c1YxwU,109
|
||||
html5lib/_trie/_base.py,sha256=CaybYyMro8uERQYjby2tTeSUatnWDfWroUN9N7ety5w,1013
|
||||
html5lib/_trie/py.py,sha256=zg7RZSHxJ8mLmuI_7VEIV8AomISrgkvqCP477AgXaG0,1763
|
||||
html5lib/_utils.py,sha256=AxAJSG15eyarCgKMnlUwzs1X6jFHXqEvhlYEOxAFmis,4919
|
||||
html5lib/constants.py,sha256=Ll-yzLU_jcjyAI_h57zkqZ7aQWE5t5xA4y_jQgoUUhw,83464
|
||||
html5lib/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
html5lib/filters/alphabeticalattributes.py,sha256=lViZc2JMCclXi_5gduvmdzrRxtO5Xo9ONnbHBVCsykU,919
|
||||
html5lib/filters/base.py,sha256=z-IU9ZAYjpsVsqmVt7kuWC63jR11hDMr6CVrvuao8W0,286
|
||||
html5lib/filters/inject_meta_charset.py,sha256=egDXUEHXmAG9504xz0K6ALDgYkvUrC2q15YUVeNlVQg,2945
|
||||
html5lib/filters/lint.py,sha256=upXATs6By7cot7o0bnNqR15sPq2Fn6Vnjvoy3gyO_rY,3631
|
||||
html5lib/filters/optionaltags.py,sha256=8lWT75J0aBOHmPgfmqTHSfPpPMp01T84NKu0CRedxcE,10588
|
||||
html5lib/filters/sanitizer.py,sha256=XGNSdzIqDTaHot1V-rRj1V_XOolApJ7n95tHP9JcgNU,26885
|
||||
html5lib/filters/whitespace.py,sha256=8eWqZxd4UC4zlFGW6iyY6f-2uuT8pOCSALc3IZt7_t4,1214
|
||||
html5lib/html5parser.py,sha256=w5hZJh0cvD3g4CS196DiTmuGpSKCMYe1GS46-yf_WZQ,117174
|
||||
html5lib/serializer.py,sha256=K2kfoLyMPMFPfdusfR30SrxNkf0mJB92-P5_RntyaaI,15747
|
||||
html5lib/treeadapters/__init__.py,sha256=18hyI-at2aBsdKzpwRwa5lGF1ipgctaTYXoU9En2ZQg,650
|
||||
html5lib/treeadapters/genshi.py,sha256=CH27pAsDKmu4ZGkAUrwty7u0KauGLCZRLPMzaO3M5vo,1715
|
||||
html5lib/treeadapters/sax.py,sha256=BKS8woQTnKiqeffHsxChUqL4q2ZR_wb5fc9MJ3zQC8s,1776
|
||||
html5lib/treebuilders/__init__.py,sha256=AysSJyvPfikCMMsTVvaxwkgDieELD5dfR8FJIAuq7hY,3592
|
||||
html5lib/treebuilders/base.py,sha256=oeZNGEB-kt90YJGVH05gb5a8E7ids2AbYwGRsVCieWk,14553
|
||||
html5lib/treebuilders/dom.py,sha256=22whb0C71zXIsai5mamg6qzBEiigcBIvaDy4Asw3at0,8925
|
||||
html5lib/treebuilders/etree.py,sha256=EbmHx-wQ-11MVucTPtF7Ul92-mQGN3Udu_KfDn-Ifhk,12824
|
||||
html5lib/treebuilders/etree_lxml.py,sha256=OazDHZGO_q4FnVs4Dhs4hzzn2JwGAOs-rfV8LAlUGW4,14754
|
||||
html5lib/treewalkers/__init__.py,sha256=OBPtc1TU5mGyy18QDMxKEyYEz0wxFUUNj5v0-XgmYhY,5719
|
||||
html5lib/treewalkers/base.py,sha256=ouiOsuSzvI0KgzdWP8PlxIaSNs9falhbiinAEc_UIJY,7476
|
||||
html5lib/treewalkers/dom.py,sha256=EHyFR8D8lYNnyDU9lx_IKigVJRyecUGua0mOi7HBukc,1413
|
||||
html5lib/treewalkers/etree.py,sha256=gkD4tfEfRWPsEGvgHHJxZmKZXUvBzVVGz3v5C_MIiOE,4539
|
||||
html5lib/treewalkers/etree_lxml.py,sha256=eLedbn6nPjlpebibsWVijey7WEpzDwxU3ubwUoudBuA,6345
|
||||
html5lib/treewalkers/genshi.py,sha256=4D2PECZ5n3ZN3qu3jMl9yY7B81jnQApBQSVlfaIuYbA,2309
|
6
lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL
Normal file
6
lib/bleach/_vendor/html5lib-1.1.dist-info/WHEEL
Normal file
|
@ -0,0 +1,6 @@
|
|||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.34.2)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py2-none-any
|
||||
Tag: py3-none-any
|
||||
|
1
lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt
Normal file
1
lib/bleach/_vendor/html5lib-1.1.dist-info/top_level.txt
Normal file
|
@ -0,0 +1 @@
|
|||
html5lib
|
35
lib/bleach/_vendor/html5lib/__init__.py
Normal file
35
lib/bleach/_vendor/html5lib/__init__.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
HTML parsing library based on the `WHATWG HTML specification
|
||||
<https://whatwg.org/html>`_. The parser is designed to be compatible with
|
||||
existing HTML found in the wild and implements well-defined error recovery that
|
||||
is largely compatible with modern desktop web browsers.
|
||||
|
||||
Example usage::
|
||||
|
||||
import html5lib
|
||||
with open("my_document.html", "rb") as f:
|
||||
tree = html5lib.parse(f)
|
||||
|
||||
For convenience, this module re-exports the following names:
|
||||
|
||||
* :func:`~.html5parser.parse`
|
||||
* :func:`~.html5parser.parseFragment`
|
||||
* :class:`~.html5parser.HTMLParser`
|
||||
* :func:`~.treebuilders.getTreeBuilder`
|
||||
* :func:`~.treewalkers.getTreeWalker`
|
||||
* :func:`~.serializer.serialize`
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .html5parser import HTMLParser, parse, parseFragment
|
||||
from .treebuilders import getTreeBuilder
|
||||
from .treewalkers import getTreeWalker
|
||||
from .serializer import serialize
|
||||
|
||||
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||
"getTreeWalker", "serialize"]
|
||||
|
||||
# this has to be at the top level, see how setup.py parses this
|
||||
#: Distribution version number.
|
||||
__version__ = "1.1"
|
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
289
lib/bleach/_vendor/html5lib/_ihatexml.py
Normal file
|
@ -0,0 +1,289 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .constants import DataLossWarning
|
||||
|
||||
baseChar = """
|
||||
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
||||
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
||||
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
||||
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
||||
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
||||
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
||||
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
||||
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
||||
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
||||
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
|
||||
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
|
||||
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
|
||||
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
|
||||
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
|
||||
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
|
||||
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
|
||||
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
|
||||
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
|
||||
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
|
||||
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
|
||||
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
|
||||
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
|
||||
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
|
||||
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
|
||||
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
|
||||
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
|
||||
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
|
||||
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
|
||||
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
|
||||
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
|
||||
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
|
||||
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
|
||||
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
|
||||
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
|
||||
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
|
||||
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
|
||||
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
|
||||
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
|
||||
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
|
||||
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
|
||||
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
|
||||
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
|
||||
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
|
||||
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
|
||||
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
|
||||
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||
|
||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||
|
||||
combiningCharacter = """
|
||||
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
|
||||
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
|
||||
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
|
||||
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
|
||||
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
|
||||
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
|
||||
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
|
||||
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
|
||||
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
|
||||
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
|
||||
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
|
||||
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
|
||||
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
|
||||
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
|
||||
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
|
||||
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
|
||||
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
|
||||
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
|
||||
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
|
||||
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
|
||||
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
|
||||
#x3099 | #x309A"""
|
||||
|
||||
digit = """
|
||||
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
|
||||
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
|
||||
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
|
||||
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||
|
||||
extender = """
|
||||
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
|
||||
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||
|
||||
letter = " | ".join([baseChar, ideographic])
|
||||
|
||||
# Without the
|
||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||
extender])
|
||||
nameFirst = " | ".join([letter, "_"])
|
||||
|
||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
||||
|
||||
|
||||
def charStringToList(chars):
|
||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
||||
rv = []
|
||||
for item in charRanges:
|
||||
foundMatch = False
|
||||
for regexp in (reChar, reCharRange):
|
||||
match = regexp.match(item)
|
||||
if match is not None:
|
||||
rv.append([hexToInt(item) for item in match.groups()])
|
||||
if len(rv[-1]) == 1:
|
||||
rv[-1] = rv[-1] * 2
|
||||
foundMatch = True
|
||||
break
|
||||
if not foundMatch:
|
||||
assert len(item) == 1
|
||||
|
||||
rv.append([ord(item)] * 2)
|
||||
rv = normaliseCharList(rv)
|
||||
return rv
|
||||
|
||||
|
||||
def normaliseCharList(charList):
|
||||
charList = sorted(charList)
|
||||
for item in charList:
|
||||
assert item[1] >= item[0]
|
||||
rv = []
|
||||
i = 0
|
||||
while i < len(charList):
|
||||
j = 1
|
||||
rv.append(charList[i])
|
||||
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
|
||||
rv[-1][1] = charList[i + j][1]
|
||||
j += 1
|
||||
i += j
|
||||
return rv
|
||||
|
||||
|
||||
# We don't really support characters above the BMP :(
|
||||
max_unicode = int("FFFF", 16)
|
||||
|
||||
|
||||
def missingRanges(charList):
|
||||
rv = []
|
||||
if charList[0] != 0:
|
||||
rv.append([0, charList[0][0] - 1])
|
||||
for i, item in enumerate(charList[:-1]):
|
||||
rv.append([item[1] + 1, charList[i + 1][0] - 1])
|
||||
if charList[-1][1] != max_unicode:
|
||||
rv.append([charList[-1][1] + 1, max_unicode])
|
||||
return rv
|
||||
|
||||
|
||||
def listToRegexpStr(charList):
|
||||
rv = []
|
||||
for item in charList:
|
||||
if item[0] == item[1]:
|
||||
rv.append(escapeRegexp(chr(item[0])))
|
||||
else:
|
||||
rv.append(escapeRegexp(chr(item[0])) + "-" +
|
||||
escapeRegexp(chr(item[1])))
|
||||
return "[%s]" % "".join(rv)
|
||||
|
||||
|
||||
def hexToInt(hex_str):
|
||||
return int(hex_str, 16)
|
||||
|
||||
|
||||
def escapeRegexp(string):
|
||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||
"[", "]", "|", "(", ")", "-")
|
||||
for char in specialCharacters:
|
||||
string = string.replace(char, "\\" + char)
|
||||
|
||||
return string
|
||||
|
||||
# output from the above
|
||||
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
# Simpler things
|
||||
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
|
||||
|
||||
|
||||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
|
||||
def __init__(self,
|
||||
dropXmlnsLocalName=False,
|
||||
dropXmlnsAttrNs=False,
|
||||
preventDoubleDashComments=False,
|
||||
preventDashAtCommentEnd=False,
|
||||
replaceFormFeedCharacters=True,
|
||||
preventSingleQuotePubid=False):
|
||||
|
||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||
|
||||
self.preventDoubleDashComments = preventDoubleDashComments
|
||||
self.preventDashAtCommentEnd = preventDashAtCommentEnd
|
||||
|
||||
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
||||
|
||||
self.preventSingleQuotePubid = preventSingleQuotePubid
|
||||
|
||||
self.replaceCache = {}
|
||||
|
||||
def coerceAttribute(self, name, namespace=None):
|
||||
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
||||
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
|
||||
return None
|
||||
elif (self.dropXmlnsAttrNs and
|
||||
namespace == "http://www.w3.org/2000/xmlns/"):
|
||||
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
|
||||
return None
|
||||
else:
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceElement(self, name):
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceComment(self, data):
|
||||
if self.preventDoubleDashComments:
|
||||
while "--" in data:
|
||||
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
|
||||
data = data.replace("--", "- -")
|
||||
if data.endswith("-"):
|
||||
warnings.warn("Comments cannot end in a dash", DataLossWarning)
|
||||
data += " "
|
||||
return data
|
||||
|
||||
def coerceCharacters(self, data):
|
||||
if self.replaceFormFeedCharacters:
|
||||
for _ in range(data.count("\x0C")):
|
||||
warnings.warn("Text cannot contain U+000C", DataLossWarning)
|
||||
data = data.replace("\x0C", " ")
|
||||
# Other non-xml characters
|
||||
return data
|
||||
|
||||
def coercePubid(self, data):
|
||||
dataOutput = data
|
||||
for char in nonPubidCharRegexp.findall(data):
|
||||
warnings.warn("Coercing non-XML pubid", DataLossWarning)
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
dataOutput = dataOutput.replace(char, replacement)
|
||||
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
|
||||
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
|
||||
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
|
||||
return dataOutput
|
||||
|
||||
def toXmlName(self, name):
|
||||
nameFirst = name[0]
|
||||
nameRest = name[1:]
|
||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||
if m:
|
||||
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
|
||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||
else:
|
||||
nameFirstOutput = nameFirst
|
||||
|
||||
nameRestOutput = nameRest
|
||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||
for char in replaceChars:
|
||||
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||
return nameFirstOutput + nameRestOutput
|
||||
|
||||
def getReplacementCharacter(self, char):
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
return replacement
|
||||
|
||||
def fromXmlName(self, name):
|
||||
for item in set(self.replacementRegexp.findall(name)):
|
||||
name = name.replace(item, self.unescapeChar(item))
|
||||
return name
|
||||
|
||||
def escapeChar(self, char):
|
||||
replacement = "U%05X" % ord(char)
|
||||
self.replaceCache[char] = replacement
|
||||
return replacement
|
||||
|
||||
def unescapeChar(self, charcode):
|
||||
return chr(int(charcode[1:], 16))
|
918
lib/bleach/_vendor/html5lib/_inputstream.py
Normal file
918
lib/bleach/_vendor/html5lib/_inputstream.py
Normal file
|
@ -0,0 +1,918 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from six import text_type
|
||||
from six.moves import http_client, urllib
|
||||
|
||||
import codecs
|
||||
import re
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import webencodings
|
||||
|
||||
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from .constants import _ReparseException
|
||||
from . import _utils
|
||||
|
||||
# Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
|
||||
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
|
||||
|
||||
if _utils.supports_lone_surrogates:
|
||||
# Use one extra step of indirection and create surrogates with
|
||||
# eval. Not using this indirection would introduce an illegal
|
||||
# unicode literal on platforms not supporting such lone
|
||||
# surrogates.
|
||||
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
|
||||
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
|
||||
"]")
|
||||
else:
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||
|
||||
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
||||
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
||||
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||
0x10FFFE, 0x10FFFF}
|
||||
|
||||
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
|
||||
|
||||
# Cache for charsUntil()
|
||||
charsUntilRegEx = {}
|
||||
|
||||
|
||||
class BufferedStream(object):
|
||||
"""Buffering for streams that do not have buffering of their own
|
||||
|
||||
The buffer is implemented as a list of chunks on the assumption that
|
||||
joining many strings will be slow since it is O(n**2)
|
||||
"""
|
||||
|
||||
def __init__(self, stream):
|
||||
self.stream = stream
|
||||
self.buffer = []
|
||||
self.position = [-1, 0] # chunk number, offset
|
||||
|
||||
def tell(self):
|
||||
pos = 0
|
||||
for chunk in self.buffer[:self.position[0]]:
|
||||
pos += len(chunk)
|
||||
pos += self.position[1]
|
||||
return pos
|
||||
|
||||
def seek(self, pos):
|
||||
assert pos <= self._bufferedBytes()
|
||||
offset = pos
|
||||
i = 0
|
||||
while len(self.buffer[i]) < offset:
|
||||
offset -= len(self.buffer[i])
|
||||
i += 1
|
||||
self.position = [i, offset]
|
||||
|
||||
def read(self, bytes):
|
||||
if not self.buffer:
|
||||
return self._readStream(bytes)
|
||||
elif (self.position[0] == len(self.buffer) and
|
||||
self.position[1] == len(self.buffer[-1])):
|
||||
return self._readStream(bytes)
|
||||
else:
|
||||
return self._readFromBuffer(bytes)
|
||||
|
||||
def _bufferedBytes(self):
|
||||
return sum([len(item) for item in self.buffer])
|
||||
|
||||
def _readStream(self, bytes):
|
||||
data = self.stream.read(bytes)
|
||||
self.buffer.append(data)
|
||||
self.position[0] += 1
|
||||
self.position[1] = len(data)
|
||||
return data
|
||||
|
||||
def _readFromBuffer(self, bytes):
|
||||
remainingBytes = bytes
|
||||
rv = []
|
||||
bufferIndex = self.position[0]
|
||||
bufferOffset = self.position[1]
|
||||
while bufferIndex < len(self.buffer) and remainingBytes != 0:
|
||||
assert remainingBytes > 0
|
||||
bufferedData = self.buffer[bufferIndex]
|
||||
|
||||
if remainingBytes <= len(bufferedData) - bufferOffset:
|
||||
bytesToRead = remainingBytes
|
||||
self.position = [bufferIndex, bufferOffset + bytesToRead]
|
||||
else:
|
||||
bytesToRead = len(bufferedData) - bufferOffset
|
||||
self.position = [bufferIndex, len(bufferedData)]
|
||||
bufferIndex += 1
|
||||
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
|
||||
remainingBytes -= bytesToRead
|
||||
|
||||
bufferOffset = 0
|
||||
|
||||
if remainingBytes:
|
||||
rv.append(self._readStream(remainingBytes))
|
||||
|
||||
return b"".join(rv)
|
||||
|
||||
|
||||
def HTMLInputStream(source, **kwargs):
|
||||
# Work around Python bug #20007: read(0) closes the connection.
|
||||
# http://bugs.python.org/issue20007
|
||||
if (isinstance(source, http_client.HTTPResponse) or
|
||||
# Also check for addinfourl wrapping HTTPResponse
|
||||
(isinstance(source, urllib.response.addbase) and
|
||||
isinstance(source.fp, http_client.HTTPResponse))):
|
||||
isUnicode = False
|
||||
elif hasattr(source, "read"):
|
||||
isUnicode = isinstance(source.read(0), text_type)
|
||||
else:
|
||||
isUnicode = isinstance(source, text_type)
|
||||
|
||||
if isUnicode:
|
||||
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
||||
if encodings:
|
||||
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
||||
|
||||
return HTMLUnicodeInputStream(source, **kwargs)
|
||||
else:
|
||||
return HTMLBinaryInputStream(source, **kwargs)
|
||||
|
||||
|
||||
class HTMLUnicodeInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
_defaultChunkSize = 10240
|
||||
|
||||
def __init__(self, source):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
|
||||
if not _utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
elif len("\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
self.charEncoding = (lookupEncoding("utf-8"), "certain")
|
||||
self.dataStream = self.openStream(source)
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.chunk = ""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
self.errors = []
|
||||
|
||||
# number of (complete) lines in previous chunks
|
||||
self.prevNumLines = 0
|
||||
# number of columns in the last line of the previous chunk
|
||||
self.prevNumCols = 0
|
||||
|
||||
# Deal with CR LF and surrogates split over chunk boundaries
|
||||
self._bufferedCharacter = None
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
stream = StringIO(source)
|
||||
|
||||
return stream
|
||||
|
||||
def _position(self, offset):
|
||||
chunk = self.chunk
|
||||
nLines = chunk.count('\n', 0, offset)
|
||||
positionLine = self.prevNumLines + nLines
|
||||
lastLinePos = chunk.rfind('\n', 0, offset)
|
||||
if lastLinePos == -1:
|
||||
positionColumn = self.prevNumCols + offset
|
||||
else:
|
||||
positionColumn = offset - (lastLinePos + 1)
|
||||
return (positionLine, positionColumn)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
line, col = self._position(self.chunkOffset)
|
||||
return (line + 1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
# Read a new chunk from the input stream if necessary
|
||||
if self.chunkOffset >= self.chunkSize:
|
||||
if not self.readChunk():
|
||||
return EOF
|
||||
|
||||
chunkOffset = self.chunkOffset
|
||||
char = self.chunk[chunkOffset]
|
||||
self.chunkOffset = chunkOffset + 1
|
||||
|
||||
return char
|
||||
|
||||
def readChunk(self, chunkSize=None):
|
||||
if chunkSize is None:
|
||||
chunkSize = self._defaultChunkSize
|
||||
|
||||
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
||||
|
||||
self.chunk = ""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
|
||||
data = self.dataStream.read(chunkSize)
|
||||
|
||||
# Deal with CR LF and surrogates broken across chunks
|
||||
if self._bufferedCharacter:
|
||||
data = self._bufferedCharacter + data
|
||||
self._bufferedCharacter = None
|
||||
elif not data:
|
||||
# We have no more data, bye-bye stream
|
||||
return False
|
||||
|
||||
if len(data) > 1:
|
||||
lastv = ord(data[-1])
|
||||
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
|
||||
self._bufferedCharacter = data[-1]
|
||||
data = data[:-1]
|
||||
|
||||
if self.reportCharacterErrors:
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
# Replace invalid characters
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
||||
self.chunk = data
|
||||
self.chunkSize = len(data)
|
||||
|
||||
return True
|
||||
|
||||
def characterErrorsUCS4(self, data):
|
||||
for _ in range(len(invalid_unicode_re.findall(data))):
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def characterErrorsUCS2(self, data):
|
||||
# Someone picked the wrong compile option
|
||||
# You lose
|
||||
skip = False
|
||||
for match in invalid_unicode_re.finditer(data):
|
||||
if skip:
|
||||
continue
|
||||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
# Pretty sure there should be endianness issues here
|
||||
if _utils.isSurrogatePair(data[pos:pos + 2]):
|
||||
# We have a surrogate pair!
|
||||
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
|
||||
pos == len(data) - 1):
|
||||
self.errors.append("invalid-codepoint")
|
||||
else:
|
||||
skip = False
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def charsUntil(self, characters, opposite=False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in 'characters' or EOF. 'characters' must be
|
||||
a container that supports the 'in' method and iteration over its
|
||||
characters.
|
||||
"""
|
||||
|
||||
# Use a cache of regexps to find the required characters
|
||||
try:
|
||||
chars = charsUntilRegEx[(characters, opposite)]
|
||||
except KeyError:
|
||||
if __debug__:
|
||||
for c in characters:
|
||||
assert(ord(c) < 128)
|
||||
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
||||
if not opposite:
|
||||
regex = "^%s" % regex
|
||||
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
|
||||
|
||||
rv = []
|
||||
|
||||
while True:
|
||||
# Find the longest matching prefix
|
||||
m = chars.match(self.chunk, self.chunkOffset)
|
||||
if m is None:
|
||||
# If nothing matched, and it wasn't because we ran out of chunk,
|
||||
# then stop
|
||||
if self.chunkOffset != self.chunkSize:
|
||||
break
|
||||
else:
|
||||
end = m.end()
|
||||
# If not the whole chunk matched, return everything
|
||||
# up to the part that didn't match
|
||||
if end != self.chunkSize:
|
||||
rv.append(self.chunk[self.chunkOffset:end])
|
||||
self.chunkOffset = end
|
||||
break
|
||||
# If the whole remainder of the chunk matched,
|
||||
# use it all and read the next chunk
|
||||
rv.append(self.chunk[self.chunkOffset:])
|
||||
if not self.readChunk():
|
||||
# Reached EOF
|
||||
break
|
||||
|
||||
r = "".join(rv)
|
||||
return r
|
||||
|
||||
def unget(self, char):
|
||||
# Only one character is allowed to be ungotten at once - it must
|
||||
# be consumed again before any further call to unget
|
||||
if char is not EOF:
|
||||
if self.chunkOffset == 0:
|
||||
# unget is called quite rarely, so it's a good idea to do
|
||||
# more work here if it saves a bit of work in the frequently
|
||||
# called char and charsUntil.
|
||||
# So, just prepend the ungotten character onto the current
|
||||
# chunk:
|
||||
self.chunk = char + self.chunk
|
||||
self.chunkSize += 1
|
||||
else:
|
||||
self.chunkOffset -= 1
|
||||
assert self.chunk[self.chunkOffset] == char
|
||||
|
||||
|
||||
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, override_encoding=None, transport_encoding=None,
|
||||
same_origin_parent_encoding=None, likely_encoding=None,
|
||||
default_encoding="windows-1252", useChardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
HTMLUnicodeInputStream.__init__(self, self.rawStream)
|
||||
|
||||
# Encoding Information
|
||||
# Number of bytes to use when looking for a meta element with
|
||||
# encoding information
|
||||
self.numBytesMeta = 1024
|
||||
# Number of bytes to use when using detecting encoding using chardet
|
||||
self.numBytesChardet = 100
|
||||
# Things from args
|
||||
self.override_encoding = override_encoding
|
||||
self.transport_encoding = transport_encoding
|
||||
self.same_origin_parent_encoding = same_origin_parent_encoding
|
||||
self.likely_encoding = likely_encoding
|
||||
self.default_encoding = default_encoding
|
||||
|
||||
# Determine encoding
|
||||
self.charEncoding = self.determineEncoding(useChardet)
|
||||
assert self.charEncoding[0] is not None
|
||||
|
||||
# Call superclass
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
|
||||
HTMLUnicodeInputStream.reset(self)
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
stream = BytesIO(source)
|
||||
|
||||
try:
|
||||
stream.seek(stream.tell())
|
||||
except Exception:
|
||||
stream = BufferedStream(stream)
|
||||
|
||||
return stream
|
||||
|
||||
def determineEncoding(self, chardet=True):
|
||||
# BOMs take precedence over everything
|
||||
# This will also read past the BOM if present
|
||||
charEncoding = self.detectBOM(), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# If we've been overridden, we've been overridden
|
||||
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Now check the transport layer
|
||||
charEncoding = lookupEncoding(self.transport_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Look for meta elements with encoding information
|
||||
charEncoding = self.detectEncodingMeta(), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Parent document encoding
|
||||
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
|
||||
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
|
||||
return charEncoding
|
||||
|
||||
# "likely" encoding
|
||||
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Guess with chardet, if available
|
||||
if chardet:
|
||||
try:
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
buffers = []
|
||||
detector = UniversalDetector()
|
||||
while not detector.done:
|
||||
buffer = self.rawStream.read(self.numBytesChardet)
|
||||
assert isinstance(buffer, bytes)
|
||||
if not buffer:
|
||||
break
|
||||
buffers.append(buffer)
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = lookupEncoding(detector.result['encoding'])
|
||||
self.rawStream.seek(0)
|
||||
if encoding is not None:
|
||||
return encoding, "tentative"
|
||||
|
||||
# Try the default encoding
|
||||
charEncoding = lookupEncoding(self.default_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Fallback to html5lib's default if even that hasn't worked
|
||||
return lookupEncoding("windows-1252"), "tentative"
|
||||
|
||||
def changeEncoding(self, newEncoding):
|
||||
assert self.charEncoding[1] != "certain"
|
||||
newEncoding = lookupEncoding(newEncoding)
|
||||
if newEncoding is None:
|
||||
return
|
||||
if newEncoding.name in ("utf-16be", "utf-16le"):
|
||||
newEncoding = lookupEncoding("utf-8")
|
||||
assert newEncoding is not None
|
||||
elif newEncoding == self.charEncoding[0]:
|
||||
self.charEncoding = (self.charEncoding[0], "certain")
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
self.charEncoding = (newEncoding, "certain")
|
||||
self.reset()
|
||||
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
||||
|
||||
def detectBOM(self):
|
||||
"""Attempts to detect at BOM at the start of the stream. If
|
||||
an encoding can be determined from the BOM return the name of the
|
||||
encoding otherwise return None"""
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
string = self.rawStream.read(4)
|
||||
assert isinstance(string, bytes)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
# Need to detect UTF-32 before UTF-16
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
if encoding:
|
||||
self.rawStream.seek(seek)
|
||||
return lookupEncoding(encoding)
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
return None
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
"""
|
||||
buffer = self.rawStream.read(self.numBytesMeta)
|
||||
assert isinstance(buffer, bytes)
|
||||
parser = EncodingParser(buffer)
|
||||
self.rawStream.seek(0)
|
||||
encoding = parser.getEncoding()
|
||||
|
||||
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
|
||||
encoding = lookupEncoding("utf-8")
|
||||
|
||||
return encoding
|
||||
|
||||
|
||||
class EncodingBytes(bytes):
|
||||
"""String-like object with an associated position and various extra methods
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __new__(self, value):
|
||||
assert isinstance(value, bytes)
|
||||
return bytes.__new__(self, value.lower())
|
||||
|
||||
def __init__(self, value):
|
||||
# pylint:disable=unused-argument
|
||||
self._position = -1
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
p = self._position = self._position + 1
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
return self[p:p + 1]
|
||||
|
||||
def next(self):
|
||||
# Py2 compat
|
||||
return self.__next__()
|
||||
|
||||
def previous(self):
|
||||
p = self._position
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
self._position = p = p - 1
|
||||
return self[p:p + 1]
|
||||
|
||||
def setPosition(self, position):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
self._position = position
|
||||
|
||||
def getPosition(self):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
if self._position >= 0:
|
||||
return self._position
|
||||
else:
|
||||
return None
|
||||
|
||||
position = property(getPosition, setPosition)
|
||||
|
||||
def getCurrentByte(self):
|
||||
return self[self.position:self.position + 1]
|
||||
|
||||
currentByte = property(getCurrentByte)
|
||||
|
||||
def skip(self, chars=spaceCharactersBytes):
|
||||
"""Skip past a list of characters"""
|
||||
p = self.position # use property for the error-checking
|
||||
while p < len(self):
|
||||
c = self[p:p + 1]
|
||||
if c not in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def skipUntil(self, chars):
|
||||
p = self.position
|
||||
while p < len(self):
|
||||
c = self[p:p + 1]
|
||||
if c in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def matchBytes(self, bytes):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
rv = self.startswith(bytes, self.position)
|
||||
if rv:
|
||||
self.position += len(bytes)
|
||||
return rv
|
||||
|
||||
def jumpTo(self, bytes):
|
||||
"""Look for the next sequence of bytes matching a given sequence. If
|
||||
a match is found advance the position to the last byte of the match"""
|
||||
try:
|
||||
self._position = self.index(bytes, self.position) + len(bytes) - 1
|
||||
except ValueError:
|
||||
raise StopIteration
|
||||
return True
|
||||
|
||||
|
||||
class EncodingParser(object):
|
||||
"""Mini parser for detecting character encoding from meta elements"""
|
||||
|
||||
def __init__(self, data):
|
||||
"""string - the data to work on for encoding detection"""
|
||||
self.data = EncodingBytes(data)
|
||||
self.encoding = None
|
||||
|
||||
def getEncoding(self):
|
||||
if b"<meta" not in self.data:
|
||||
return None
|
||||
|
||||
methodDispatch = (
|
||||
(b"<!--", self.handleComment),
|
||||
(b"<meta", self.handleMeta),
|
||||
(b"</", self.handlePossibleEndTag),
|
||||
(b"<!", self.handleOther),
|
||||
(b"<?", self.handleOther),
|
||||
(b"<", self.handlePossibleStartTag))
|
||||
for _ in self.data:
|
||||
keepParsing = True
|
||||
try:
|
||||
self.data.jumpTo(b"<")
|
||||
except StopIteration:
|
||||
break
|
||||
for key, method in methodDispatch:
|
||||
if self.data.matchBytes(key):
|
||||
try:
|
||||
keepParsing = method()
|
||||
break
|
||||
except StopIteration:
|
||||
keepParsing = False
|
||||
break
|
||||
if not keepParsing:
|
||||
break
|
||||
|
||||
return self.encoding
|
||||
|
||||
def handleComment(self):
|
||||
"""Skip over comments"""
|
||||
return self.data.jumpTo(b"-->")
|
||||
|
||||
def handleMeta(self):
|
||||
if self.data.currentByte not in spaceCharactersBytes:
|
||||
# if we have <meta not followed by a space so just keep going
|
||||
return True
|
||||
# We have a valid meta element we want to search for attributes
|
||||
hasPragma = False
|
||||
pendingEncoding = None
|
||||
while True:
|
||||
# Try to find the next attribute after the current position
|
||||
attr = self.getAttribute()
|
||||
if attr is None:
|
||||
return True
|
||||
else:
|
||||
if attr[0] == b"http-equiv":
|
||||
hasPragma = attr[1] == b"content-type"
|
||||
if hasPragma and pendingEncoding is not None:
|
||||
self.encoding = pendingEncoding
|
||||
return False
|
||||
elif attr[0] == b"charset":
|
||||
tentativeEncoding = attr[1]
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
elif attr[0] == b"content":
|
||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||
tentativeEncoding = contentParser.parse()
|
||||
if tentativeEncoding is not None:
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
if hasPragma:
|
||||
self.encoding = codec
|
||||
return False
|
||||
else:
|
||||
pendingEncoding = codec
|
||||
|
||||
def handlePossibleStartTag(self):
|
||||
return self.handlePossibleTag(False)
|
||||
|
||||
def handlePossibleEndTag(self):
|
||||
next(self.data)
|
||||
return self.handlePossibleTag(True)
|
||||
|
||||
def handlePossibleTag(self, endTag):
|
||||
data = self.data
|
||||
if data.currentByte not in asciiLettersBytes:
|
||||
# If the next byte is not an ascii letter either ignore this
|
||||
# fragment (possible start tag case) or treat it according to
|
||||
# handleOther
|
||||
if endTag:
|
||||
data.previous()
|
||||
self.handleOther()
|
||||
return True
|
||||
|
||||
c = data.skipUntil(spacesAngleBrackets)
|
||||
if c == b"<":
|
||||
# return to the first step in the overall "two step" algorithm
|
||||
# reprocessing the < byte
|
||||
data.previous()
|
||||
else:
|
||||
# Read all attributes
|
||||
attr = self.getAttribute()
|
||||
while attr is not None:
|
||||
attr = self.getAttribute()
|
||||
return True
|
||||
|
||||
def handleOther(self):
|
||||
return self.data.jumpTo(b">")
|
||||
|
||||
def getAttribute(self):
|
||||
"""Return a name,value pair for the next attribute in the stream,
|
||||
if one is found, or None"""
|
||||
data = self.data
|
||||
# Step 1 (skip chars)
|
||||
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
|
||||
assert c is None or len(c) == 1
|
||||
# Step 2
|
||||
if c in (b">", None):
|
||||
return None
|
||||
# Step 3
|
||||
attrName = []
|
||||
attrValue = []
|
||||
# Step 4 attribute name
|
||||
while True:
|
||||
if c == b"=" and attrName:
|
||||
break
|
||||
elif c in spaceCharactersBytes:
|
||||
# Step 6!
|
||||
c = data.skip()
|
||||
break
|
||||
elif c in (b"/", b">"):
|
||||
return b"".join(attrName), b""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrName.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrName.append(c)
|
||||
# Step 5
|
||||
c = next(data)
|
||||
# Step 7
|
||||
if c != b"=":
|
||||
data.previous()
|
||||
return b"".join(attrName), b""
|
||||
# Step 8
|
||||
next(data)
|
||||
# Step 9
|
||||
c = data.skip()
|
||||
# Step 10
|
||||
if c in (b"'", b'"'):
|
||||
# 10.1
|
||||
quoteChar = c
|
||||
while True:
|
||||
# 10.2
|
||||
c = next(data)
|
||||
# 10.3
|
||||
if c == quoteChar:
|
||||
next(data)
|
||||
return b"".join(attrName), b"".join(attrValue)
|
||||
# 10.4
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
# 10.5
|
||||
else:
|
||||
attrValue.append(c)
|
||||
elif c == b">":
|
||||
return b"".join(attrName), b""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
# Step 11
|
||||
while True:
|
||||
c = next(data)
|
||||
if c in spacesAngleBrackets:
|
||||
return b"".join(attrName), b"".join(attrValue)
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
|
||||
|
||||
class ContentAttrParser(object):
|
||||
def __init__(self, data):
|
||||
assert isinstance(data, bytes)
|
||||
self.data = data
|
||||
|
||||
def parse(self):
|
||||
try:
|
||||
# Check if the attr name is charset
|
||||
# otherwise return
|
||||
self.data.jumpTo(b"charset")
|
||||
self.data.position += 1
|
||||
self.data.skip()
|
||||
if not self.data.currentByte == b"=":
|
||||
# If there is no = sign keep looking for attrs
|
||||
return None
|
||||
self.data.position += 1
|
||||
self.data.skip()
|
||||
# Look for an encoding between matching quote marks
|
||||
if self.data.currentByte in (b'"', b"'"):
|
||||
quoteMark = self.data.currentByte
|
||||
self.data.position += 1
|
||||
oldPosition = self.data.position
|
||||
if self.data.jumpTo(quoteMark):
|
||||
return self.data[oldPosition:self.data.position]
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
# Unquoted value
|
||||
oldPosition = self.data.position
|
||||
try:
|
||||
self.data.skipUntil(spaceCharactersBytes)
|
||||
return self.data[oldPosition:self.data.position]
|
||||
except StopIteration:
|
||||
# Return the whole remaining value
|
||||
return self.data[oldPosition:]
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
|
||||
def lookupEncoding(encoding):
|
||||
"""Return the python codec name corresponding to an encoding or None if the
|
||||
string doesn't correspond to a valid encoding."""
|
||||
if isinstance(encoding, bytes):
|
||||
try:
|
||||
encoding = encoding.decode("ascii")
|
||||
except UnicodeDecodeError:
|
||||
return None
|
||||
|
||||
if encoding is not None:
|
||||
try:
|
||||
return webencodings.lookup(encoding)
|
||||
except AttributeError:
|
||||
return None
|
||||
else:
|
||||
return None
|
1735
lib/bleach/_vendor/html5lib/_tokenizer.py
Normal file
1735
lib/bleach/_vendor/html5lib/_tokenizer.py
Normal file
File diff suppressed because it is too large
Load diff
5
lib/bleach/_vendor/html5lib/_trie/__init__.py
Normal file
5
lib/bleach/_vendor/html5lib/_trie/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .py import Trie
|
||||
|
||||
__all__ = ["Trie"]
|
40
lib/bleach/_vendor/html5lib/_trie/_base.py
Normal file
40
lib/bleach/_vendor/html5lib/_trie/_base.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
try:
|
||||
from collections.abc import Mapping
|
||||
except ImportError: # Python 2.7
|
||||
from collections import Mapping
|
||||
|
||||
|
||||
class Trie(Mapping):
|
||||
"""Abstract base class for tries"""
|
||||
|
||||
def keys(self, prefix=None):
|
||||
# pylint:disable=arguments-differ
|
||||
keys = super(Trie, self).keys()
|
||||
|
||||
if prefix is None:
|
||||
return set(keys)
|
||||
|
||||
return {x for x in keys if x.startswith(prefix)}
|
||||
|
||||
def has_keys_with_prefix(self, prefix):
|
||||
for key in self.keys():
|
||||
if key.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def longest_prefix(self, prefix):
|
||||
if prefix in self:
|
||||
return prefix
|
||||
|
||||
for i in range(1, len(prefix) + 1):
|
||||
if prefix[:-i] in self:
|
||||
return prefix[:-i]
|
||||
|
||||
raise KeyError(prefix)
|
||||
|
||||
def longest_prefix_item(self, prefix):
|
||||
lprefix = self.longest_prefix(prefix)
|
||||
return (lprefix, self[lprefix])
|
67
lib/bleach/_vendor/html5lib/_trie/py.py
Normal file
67
lib/bleach/_vendor/html5lib/_trie/py.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
from bisect import bisect_left
|
||||
|
||||
from ._base import Trie as ABCTrie
|
||||
|
||||
|
||||
class Trie(ABCTrie):
|
||||
def __init__(self, data):
|
||||
if not all(isinstance(x, text_type) for x in data.keys()):
|
||||
raise TypeError("All keys must be strings")
|
||||
|
||||
self._data = data
|
||||
self._keys = sorted(data.keys())
|
||||
self._cachestr = ""
|
||||
self._cachepoints = (0, len(data))
|
||||
|
||||
def __contains__(self, key):
|
||||
return key in self._data
|
||||
|
||||
def __len__(self):
|
||||
return len(self._data)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._data)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._data[key]
|
||||
|
||||
def keys(self, prefix=None):
|
||||
if prefix is None or prefix == "" or not self._keys:
|
||||
return set(self._keys)
|
||||
|
||||
if prefix.startswith(self._cachestr):
|
||||
lo, hi = self._cachepoints
|
||||
start = i = bisect_left(self._keys, prefix, lo, hi)
|
||||
else:
|
||||
start = i = bisect_left(self._keys, prefix)
|
||||
|
||||
keys = set()
|
||||
if start == len(self._keys):
|
||||
return keys
|
||||
|
||||
while self._keys[i].startswith(prefix):
|
||||
keys.add(self._keys[i])
|
||||
i += 1
|
||||
|
||||
self._cachestr = prefix
|
||||
self._cachepoints = (start, i)
|
||||
|
||||
return keys
|
||||
|
||||
def has_keys_with_prefix(self, prefix):
|
||||
if prefix in self._data:
|
||||
return True
|
||||
|
||||
if prefix.startswith(self._cachestr):
|
||||
lo, hi = self._cachepoints
|
||||
i = bisect_left(self._keys, prefix, lo, hi)
|
||||
else:
|
||||
i = bisect_left(self._keys, prefix)
|
||||
|
||||
if i == len(self._keys):
|
||||
return False
|
||||
|
||||
return self._keys[i].startswith(prefix)
|
159
lib/bleach/_vendor/html5lib/_utils.py
Normal file
159
lib/bleach/_vendor/html5lib/_utils.py
Normal file
|
@ -0,0 +1,159 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from types import ModuleType
|
||||
|
||||
try:
|
||||
from collections.abc import Mapping
|
||||
except ImportError:
|
||||
from collections import Mapping
|
||||
|
||||
from six import text_type, PY3
|
||||
|
||||
if PY3:
|
||||
import xml.etree.ElementTree as default_etree
|
||||
else:
|
||||
try:
|
||||
import xml.etree.cElementTree as default_etree
|
||||
except ImportError:
|
||||
import xml.etree.ElementTree as default_etree
|
||||
|
||||
|
||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||
"supports_lone_surrogates"]
|
||||
|
||||
|
||||
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||
# caught by the below test. In general this would be any platform
|
||||
# using UTF-16 as its encoding of unicode strings, such as
|
||||
# Jython. This is because UTF-16 itself is based on the use of such
|
||||
# surrogates, and there is no mechanism to further escape such
|
||||
# escapes.
|
||||
try:
|
||||
_x = eval('"\\uD800"') # pylint:disable=eval-used
|
||||
if not isinstance(_x, text_type):
|
||||
# We need this with u"" because of http://bugs.jython.org/issue2039
|
||||
_x = eval('u"\\uD800"') # pylint:disable=eval-used
|
||||
assert isinstance(_x, text_type)
|
||||
except Exception:
|
||||
supports_lone_surrogates = False
|
||||
else:
|
||||
supports_lone_surrogates = True
|
||||
|
||||
|
||||
class MethodDispatcher(dict):
|
||||
"""Dict with 2 special properties:
|
||||
|
||||
On initiation, keys that are lists, sets or tuples are converted to
|
||||
multiple keys so accessing any one of the items in the original
|
||||
list-like object returns the matching value
|
||||
|
||||
md = MethodDispatcher({("foo", "bar"):"baz"})
|
||||
md["foo"] == "baz"
|
||||
|
||||
A default value which can be set through the default attribute.
|
||||
"""
|
||||
|
||||
def __init__(self, items=()):
|
||||
_dictEntries = []
|
||||
for name, value in items:
|
||||
if isinstance(name, (list, tuple, frozenset, set)):
|
||||
for item in name:
|
||||
_dictEntries.append((item, value))
|
||||
else:
|
||||
_dictEntries.append((name, value))
|
||||
dict.__init__(self, _dictEntries)
|
||||
assert len(self) == len(_dictEntries)
|
||||
self.default = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return dict.get(self, key, self.default)
|
||||
|
||||
def __get__(self, instance, owner=None):
|
||||
return BoundMethodDispatcher(instance, self)
|
||||
|
||||
|
||||
class BoundMethodDispatcher(Mapping):
|
||||
"""Wraps a MethodDispatcher, binding its return values to `instance`"""
|
||||
def __init__(self, instance, dispatcher):
|
||||
self.instance = instance
|
||||
self.dispatcher = dispatcher
|
||||
|
||||
def __getitem__(self, key):
|
||||
# see https://docs.python.org/3/reference/datamodel.html#object.__get__
|
||||
# on a function, __get__ is used to bind a function to an instance as a bound method
|
||||
return self.dispatcher[key].__get__(self.instance)
|
||||
|
||||
def get(self, key, default):
|
||||
if key in self.dispatcher:
|
||||
return self[key]
|
||||
else:
|
||||
return default
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.dispatcher)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dispatcher)
|
||||
|
||||
def __contains__(self, key):
|
||||
return key in self.dispatcher
|
||||
|
||||
|
||||
# Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||
# python builds
|
||||
|
||||
def isSurrogatePair(data):
|
||||
return (len(data) == 2 and
|
||||
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
|
||||
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
|
||||
|
||||
|
||||
def surrogatePairToCodepoint(data):
|
||||
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
|
||||
(ord(data[1]) - 0xDC00))
|
||||
return char_val
|
||||
|
||||
# Module Factory Factory (no, this isn't Java, I know)
|
||||
# Here to stop this being duplicated all over the place.
|
||||
|
||||
|
||||
def moduleFactoryFactory(factory):
|
||||
moduleCache = {}
|
||||
|
||||
def moduleFactory(baseModule, *args, **kwargs):
|
||||
if isinstance(ModuleType.__name__, type("")):
|
||||
name = "_%s_factory" % baseModule.__name__
|
||||
else:
|
||||
name = b"_%s_factory" % baseModule.__name__
|
||||
|
||||
kwargs_tuple = tuple(kwargs.items())
|
||||
|
||||
try:
|
||||
return moduleCache[name][args][kwargs_tuple]
|
||||
except KeyError:
|
||||
mod = ModuleType(name)
|
||||
objs = factory(baseModule, *args, **kwargs)
|
||||
mod.__dict__.update(objs)
|
||||
if "name" not in moduleCache:
|
||||
moduleCache[name] = {}
|
||||
if "args" not in moduleCache[name]:
|
||||
moduleCache[name][args] = {}
|
||||
if "kwargs" not in moduleCache[name][args]:
|
||||
moduleCache[name][args][kwargs_tuple] = {}
|
||||
moduleCache[name][args][kwargs_tuple] = mod
|
||||
return mod
|
||||
|
||||
return moduleFactory
|
||||
|
||||
|
||||
def memoize(func):
|
||||
cache = {}
|
||||
|
||||
def wrapped(*args, **kwargs):
|
||||
key = (tuple(args), tuple(kwargs.items()))
|
||||
if key not in cache:
|
||||
cache[key] = func(*args, **kwargs)
|
||||
return cache[key]
|
||||
|
||||
return wrapped
|
2946
lib/bleach/_vendor/html5lib/constants.py
Normal file
2946
lib/bleach/_vendor/html5lib/constants.py
Normal file
File diff suppressed because it is too large
Load diff
0
lib/bleach/_vendor/html5lib/filters/__init__.py
Normal file
0
lib/bleach/_vendor/html5lib/filters/__init__.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import base
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
def _attr_key(attr):
|
||||
"""Return an appropriate key for an attribute for sorting
|
||||
|
||||
Attributes have a namespace that can be either ``None`` or a string. We
|
||||
can't compare the two because they're different types, so we convert
|
||||
``None`` to an empty string first.
|
||||
|
||||
"""
|
||||
return (attr[0][0] or ''), attr[0][1]
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Alphabetizes attributes for elements"""
|
||||
def __iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
if token["type"] in ("StartTag", "EmptyTag"):
|
||||
attrs = OrderedDict()
|
||||
for name, value in sorted(token["data"].items(),
|
||||
key=_attr_key):
|
||||
attrs[name] = value
|
||||
token["data"] = attrs
|
||||
yield token
|
12
lib/bleach/_vendor/html5lib/filters/base.py
Normal file
12
lib/bleach/_vendor/html5lib/filters/base.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
|
||||
class Filter(object):
|
||||
def __init__(self, source):
|
||||
self.source = source
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.source)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.source, name)
|
73
lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py
Normal file
73
lib/bleach/_vendor/html5lib/filters/inject_meta_charset.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import base
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
|
||||
def __init__(self, source, encoding):
|
||||
"""Creates a Filter
|
||||
|
||||
:arg source: the source token stream
|
||||
|
||||
:arg encoding: the encoding to set
|
||||
|
||||
"""
|
||||
base.Filter.__init__(self, source)
|
||||
self.encoding = encoding
|
||||
|
||||
def __iter__(self):
|
||||
state = "pre_head"
|
||||
meta_found = (self.encoding is None)
|
||||
pending = []
|
||||
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["name"].lower() == "head":
|
||||
state = "in_head"
|
||||
|
||||
elif type == "EmptyTag":
|
||||
if token["name"].lower() == "meta":
|
||||
# replace charset with actual encoding
|
||||
has_http_equiv_content_type = False
|
||||
for (namespace, name), value in token["data"].items():
|
||||
if namespace is not None:
|
||||
continue
|
||||
elif name.lower() == 'charset':
|
||||
token["data"][(namespace, name)] = self.encoding
|
||||
meta_found = True
|
||||
break
|
||||
elif name == 'http-equiv' and value.lower() == 'content-type':
|
||||
has_http_equiv_content_type = True
|
||||
else:
|
||||
if has_http_equiv_content_type and (None, "content") in token["data"]:
|
||||
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
|
||||
meta_found = True
|
||||
|
||||
elif token["name"].lower() == "head" and not meta_found:
|
||||
# insert meta into empty head
|
||||
yield {"type": "StartTag", "name": "head",
|
||||
"data": token["data"]}
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": {(None, "charset"): self.encoding}}
|
||||
yield {"type": "EndTag", "name": "head"}
|
||||
meta_found = True
|
||||
continue
|
||||
|
||||
elif type == "EndTag":
|
||||
if token["name"].lower() == "head" and pending:
|
||||
# insert meta into head (if necessary) and flush pending queue
|
||||
yield pending.pop(0)
|
||||
if not meta_found:
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": {(None, "charset"): self.encoding}}
|
||||
while pending:
|
||||
yield pending.pop(0)
|
||||
meta_found = True
|
||||
state = "post_head"
|
||||
|
||||
if state == "in_head":
|
||||
pending.append(token)
|
||||
else:
|
||||
yield token
|
93
lib/bleach/_vendor/html5lib/filters/lint.py
Normal file
93
lib/bleach/_vendor/html5lib/filters/lint.py
Normal file
|
@ -0,0 +1,93 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from six import text_type
|
||||
|
||||
from . import base
|
||||
from ..constants import namespaces, voidElements
|
||||
|
||||
from ..constants import spaceCharacters
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Lints the token stream for errors
|
||||
|
||||
If it finds any errors, it'll raise an ``AssertionError``.
|
||||
|
||||
"""
|
||||
def __init__(self, source, require_matching_tags=True):
|
||||
"""Creates a Filter
|
||||
|
||||
:arg source: the source token stream
|
||||
|
||||
:arg require_matching_tags: whether or not to require matching tags
|
||||
|
||||
"""
|
||||
super(Filter, self).__init__(source)
|
||||
self.require_matching_tags = require_matching_tags
|
||||
|
||||
def __iter__(self):
|
||||
open_elements = []
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
namespace = token["namespace"]
|
||||
name = token["name"]
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
assert isinstance(token["data"], dict)
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
assert type == "EmptyTag"
|
||||
else:
|
||||
assert type == "StartTag"
|
||||
if type == "StartTag" and self.require_matching_tags:
|
||||
open_elements.append((namespace, name))
|
||||
for (namespace, name), value in token["data"].items():
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
assert isinstance(value, text_type)
|
||||
|
||||
elif type == "EndTag":
|
||||
namespace = token["namespace"]
|
||||
name = token["name"]
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
|
||||
elif self.require_matching_tags:
|
||||
start = open_elements.pop()
|
||||
assert start == (namespace, name)
|
||||
|
||||
elif type == "Comment":
|
||||
data = token["data"]
|
||||
assert isinstance(data, text_type)
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
assert isinstance(data, text_type)
|
||||
assert data != ""
|
||||
if type == "SpaceCharacters":
|
||||
assert data.strip(spaceCharacters) == ""
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
assert name is None or isinstance(name, text_type)
|
||||
assert token["publicId"] is None or isinstance(name, text_type)
|
||||
assert token["systemId"] is None or isinstance(name, text_type)
|
||||
|
||||
elif type == "Entity":
|
||||
assert isinstance(token["name"], text_type)
|
||||
|
||||
elif type == "SerializerError":
|
||||
assert isinstance(token["data"], text_type)
|
||||
|
||||
else:
|
||||
assert False, "Unknown token type: %(type)s" % {"type": type}
|
||||
|
||||
yield token
|
207
lib/bleach/_vendor/html5lib/filters/optionaltags.py
Normal file
207
lib/bleach/_vendor/html5lib/filters/optionaltags.py
Normal file
|
@ -0,0 +1,207 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import base
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Removes optional tags from the token stream"""
|
||||
def slider(self):
|
||||
previous1 = previous2 = None
|
||||
for token in self.source:
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, token
|
||||
previous2 = previous1
|
||||
previous1 = token
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, None
|
||||
|
||||
def __iter__(self):
|
||||
for previous, token, next in self.slider():
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if (token["data"] or
|
||||
not self.is_optional_start(token["name"], previous, next)):
|
||||
yield token
|
||||
elif type == "EndTag":
|
||||
if not self.is_optional_end(token["name"], next):
|
||||
yield token
|
||||
else:
|
||||
yield token
|
||||
|
||||
def is_optional_start(self, tagname, previous, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in 'html':
|
||||
# An html element's start tag may be omitted if the first thing
|
||||
# inside the html element is not a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname == 'head':
|
||||
# A head element's start tag may be omitted if the first thing
|
||||
# inside the head element is an element.
|
||||
# XXX: we also omit the start tag if the head element is empty
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return True
|
||||
elif type == "EndTag":
|
||||
return next["name"] == "head"
|
||||
elif tagname == 'body':
|
||||
# A body element's start tag may be omitted if the first thing
|
||||
# inside the body element is not a space character or a comment,
|
||||
# except if the first thing inside the body element is a script
|
||||
# or style element and the node immediately preceding the body
|
||||
# element is a head element whose end tag has been omitted.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we do not look at the preceding event, so we never omit
|
||||
# the body element's start tag if it's followed by a script or
|
||||
# a style element.
|
||||
return next["name"] not in ('script', 'style')
|
||||
else:
|
||||
return True
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's start tag may be omitted if the first thing
|
||||
# inside the colgroup element is a col element, and if the element
|
||||
# is not immediately preceded by another colgroup element whose
|
||||
# end tag has been omitted.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# XXX: we do not look at the preceding event, so instead we never
|
||||
# omit the colgroup element's end tag when it is immediately
|
||||
# followed by another colgroup element. See is_optional_end.
|
||||
return next["name"] == "col"
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tbody':
|
||||
# A tbody element's start tag may be omitted if the first thing
|
||||
# inside the tbody element is a tr element, and if the element is
|
||||
# not immediately preceded by a tbody, thead, or tfoot element
|
||||
# whose end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
# omit the thead and tfoot elements' end tag when they are
|
||||
# immediately followed by a tbody element. See is_optional_end.
|
||||
if previous and previous['type'] == 'EndTag' and \
|
||||
previous['name'] in ('tbody', 'thead', 'tfoot'):
|
||||
return False
|
||||
return next["name"] == 'tr'
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
def is_optional_end(self, tagname, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in ('html', 'head', 'body'):
|
||||
# An html element's end tag may be omitted if the html element
|
||||
# is not immediately followed by a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname in ('li', 'optgroup', 'tr'):
|
||||
# A li element's end tag may be omitted if the li element is
|
||||
# immediately followed by another li element or if there is
|
||||
# no more content in the parent element.
|
||||
# An optgroup element's end tag may be omitted if the optgroup
|
||||
# element is immediately followed by another optgroup element,
|
||||
# or if there is no more content in the parent element.
|
||||
# A tr element's end tag may be omitted if the tr element is
|
||||
# immediately followed by another tr element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] == tagname
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('dt', 'dd'):
|
||||
# A dt element's end tag may be omitted if the dt element is
|
||||
# immediately followed by another dt element or a dd element.
|
||||
# A dd element's end tag may be omitted if the dd element is
|
||||
# immediately followed by another dd element or a dt element,
|
||||
# or if there is no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('dt', 'dd')
|
||||
elif tagname == 'dd':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'p':
|
||||
# A p element's end tag may be omitted if the p element is
|
||||
# immediately followed by an address, article, aside,
|
||||
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
||||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
||||
# nav, ol, p, pre, section, table, or ul, element, or if
|
||||
# there is no more content in the parent element.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return next["name"] in ('address', 'article', 'aside',
|
||||
'blockquote', 'datagrid', 'dialog',
|
||||
'dir', 'div', 'dl', 'fieldset', 'footer',
|
||||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'header', 'hr', 'menu', 'nav', 'ol',
|
||||
'p', 'pre', 'section', 'table', 'ul')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'option':
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if it is immediately followed by an <code>optgroup</code>
|
||||
# element, or if there is no more content in the parent
|
||||
# element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('option', 'optgroup')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('rt', 'rp'):
|
||||
# An rt element's end tag may be omitted if the rt element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
# An rp element's end tag may be omitted if the rp element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('rt', 'rp')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's end tag may be omitted if the colgroup
|
||||
# element is not immediately followed by a space character or
|
||||
# a comment.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we also look for an immediately following colgroup
|
||||
# element. See is_optional_start.
|
||||
return next["name"] != 'colgroup'
|
||||
else:
|
||||
return True
|
||||
elif tagname in ('thead', 'tbody'):
|
||||
# A thead element's end tag may be omitted if the thead element
|
||||
# is immediately followed by a tbody or tfoot element.
|
||||
# A tbody element's end tag may be omitted if the tbody element
|
||||
# is immediately followed by a tbody or tfoot element, or if
|
||||
# there is no more content in the parent element.
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ['tbody', 'tfoot']
|
||||
elif tagname == 'tbody':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tfoot':
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] == 'tbody'
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('td', 'th'):
|
||||
# A td element's end tag may be omitted if the td element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
# A th element's end tag may be omitted if the th element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('td', 'th')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
return False
|
916
lib/bleach/_vendor/html5lib/filters/sanitizer.py
Normal file
916
lib/bleach/_vendor/html5lib/filters/sanitizer.py
Normal file
|
@ -0,0 +1,916 @@
|
|||
"""Deprecated from html5lib 1.1.
|
||||
|
||||
See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
|
||||
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
|
||||
is recommended as a replacement. Please let us know in the aforementioned issue
|
||||
if Bleach is unsuitable for your needs.
|
||||
|
||||
"""
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
from six.moves import urllib_parse as urlparse
|
||||
|
||||
from . import base
|
||||
from ..constants import namespaces, prefixes
|
||||
|
||||
__all__ = ["Filter"]
|
||||
|
||||
|
||||
_deprecation_msg = (
|
||||
"html5lib's sanitizer is deprecated; see " +
|
||||
"https://github.com/html5lib/html5lib-python/issues/443 and please let " +
|
||||
"us know if Bleach is unsuitable for your needs"
|
||||
)
|
||||
|
||||
warnings.warn(_deprecation_msg, DeprecationWarning)
|
||||
|
||||
allowed_elements = frozenset((
|
||||
(namespaces['html'], 'a'),
|
||||
(namespaces['html'], 'abbr'),
|
||||
(namespaces['html'], 'acronym'),
|
||||
(namespaces['html'], 'address'),
|
||||
(namespaces['html'], 'area'),
|
||||
(namespaces['html'], 'article'),
|
||||
(namespaces['html'], 'aside'),
|
||||
(namespaces['html'], 'audio'),
|
||||
(namespaces['html'], 'b'),
|
||||
(namespaces['html'], 'big'),
|
||||
(namespaces['html'], 'blockquote'),
|
||||
(namespaces['html'], 'br'),
|
||||
(namespaces['html'], 'button'),
|
||||
(namespaces['html'], 'canvas'),
|
||||
(namespaces['html'], 'caption'),
|
||||
(namespaces['html'], 'center'),
|
||||
(namespaces['html'], 'cite'),
|
||||
(namespaces['html'], 'code'),
|
||||
(namespaces['html'], 'col'),
|
||||
(namespaces['html'], 'colgroup'),
|
||||
(namespaces['html'], 'command'),
|
||||
(namespaces['html'], 'datagrid'),
|
||||
(namespaces['html'], 'datalist'),
|
||||
(namespaces['html'], 'dd'),
|
||||
(namespaces['html'], 'del'),
|
||||
(namespaces['html'], 'details'),
|
||||
(namespaces['html'], 'dfn'),
|
||||
(namespaces['html'], 'dialog'),
|
||||
(namespaces['html'], 'dir'),
|
||||
(namespaces['html'], 'div'),
|
||||
(namespaces['html'], 'dl'),
|
||||
(namespaces['html'], 'dt'),
|
||||
(namespaces['html'], 'em'),
|
||||
(namespaces['html'], 'event-source'),
|
||||
(namespaces['html'], 'fieldset'),
|
||||
(namespaces['html'], 'figcaption'),
|
||||
(namespaces['html'], 'figure'),
|
||||
(namespaces['html'], 'footer'),
|
||||
(namespaces['html'], 'font'),
|
||||
(namespaces['html'], 'form'),
|
||||
(namespaces['html'], 'header'),
|
||||
(namespaces['html'], 'h1'),
|
||||
(namespaces['html'], 'h2'),
|
||||
(namespaces['html'], 'h3'),
|
||||
(namespaces['html'], 'h4'),
|
||||
(namespaces['html'], 'h5'),
|
||||
(namespaces['html'], 'h6'),
|
||||
(namespaces['html'], 'hr'),
|
||||
(namespaces['html'], 'i'),
|
||||
(namespaces['html'], 'img'),
|
||||
(namespaces['html'], 'input'),
|
||||
(namespaces['html'], 'ins'),
|
||||
(namespaces['html'], 'keygen'),
|
||||
(namespaces['html'], 'kbd'),
|
||||
(namespaces['html'], 'label'),
|
||||
(namespaces['html'], 'legend'),
|
||||
(namespaces['html'], 'li'),
|
||||
(namespaces['html'], 'm'),
|
||||
(namespaces['html'], 'map'),
|
||||
(namespaces['html'], 'menu'),
|
||||
(namespaces['html'], 'meter'),
|
||||
(namespaces['html'], 'multicol'),
|
||||
(namespaces['html'], 'nav'),
|
||||
(namespaces['html'], 'nextid'),
|
||||
(namespaces['html'], 'ol'),
|
||||
(namespaces['html'], 'output'),
|
||||
(namespaces['html'], 'optgroup'),
|
||||
(namespaces['html'], 'option'),
|
||||
(namespaces['html'], 'p'),
|
||||
(namespaces['html'], 'pre'),
|
||||
(namespaces['html'], 'progress'),
|
||||
(namespaces['html'], 'q'),
|
||||
(namespaces['html'], 's'),
|
||||
(namespaces['html'], 'samp'),
|
||||
(namespaces['html'], 'section'),
|
||||
(namespaces['html'], 'select'),
|
||||
(namespaces['html'], 'small'),
|
||||
(namespaces['html'], 'sound'),
|
||||
(namespaces['html'], 'source'),
|
||||
(namespaces['html'], 'spacer'),
|
||||
(namespaces['html'], 'span'),
|
||||
(namespaces['html'], 'strike'),
|
||||
(namespaces['html'], 'strong'),
|
||||
(namespaces['html'], 'sub'),
|
||||
(namespaces['html'], 'sup'),
|
||||
(namespaces['html'], 'table'),
|
||||
(namespaces['html'], 'tbody'),
|
||||
(namespaces['html'], 'td'),
|
||||
(namespaces['html'], 'textarea'),
|
||||
(namespaces['html'], 'time'),
|
||||
(namespaces['html'], 'tfoot'),
|
||||
(namespaces['html'], 'th'),
|
||||
(namespaces['html'], 'thead'),
|
||||
(namespaces['html'], 'tr'),
|
||||
(namespaces['html'], 'tt'),
|
||||
(namespaces['html'], 'u'),
|
||||
(namespaces['html'], 'ul'),
|
||||
(namespaces['html'], 'var'),
|
||||
(namespaces['html'], 'video'),
|
||||
(namespaces['mathml'], 'maction'),
|
||||
(namespaces['mathml'], 'math'),
|
||||
(namespaces['mathml'], 'merror'),
|
||||
(namespaces['mathml'], 'mfrac'),
|
||||
(namespaces['mathml'], 'mi'),
|
||||
(namespaces['mathml'], 'mmultiscripts'),
|
||||
(namespaces['mathml'], 'mn'),
|
||||
(namespaces['mathml'], 'mo'),
|
||||
(namespaces['mathml'], 'mover'),
|
||||
(namespaces['mathml'], 'mpadded'),
|
||||
(namespaces['mathml'], 'mphantom'),
|
||||
(namespaces['mathml'], 'mprescripts'),
|
||||
(namespaces['mathml'], 'mroot'),
|
||||
(namespaces['mathml'], 'mrow'),
|
||||
(namespaces['mathml'], 'mspace'),
|
||||
(namespaces['mathml'], 'msqrt'),
|
||||
(namespaces['mathml'], 'mstyle'),
|
||||
(namespaces['mathml'], 'msub'),
|
||||
(namespaces['mathml'], 'msubsup'),
|
||||
(namespaces['mathml'], 'msup'),
|
||||
(namespaces['mathml'], 'mtable'),
|
||||
(namespaces['mathml'], 'mtd'),
|
||||
(namespaces['mathml'], 'mtext'),
|
||||
(namespaces['mathml'], 'mtr'),
|
||||
(namespaces['mathml'], 'munder'),
|
||||
(namespaces['mathml'], 'munderover'),
|
||||
(namespaces['mathml'], 'none'),
|
||||
(namespaces['svg'], 'a'),
|
||||
(namespaces['svg'], 'animate'),
|
||||
(namespaces['svg'], 'animateColor'),
|
||||
(namespaces['svg'], 'animateMotion'),
|
||||
(namespaces['svg'], 'animateTransform'),
|
||||
(namespaces['svg'], 'clipPath'),
|
||||
(namespaces['svg'], 'circle'),
|
||||
(namespaces['svg'], 'defs'),
|
||||
(namespaces['svg'], 'desc'),
|
||||
(namespaces['svg'], 'ellipse'),
|
||||
(namespaces['svg'], 'font-face'),
|
||||
(namespaces['svg'], 'font-face-name'),
|
||||
(namespaces['svg'], 'font-face-src'),
|
||||
(namespaces['svg'], 'g'),
|
||||
(namespaces['svg'], 'glyph'),
|
||||
(namespaces['svg'], 'hkern'),
|
||||
(namespaces['svg'], 'linearGradient'),
|
||||
(namespaces['svg'], 'line'),
|
||||
(namespaces['svg'], 'marker'),
|
||||
(namespaces['svg'], 'metadata'),
|
||||
(namespaces['svg'], 'missing-glyph'),
|
||||
(namespaces['svg'], 'mpath'),
|
||||
(namespaces['svg'], 'path'),
|
||||
(namespaces['svg'], 'polygon'),
|
||||
(namespaces['svg'], 'polyline'),
|
||||
(namespaces['svg'], 'radialGradient'),
|
||||
(namespaces['svg'], 'rect'),
|
||||
(namespaces['svg'], 'set'),
|
||||
(namespaces['svg'], 'stop'),
|
||||
(namespaces['svg'], 'svg'),
|
||||
(namespaces['svg'], 'switch'),
|
||||
(namespaces['svg'], 'text'),
|
||||
(namespaces['svg'], 'title'),
|
||||
(namespaces['svg'], 'tspan'),
|
||||
(namespaces['svg'], 'use'),
|
||||
))
|
||||
|
||||
allowed_attributes = frozenset((
|
||||
# HTML attributes
|
||||
(None, 'abbr'),
|
||||
(None, 'accept'),
|
||||
(None, 'accept-charset'),
|
||||
(None, 'accesskey'),
|
||||
(None, 'action'),
|
||||
(None, 'align'),
|
||||
(None, 'alt'),
|
||||
(None, 'autocomplete'),
|
||||
(None, 'autofocus'),
|
||||
(None, 'axis'),
|
||||
(None, 'background'),
|
||||
(None, 'balance'),
|
||||
(None, 'bgcolor'),
|
||||
(None, 'bgproperties'),
|
||||
(None, 'border'),
|
||||
(None, 'bordercolor'),
|
||||
(None, 'bordercolordark'),
|
||||
(None, 'bordercolorlight'),
|
||||
(None, 'bottompadding'),
|
||||
(None, 'cellpadding'),
|
||||
(None, 'cellspacing'),
|
||||
(None, 'ch'),
|
||||
(None, 'challenge'),
|
||||
(None, 'char'),
|
||||
(None, 'charoff'),
|
||||
(None, 'choff'),
|
||||
(None, 'charset'),
|
||||
(None, 'checked'),
|
||||
(None, 'cite'),
|
||||
(None, 'class'),
|
||||
(None, 'clear'),
|
||||
(None, 'color'),
|
||||
(None, 'cols'),
|
||||
(None, 'colspan'),
|
||||
(None, 'compact'),
|
||||
(None, 'contenteditable'),
|
||||
(None, 'controls'),
|
||||
(None, 'coords'),
|
||||
(None, 'data'),
|
||||
(None, 'datafld'),
|
||||
(None, 'datapagesize'),
|
||||
(None, 'datasrc'),
|
||||
(None, 'datetime'),
|
||||
(None, 'default'),
|
||||
(None, 'delay'),
|
||||
(None, 'dir'),
|
||||
(None, 'disabled'),
|
||||
(None, 'draggable'),
|
||||
(None, 'dynsrc'),
|
||||
(None, 'enctype'),
|
||||
(None, 'end'),
|
||||
(None, 'face'),
|
||||
(None, 'for'),
|
||||
(None, 'form'),
|
||||
(None, 'frame'),
|
||||
(None, 'galleryimg'),
|
||||
(None, 'gutter'),
|
||||
(None, 'headers'),
|
||||
(None, 'height'),
|
||||
(None, 'hidefocus'),
|
||||
(None, 'hidden'),
|
||||
(None, 'high'),
|
||||
(None, 'href'),
|
||||
(None, 'hreflang'),
|
||||
(None, 'hspace'),
|
||||
(None, 'icon'),
|
||||
(None, 'id'),
|
||||
(None, 'inputmode'),
|
||||
(None, 'ismap'),
|
||||
(None, 'keytype'),
|
||||
(None, 'label'),
|
||||
(None, 'leftspacing'),
|
||||
(None, 'lang'),
|
||||
(None, 'list'),
|
||||
(None, 'longdesc'),
|
||||
(None, 'loop'),
|
||||
(None, 'loopcount'),
|
||||
(None, 'loopend'),
|
||||
(None, 'loopstart'),
|
||||
(None, 'low'),
|
||||
(None, 'lowsrc'),
|
||||
(None, 'max'),
|
||||
(None, 'maxlength'),
|
||||
(None, 'media'),
|
||||
(None, 'method'),
|
||||
(None, 'min'),
|
||||
(None, 'multiple'),
|
||||
(None, 'name'),
|
||||
(None, 'nohref'),
|
||||
(None, 'noshade'),
|
||||
(None, 'nowrap'),
|
||||
(None, 'open'),
|
||||
(None, 'optimum'),
|
||||
(None, 'pattern'),
|
||||
(None, 'ping'),
|
||||
(None, 'point-size'),
|
||||
(None, 'poster'),
|
||||
(None, 'pqg'),
|
||||
(None, 'preload'),
|
||||
(None, 'prompt'),
|
||||
(None, 'radiogroup'),
|
||||
(None, 'readonly'),
|
||||
(None, 'rel'),
|
||||
(None, 'repeat-max'),
|
||||
(None, 'repeat-min'),
|
||||
(None, 'replace'),
|
||||
(None, 'required'),
|
||||
(None, 'rev'),
|
||||
(None, 'rightspacing'),
|
||||
(None, 'rows'),
|
||||
(None, 'rowspan'),
|
||||
(None, 'rules'),
|
||||
(None, 'scope'),
|
||||
(None, 'selected'),
|
||||
(None, 'shape'),
|
||||
(None, 'size'),
|
||||
(None, 'span'),
|
||||
(None, 'src'),
|
||||
(None, 'start'),
|
||||
(None, 'step'),
|
||||
(None, 'style'),
|
||||
(None, 'summary'),
|
||||
(None, 'suppress'),
|
||||
(None, 'tabindex'),
|
||||
(None, 'target'),
|
||||
(None, 'template'),
|
||||
(None, 'title'),
|
||||
(None, 'toppadding'),
|
||||
(None, 'type'),
|
||||
(None, 'unselectable'),
|
||||
(None, 'usemap'),
|
||||
(None, 'urn'),
|
||||
(None, 'valign'),
|
||||
(None, 'value'),
|
||||
(None, 'variable'),
|
||||
(None, 'volume'),
|
||||
(None, 'vspace'),
|
||||
(None, 'vrml'),
|
||||
(None, 'width'),
|
||||
(None, 'wrap'),
|
||||
(namespaces['xml'], 'lang'),
|
||||
# MathML attributes
|
||||
(None, 'actiontype'),
|
||||
(None, 'align'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnlines'),
|
||||
(None, 'columnspacing'),
|
||||
(None, 'columnspan'),
|
||||
(None, 'depth'),
|
||||
(None, 'display'),
|
||||
(None, 'displaystyle'),
|
||||
(None, 'equalcolumns'),
|
||||
(None, 'equalrows'),
|
||||
(None, 'fence'),
|
||||
(None, 'fontstyle'),
|
||||
(None, 'fontweight'),
|
||||
(None, 'frame'),
|
||||
(None, 'height'),
|
||||
(None, 'linethickness'),
|
||||
(None, 'lspace'),
|
||||
(None, 'mathbackground'),
|
||||
(None, 'mathcolor'),
|
||||
(None, 'mathvariant'),
|
||||
(None, 'mathvariant'),
|
||||
(None, 'maxsize'),
|
||||
(None, 'minsize'),
|
||||
(None, 'other'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowlines'),
|
||||
(None, 'rowspacing'),
|
||||
(None, 'rowspan'),
|
||||
(None, 'rspace'),
|
||||
(None, 'scriptlevel'),
|
||||
(None, 'selection'),
|
||||
(None, 'separator'),
|
||||
(None, 'stretchy'),
|
||||
(None, 'width'),
|
||||
(None, 'width'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xlink'], 'show'),
|
||||
(namespaces['xlink'], 'type'),
|
||||
# SVG attributes
|
||||
(None, 'accent-height'),
|
||||
(None, 'accumulate'),
|
||||
(None, 'additive'),
|
||||
(None, 'alphabetic'),
|
||||
(None, 'arabic-form'),
|
||||
(None, 'ascent'),
|
||||
(None, 'attributeName'),
|
||||
(None, 'attributeType'),
|
||||
(None, 'baseProfile'),
|
||||
(None, 'bbox'),
|
||||
(None, 'begin'),
|
||||
(None, 'by'),
|
||||
(None, 'calcMode'),
|
||||
(None, 'cap-height'),
|
||||
(None, 'class'),
|
||||
(None, 'clip-path'),
|
||||
(None, 'color'),
|
||||
(None, 'color-rendering'),
|
||||
(None, 'content'),
|
||||
(None, 'cx'),
|
||||
(None, 'cy'),
|
||||
(None, 'd'),
|
||||
(None, 'dx'),
|
||||
(None, 'dy'),
|
||||
(None, 'descent'),
|
||||
(None, 'display'),
|
||||
(None, 'dur'),
|
||||
(None, 'end'),
|
||||
(None, 'fill'),
|
||||
(None, 'fill-opacity'),
|
||||
(None, 'fill-rule'),
|
||||
(None, 'font-family'),
|
||||
(None, 'font-size'),
|
||||
(None, 'font-stretch'),
|
||||
(None, 'font-style'),
|
||||
(None, 'font-variant'),
|
||||
(None, 'font-weight'),
|
||||
(None, 'from'),
|
||||
(None, 'fx'),
|
||||
(None, 'fy'),
|
||||
(None, 'g1'),
|
||||
(None, 'g2'),
|
||||
(None, 'glyph-name'),
|
||||
(None, 'gradientUnits'),
|
||||
(None, 'hanging'),
|
||||
(None, 'height'),
|
||||
(None, 'horiz-adv-x'),
|
||||
(None, 'horiz-origin-x'),
|
||||
(None, 'id'),
|
||||
(None, 'ideographic'),
|
||||
(None, 'k'),
|
||||
(None, 'keyPoints'),
|
||||
(None, 'keySplines'),
|
||||
(None, 'keyTimes'),
|
||||
(None, 'lang'),
|
||||
(None, 'marker-end'),
|
||||
(None, 'marker-mid'),
|
||||
(None, 'marker-start'),
|
||||
(None, 'markerHeight'),
|
||||
(None, 'markerUnits'),
|
||||
(None, 'markerWidth'),
|
||||
(None, 'mathematical'),
|
||||
(None, 'max'),
|
||||
(None, 'min'),
|
||||
(None, 'name'),
|
||||
(None, 'offset'),
|
||||
(None, 'opacity'),
|
||||
(None, 'orient'),
|
||||
(None, 'origin'),
|
||||
(None, 'overline-position'),
|
||||
(None, 'overline-thickness'),
|
||||
(None, 'panose-1'),
|
||||
(None, 'path'),
|
||||
(None, 'pathLength'),
|
||||
(None, 'points'),
|
||||
(None, 'preserveAspectRatio'),
|
||||
(None, 'r'),
|
||||
(None, 'refX'),
|
||||
(None, 'refY'),
|
||||
(None, 'repeatCount'),
|
||||
(None, 'repeatDur'),
|
||||
(None, 'requiredExtensions'),
|
||||
(None, 'requiredFeatures'),
|
||||
(None, 'restart'),
|
||||
(None, 'rotate'),
|
||||
(None, 'rx'),
|
||||
(None, 'ry'),
|
||||
(None, 'slope'),
|
||||
(None, 'stemh'),
|
||||
(None, 'stemv'),
|
||||
(None, 'stop-color'),
|
||||
(None, 'stop-opacity'),
|
||||
(None, 'strikethrough-position'),
|
||||
(None, 'strikethrough-thickness'),
|
||||
(None, 'stroke'),
|
||||
(None, 'stroke-dasharray'),
|
||||
(None, 'stroke-dashoffset'),
|
||||
(None, 'stroke-linecap'),
|
||||
(None, 'stroke-linejoin'),
|
||||
(None, 'stroke-miterlimit'),
|
||||
(None, 'stroke-opacity'),
|
||||
(None, 'stroke-width'),
|
||||
(None, 'systemLanguage'),
|
||||
(None, 'target'),
|
||||
(None, 'text-anchor'),
|
||||
(None, 'to'),
|
||||
(None, 'transform'),
|
||||
(None, 'type'),
|
||||
(None, 'u1'),
|
||||
(None, 'u2'),
|
||||
(None, 'underline-position'),
|
||||
(None, 'underline-thickness'),
|
||||
(None, 'unicode'),
|
||||
(None, 'unicode-range'),
|
||||
(None, 'units-per-em'),
|
||||
(None, 'values'),
|
||||
(None, 'version'),
|
||||
(None, 'viewBox'),
|
||||
(None, 'visibility'),
|
||||
(None, 'width'),
|
||||
(None, 'widths'),
|
||||
(None, 'x'),
|
||||
(None, 'x-height'),
|
||||
(None, 'x1'),
|
||||
(None, 'x2'),
|
||||
(namespaces['xlink'], 'actuate'),
|
||||
(namespaces['xlink'], 'arcrole'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xlink'], 'role'),
|
||||
(namespaces['xlink'], 'show'),
|
||||
(namespaces['xlink'], 'title'),
|
||||
(namespaces['xlink'], 'type'),
|
||||
(namespaces['xml'], 'base'),
|
||||
(namespaces['xml'], 'lang'),
|
||||
(namespaces['xml'], 'space'),
|
||||
(None, 'y'),
|
||||
(None, 'y1'),
|
||||
(None, 'y2'),
|
||||
(None, 'zoomAndPan'),
|
||||
))
|
||||
|
||||
attr_val_is_uri = frozenset((
|
||||
(None, 'href'),
|
||||
(None, 'src'),
|
||||
(None, 'cite'),
|
||||
(None, 'action'),
|
||||
(None, 'longdesc'),
|
||||
(None, 'poster'),
|
||||
(None, 'background'),
|
||||
(None, 'datasrc'),
|
||||
(None, 'dynsrc'),
|
||||
(None, 'lowsrc'),
|
||||
(None, 'ping'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xml'], 'base'),
|
||||
))
|
||||
|
||||
svg_attr_val_allows_ref = frozenset((
|
||||
(None, 'clip-path'),
|
||||
(None, 'color-profile'),
|
||||
(None, 'cursor'),
|
||||
(None, 'fill'),
|
||||
(None, 'filter'),
|
||||
(None, 'marker'),
|
||||
(None, 'marker-start'),
|
||||
(None, 'marker-mid'),
|
||||
(None, 'marker-end'),
|
||||
(None, 'mask'),
|
||||
(None, 'stroke'),
|
||||
))
|
||||
|
||||
svg_allow_local_href = frozenset((
|
||||
(None, 'altGlyph'),
|
||||
(None, 'animate'),
|
||||
(None, 'animateColor'),
|
||||
(None, 'animateMotion'),
|
||||
(None, 'animateTransform'),
|
||||
(None, 'cursor'),
|
||||
(None, 'feImage'),
|
||||
(None, 'filter'),
|
||||
(None, 'linearGradient'),
|
||||
(None, 'pattern'),
|
||||
(None, 'radialGradient'),
|
||||
(None, 'textpath'),
|
||||
(None, 'tref'),
|
||||
(None, 'set'),
|
||||
(None, 'use')
|
||||
))
|
||||
|
||||
allowed_css_properties = frozenset((
|
||||
'azimuth',
|
||||
'background-color',
|
||||
'border-bottom-color',
|
||||
'border-collapse',
|
||||
'border-color',
|
||||
'border-left-color',
|
||||
'border-right-color',
|
||||
'border-top-color',
|
||||
'clear',
|
||||
'color',
|
||||
'cursor',
|
||||
'direction',
|
||||
'display',
|
||||
'elevation',
|
||||
'float',
|
||||
'font',
|
||||
'font-family',
|
||||
'font-size',
|
||||
'font-style',
|
||||
'font-variant',
|
||||
'font-weight',
|
||||
'height',
|
||||
'letter-spacing',
|
||||
'line-height',
|
||||
'overflow',
|
||||
'pause',
|
||||
'pause-after',
|
||||
'pause-before',
|
||||
'pitch',
|
||||
'pitch-range',
|
||||
'richness',
|
||||
'speak',
|
||||
'speak-header',
|
||||
'speak-numeral',
|
||||
'speak-punctuation',
|
||||
'speech-rate',
|
||||
'stress',
|
||||
'text-align',
|
||||
'text-decoration',
|
||||
'text-indent',
|
||||
'unicode-bidi',
|
||||
'vertical-align',
|
||||
'voice-family',
|
||||
'volume',
|
||||
'white-space',
|
||||
'width',
|
||||
))
|
||||
|
||||
allowed_css_keywords = frozenset((
|
||||
'auto',
|
||||
'aqua',
|
||||
'black',
|
||||
'block',
|
||||
'blue',
|
||||
'bold',
|
||||
'both',
|
||||
'bottom',
|
||||
'brown',
|
||||
'center',
|
||||
'collapse',
|
||||
'dashed',
|
||||
'dotted',
|
||||
'fuchsia',
|
||||
'gray',
|
||||
'green',
|
||||
'!important',
|
||||
'italic',
|
||||
'left',
|
||||
'lime',
|
||||
'maroon',
|
||||
'medium',
|
||||
'none',
|
||||
'navy',
|
||||
'normal',
|
||||
'nowrap',
|
||||
'olive',
|
||||
'pointer',
|
||||
'purple',
|
||||
'red',
|
||||
'right',
|
||||
'solid',
|
||||
'silver',
|
||||
'teal',
|
||||
'top',
|
||||
'transparent',
|
||||
'underline',
|
||||
'white',
|
||||
'yellow',
|
||||
))
|
||||
|
||||
allowed_svg_properties = frozenset((
|
||||
'fill',
|
||||
'fill-opacity',
|
||||
'fill-rule',
|
||||
'stroke',
|
||||
'stroke-width',
|
||||
'stroke-linecap',
|
||||
'stroke-linejoin',
|
||||
'stroke-opacity',
|
||||
))
|
||||
|
||||
allowed_protocols = frozenset((
|
||||
'ed2k',
|
||||
'ftp',
|
||||
'http',
|
||||
'https',
|
||||
'irc',
|
||||
'mailto',
|
||||
'news',
|
||||
'gopher',
|
||||
'nntp',
|
||||
'telnet',
|
||||
'webcal',
|
||||
'xmpp',
|
||||
'callto',
|
||||
'feed',
|
||||
'urn',
|
||||
'aim',
|
||||
'rsync',
|
||||
'tag',
|
||||
'ssh',
|
||||
'sftp',
|
||||
'rtsp',
|
||||
'afs',
|
||||
'data',
|
||||
))
|
||||
|
||||
allowed_content_types = frozenset((
|
||||
'image/png',
|
||||
'image/jpeg',
|
||||
'image/gif',
|
||||
'image/webp',
|
||||
'image/bmp',
|
||||
'text/plain',
|
||||
))
|
||||
|
||||
|
||||
data_content_type = re.compile(r'''
|
||||
^
|
||||
# Match a content type <application>/<type>
|
||||
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||
# Match any character set and encoding
|
||||
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||
# Assume the rest is data
|
||||
,.*
|
||||
$
|
||||
''',
|
||||
re.VERBOSE)
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
|
||||
def __init__(self,
|
||||
source,
|
||||
allowed_elements=allowed_elements,
|
||||
allowed_attributes=allowed_attributes,
|
||||
allowed_css_properties=allowed_css_properties,
|
||||
allowed_css_keywords=allowed_css_keywords,
|
||||
allowed_svg_properties=allowed_svg_properties,
|
||||
allowed_protocols=allowed_protocols,
|
||||
allowed_content_types=allowed_content_types,
|
||||
attr_val_is_uri=attr_val_is_uri,
|
||||
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
|
||||
svg_allow_local_href=svg_allow_local_href):
|
||||
"""Creates a Filter
|
||||
|
||||
:arg allowed_elements: set of elements to allow--everything else will
|
||||
be escaped
|
||||
|
||||
:arg allowed_attributes: set of attributes to allow in
|
||||
elements--everything else will be stripped
|
||||
|
||||
:arg allowed_css_properties: set of CSS properties to allow--everything
|
||||
else will be stripped
|
||||
|
||||
:arg allowed_css_keywords: set of CSS keywords to allow--everything
|
||||
else will be stripped
|
||||
|
||||
:arg allowed_svg_properties: set of SVG properties to allow--everything
|
||||
else will be removed
|
||||
|
||||
:arg allowed_protocols: set of allowed protocols for URIs
|
||||
|
||||
:arg allowed_content_types: set of allowed content types for ``data`` URIs.
|
||||
|
||||
:arg attr_val_is_uri: set of attributes that have URI values--values
|
||||
that have a scheme not listed in ``allowed_protocols`` are removed
|
||||
|
||||
:arg svg_attr_val_allows_ref: set of SVG attributes that can have
|
||||
references
|
||||
|
||||
:arg svg_allow_local_href: set of SVG elements that can have local
|
||||
hrefs--these are removed
|
||||
|
||||
"""
|
||||
super(Filter, self).__init__(source)
|
||||
|
||||
warnings.warn(_deprecation_msg, DeprecationWarning)
|
||||
|
||||
self.allowed_elements = allowed_elements
|
||||
self.allowed_attributes = allowed_attributes
|
||||
self.allowed_css_properties = allowed_css_properties
|
||||
self.allowed_css_keywords = allowed_css_keywords
|
||||
self.allowed_svg_properties = allowed_svg_properties
|
||||
self.allowed_protocols = allowed_protocols
|
||||
self.allowed_content_types = allowed_content_types
|
||||
self.attr_val_is_uri = attr_val_is_uri
|
||||
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
|
||||
self.svg_allow_local_href = svg_allow_local_href
|
||||
|
||||
def __iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
|
||||
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
|
||||
# ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
|
||||
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
|
||||
# allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
|
||||
|
||||
# accommodate filters which use token_type differently
|
||||
token_type = token["type"]
|
||||
if token_type in ("StartTag", "EndTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
namespace = token["namespace"]
|
||||
if ((namespace, name) in self.allowed_elements or
|
||||
(namespace is None and
|
||||
(namespaces["html"], name) in self.allowed_elements)):
|
||||
return self.allowed_token(token)
|
||||
else:
|
||||
return self.disallowed_token(token)
|
||||
elif token_type == "Comment":
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
|
||||
def allowed_token(self, token):
|
||||
if "data" in token:
|
||||
attrs = token["data"]
|
||||
attr_names = set(attrs.keys())
|
||||
|
||||
# Remove forbidden attributes
|
||||
for to_remove in (attr_names - self.allowed_attributes):
|
||||
del token["data"][to_remove]
|
||||
attr_names.remove(to_remove)
|
||||
|
||||
# Remove attributes with disallowed URL values
|
||||
for attr in (attr_names & self.attr_val_is_uri):
|
||||
assert attr in attrs
|
||||
# I don't have a clue where this regexp comes from or why it matches those
|
||||
# characters, nor why we call unescape. I just know it's always been here.
|
||||
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
|
||||
# this will do is remove *more* than it otherwise would.
|
||||
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
# remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||
try:
|
||||
uri = urlparse.urlparse(val_unescaped)
|
||||
except ValueError:
|
||||
uri = None
|
||||
del attrs[attr]
|
||||
if uri and uri.scheme:
|
||||
if uri.scheme not in self.allowed_protocols:
|
||||
del attrs[attr]
|
||||
if uri.scheme == 'data':
|
||||
m = data_content_type.match(uri.path)
|
||||
if not m:
|
||||
del attrs[attr]
|
||||
elif m.group('content_type') not in self.allowed_content_types:
|
||||
del attrs[attr]
|
||||
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token["name"] in self.svg_allow_local_href and
|
||||
(namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
|
||||
attrs[(namespaces['xlink'], 'href')])):
|
||||
del attrs[(namespaces['xlink'], 'href')]
|
||||
if (None, 'style') in attrs:
|
||||
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
|
||||
token["data"] = attrs
|
||||
return token
|
||||
|
||||
def disallowed_token(self, token):
|
||||
token_type = token["type"]
|
||||
if token_type == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
assert token_type in ("StartTag", "EmptyTag")
|
||||
attrs = []
|
||||
for (ns, name), v in token["data"].items():
|
||||
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
|
||||
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token.get("selfClosing"):
|
||||
token["data"] = token["data"][:-1] + "/>"
|
||||
|
||||
token["type"] = "Characters"
|
||||
|
||||
del token["name"]
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||
|
||||
# gauntlet
|
||||
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
||||
return ''
|
||||
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||
return ''
|
||||
|
||||
clean = []
|
||||
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
|
||||
if not value:
|
||||
continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if keyword not in self.allowed_css_keywords and \
|
||||
not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
|
||||
break
|
||||
else:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
|
38
lib/bleach/_vendor/html5lib/filters/whitespace.py
Normal file
38
lib/bleach/_vendor/html5lib/filters/whitespace.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from . import base
|
||||
from ..constants import rcdataElements, spaceCharacters
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Collapses whitespace except in pre, textarea, and script elements"""
|
||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||
|
||||
def __iter__(self):
|
||||
preserve = 0
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag" \
|
||||
and (preserve or token["name"] in self.spacePreserveElements):
|
||||
preserve += 1
|
||||
|
||||
elif type == "EndTag" and preserve:
|
||||
preserve -= 1
|
||||
|
||||
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
||||
# Test on token["data"] above to not introduce spaces where there were not
|
||||
token["data"] = " "
|
||||
|
||||
elif not preserve and type == "Characters":
|
||||
token["data"] = collapse_spaces(token["data"])
|
||||
|
||||
yield token
|
||||
|
||||
|
||||
def collapse_spaces(text):
|
||||
return SPACES_REGEX.sub(' ', text)
|
2795
lib/bleach/_vendor/html5lib/html5parser.py
Normal file
2795
lib/bleach/_vendor/html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
Load diff
409
lib/bleach/_vendor/html5lib/serializer.py
Normal file
409
lib/bleach/_vendor/html5lib/serializer.py
Normal file
|
@ -0,0 +1,409 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
import re
|
||||
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
|
||||
from .constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from .constants import rcdataElements, entities, xmlEntities
|
||||
from . import treewalkers, _utils
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
|
||||
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
|
||||
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
|
||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
|
||||
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
|
||||
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
|
||||
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
|
||||
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
|
||||
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
|
||||
"\u3000]")
|
||||
|
||||
|
||||
_encode_entity_map = {}
|
||||
_is_ucs4 = len("\U0010FFFF") == 1
|
||||
for k, v in list(entities.items()):
|
||||
# skip multi-character entities
|
||||
if ((_is_ucs4 and len(v) > 1) or
|
||||
(not _is_ucs4 and len(v) > 2)):
|
||||
continue
|
||||
if v != "&":
|
||||
if len(v) == 2:
|
||||
v = _utils.surrogatePairToCodepoint(v)
|
||||
else:
|
||||
v = ord(v)
|
||||
if v not in _encode_entity_map or k.islower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
_encode_entity_map[v] = k
|
||||
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
codepoints = []
|
||||
skip = False
|
||||
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||
if skip:
|
||||
skip = False
|
||||
continue
|
||||
index = i + exc.start
|
||||
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
|
||||
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
|
||||
skip = True
|
||||
else:
|
||||
codepoint = ord(c)
|
||||
codepoints.append(codepoint)
|
||||
for cp in codepoints:
|
||||
e = _encode_entity_map.get(cp)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append("&#x%s;" % (hex(cp)[2:]))
|
||||
return ("".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
|
||||
|
||||
register_error("htmlentityreplace", htmlentityreplace_errors)
|
||||
|
||||
|
||||
def serialize(input, tree="etree", encoding=None, **serializer_opts):
|
||||
"""Serializes the input token stream using the specified treewalker
|
||||
|
||||
:arg input: the token stream to serialize
|
||||
|
||||
:arg tree: the treewalker to use
|
||||
|
||||
:arg encoding: the encoding to use
|
||||
|
||||
:arg serializer_opts: any options to pass to the
|
||||
:py:class:`html5lib.serializer.HTMLSerializer` that gets created
|
||||
|
||||
:returns: the tree serialized as a string
|
||||
|
||||
Example:
|
||||
|
||||
>>> from html5lib.html5parser import parse
|
||||
>>> from html5lib.serializer import serialize
|
||||
>>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
|
||||
>>> serialize(token_stream, omit_optional_tags=False)
|
||||
'<html><head></head><body><p>Hi!</p></body></html>'
|
||||
|
||||
"""
|
||||
# XXX: Should we cache this?
|
||||
walker = treewalkers.getTreeWalker(tree)
|
||||
s = HTMLSerializer(**serializer_opts)
|
||||
return s.render(walker(input), encoding)
|
||||
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
# attribute quoting options
|
||||
quote_attr_values = "legacy" # be secure by default
|
||||
quote_char = '"'
|
||||
use_best_quote_char = True
|
||||
|
||||
# tag syntax options
|
||||
omit_optional_tags = True
|
||||
minimize_boolean_attributes = True
|
||||
use_trailing_solidus = False
|
||||
space_before_trailing_solidus = True
|
||||
|
||||
# escaping options
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
resolve_entities = True
|
||||
|
||||
# miscellaneous options
|
||||
alphabetical_attributes = False
|
||||
inject_meta_charset = True
|
||||
strip_whitespace = False
|
||||
sanitize = False
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"omit_optional_tags", "minimize_boolean_attributes",
|
||||
"use_trailing_solidus", "space_before_trailing_solidus",
|
||||
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
|
||||
"alphabetical_attributes", "inject_meta_charset",
|
||||
"strip_whitespace", "sanitize")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""Initialize HTMLSerializer
|
||||
|
||||
:arg inject_meta_charset: Whether or not to inject the meta charset.
|
||||
|
||||
Defaults to ``True``.
|
||||
|
||||
:arg quote_attr_values: Whether to quote attribute values that don't
|
||||
require quoting per legacy browser behavior (``"legacy"``), when
|
||||
required by the standard (``"spec"``), or always (``"always"``).
|
||||
|
||||
Defaults to ``"legacy"``.
|
||||
|
||||
:arg quote_char: Use given quote character for attribute quoting.
|
||||
|
||||
Defaults to ``"`` which will use double quotes unless attribute
|
||||
value contains a double quote, in which case single quotes are
|
||||
used.
|
||||
|
||||
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
|
||||
values.
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
:arg escape_rcdata: Whether to escape characters that need to be
|
||||
escaped within normal elements within rcdata elements such as
|
||||
style.
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
:arg resolve_entities: Whether to resolve named character entities that
|
||||
appear in the source tree. The XML predefined entities < >
|
||||
& " ' are unaffected by this setting.
|
||||
|
||||
Defaults to ``True``.
|
||||
|
||||
:arg strip_whitespace: Whether to remove semantically meaningless
|
||||
whitespace. (This compresses all whitespace to a single space
|
||||
except within ``pre``.)
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
:arg minimize_boolean_attributes: Shortens boolean attributes to give
|
||||
just the attribute value, for example::
|
||||
|
||||
<input disabled="disabled">
|
||||
|
||||
becomes::
|
||||
|
||||
<input disabled>
|
||||
|
||||
Defaults to ``True``.
|
||||
|
||||
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
|
||||
start tag of void elements (empty elements whose end tag is
|
||||
forbidden). E.g. ``<hr/>``.
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
:arg space_before_trailing_solidus: Places a space immediately before
|
||||
the closing slash in a tag using a trailing solidus. E.g.
|
||||
``<hr />``. Requires ``use_trailing_solidus=True``.
|
||||
|
||||
Defaults to ``True``.
|
||||
|
||||
:arg sanitize: Strip all unsafe or unknown constructs from output.
|
||||
See :py:class:`html5lib.filters.sanitizer.Filter`.
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
:arg omit_optional_tags: Omit start/end tags that are optional.
|
||||
|
||||
Defaults to ``True``.
|
||||
|
||||
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
|
||||
|
||||
Defaults to ``False``.
|
||||
|
||||
"""
|
||||
unexpected_args = frozenset(kwargs) - frozenset(self.options)
|
||||
if len(unexpected_args) > 0:
|
||||
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
|
||||
if 'quote_char' in kwargs:
|
||||
self.use_best_quote_char = False
|
||||
for attr in self.options:
|
||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||
self.errors = []
|
||||
self.strict = False
|
||||
|
||||
def encode(self, string):
|
||||
assert(isinstance(string, text_type))
|
||||
if self.encoding:
|
||||
return string.encode(self.encoding, "htmlentityreplace")
|
||||
else:
|
||||
return string
|
||||
|
||||
def encodeStrict(self, string):
|
||||
assert(isinstance(string, text_type))
|
||||
if self.encoding:
|
||||
return string.encode(self.encoding, "strict")
|
||||
else:
|
||||
return string
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
# pylint:disable=too-many-nested-blocks
|
||||
self.encoding = encoding
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
|
||||
if encoding and self.inject_meta_charset:
|
||||
from .filters.inject_meta_charset import Filter
|
||||
treewalker = Filter(treewalker, encoding)
|
||||
# Alphabetical attributes is here under the assumption that none of
|
||||
# the later filters add or change order of attributes; it needs to be
|
||||
# before the sanitizer so escaped elements come out correctly
|
||||
if self.alphabetical_attributes:
|
||||
from .filters.alphabeticalattributes import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
# WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiently of this latter filter
|
||||
if self.strip_whitespace:
|
||||
from .filters.whitespace import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.sanitize:
|
||||
from .filters.sanitizer import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
from .filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
|
||||
for token in treewalker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
doctype = "<!DOCTYPE %s" % token["name"]
|
||||
|
||||
if token["publicId"]:
|
||||
doctype += ' PUBLIC "%s"' % token["publicId"]
|
||||
elif token["systemId"]:
|
||||
doctype += " SYSTEM"
|
||||
if token["systemId"]:
|
||||
if token["systemId"].find('"') >= 0:
|
||||
if token["systemId"].find("'") >= 0:
|
||||
self.serializeError("System identifier contains both single and double quote characters")
|
||||
quote_char = "'"
|
||||
else:
|
||||
quote_char = '"'
|
||||
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
|
||||
|
||||
doctype += ">"
|
||||
yield self.encodeStrict(doctype)
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
if type == "SpaceCharacters" or in_cdata:
|
||||
if in_cdata and token["data"].find("</") >= 0:
|
||||
self.serializeError("Unexpected </ in CDATA")
|
||||
yield self.encode(token["data"])
|
||||
else:
|
||||
yield self.encode(escape(token["data"]))
|
||||
|
||||
elif type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
yield self.encodeStrict("<%s" % name)
|
||||
if name in rcdataElements and not self.escape_rcdata:
|
||||
in_cdata = True
|
||||
elif in_cdata:
|
||||
self.serializeError("Unexpected child element of a CDATA element")
|
||||
for (_, attr_name), attr_value in token["data"].items():
|
||||
# TODO: Add namespace support here
|
||||
k = attr_name
|
||||
v = attr_value
|
||||
yield self.encodeStrict(' ')
|
||||
|
||||
yield self.encodeStrict(k)
|
||||
if not self.minimize_boolean_attributes or \
|
||||
(k not in booleanAttributes.get(name, tuple()) and
|
||||
k not in booleanAttributes.get("", tuple())):
|
||||
yield self.encodeStrict("=")
|
||||
if self.quote_attr_values == "always" or len(v) == 0:
|
||||
quote_attr = True
|
||||
elif self.quote_attr_values == "spec":
|
||||
quote_attr = _quoteAttributeSpec.search(v) is not None
|
||||
elif self.quote_attr_values == "legacy":
|
||||
quote_attr = _quoteAttributeLegacy.search(v) is not None
|
||||
else:
|
||||
raise ValueError("quote_attr_values must be one of: "
|
||||
"'always', 'spec', or 'legacy'")
|
||||
v = v.replace("&", "&")
|
||||
if self.escape_lt_in_attrs:
|
||||
v = v.replace("<", "<")
|
||||
if quote_attr:
|
||||
quote_char = self.quote_char
|
||||
if self.use_best_quote_char:
|
||||
if "'" in v and '"' not in v:
|
||||
quote_char = '"'
|
||||
elif '"' in v and "'" not in v:
|
||||
quote_char = "'"
|
||||
if quote_char == "'":
|
||||
v = v.replace("'", "'")
|
||||
else:
|
||||
v = v.replace('"', """)
|
||||
yield self.encodeStrict(quote_char)
|
||||
yield self.encode(v)
|
||||
yield self.encodeStrict(quote_char)
|
||||
else:
|
||||
yield self.encode(v)
|
||||
if name in voidElements and self.use_trailing_solidus:
|
||||
if self.space_before_trailing_solidus:
|
||||
yield self.encodeStrict(" /")
|
||||
else:
|
||||
yield self.encodeStrict("/")
|
||||
yield self.encode(">")
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if name in rcdataElements:
|
||||
in_cdata = False
|
||||
elif in_cdata:
|
||||
self.serializeError("Unexpected child element of a CDATA element")
|
||||
yield self.encodeStrict("</%s>" % name)
|
||||
|
||||
elif type == "Comment":
|
||||
data = token["data"]
|
||||
if data.find("--") >= 0:
|
||||
self.serializeError("Comment contains --")
|
||||
yield self.encodeStrict("<!--%s-->" % token["data"])
|
||||
|
||||
elif type == "Entity":
|
||||
name = token["name"]
|
||||
key = name + ";"
|
||||
if key not in entities:
|
||||
self.serializeError("Entity %s not recognized" % name)
|
||||
if self.resolve_entities and key not in xmlEntities:
|
||||
data = entities[key]
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
yield self.encodeStrict(data)
|
||||
|
||||
else:
|
||||
self.serializeError(token["data"])
|
||||
|
||||
def render(self, treewalker, encoding=None):
|
||||
"""Serializes the stream from the treewalker into a string
|
||||
|
||||
:arg treewalker: the treewalker to serialize
|
||||
|
||||
:arg encoding: the string encoding to use
|
||||
|
||||
:returns: the serialized tree
|
||||
|
||||
Example:
|
||||
|
||||
>>> from html5lib import parse, getTreeWalker
|
||||
>>> from html5lib.serializer import HTMLSerializer
|
||||
>>> token_stream = parse('<html><body>Hi!</body></html>')
|
||||
>>> walker = getTreeWalker('etree')
|
||||
>>> serializer = HTMLSerializer(omit_optional_tags=False)
|
||||
>>> serializer.render(walker(token_stream))
|
||||
'<html><head></head><body>Hi!</body></html>'
|
||||
|
||||
"""
|
||||
if encoding:
|
||||
return b"".join(list(self.serialize(treewalker, encoding)))
|
||||
else:
|
||||
return "".join(list(self.serialize(treewalker)))
|
||||
|
||||
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||
# XXX The idea is to make data mandatory.
|
||||
self.errors.append(data)
|
||||
if self.strict:
|
||||
raise SerializeError
|
||||
|
||||
|
||||
class SerializeError(Exception):
|
||||
"""Error in serialized tree"""
|
||||
pass
|
30
lib/bleach/_vendor/html5lib/treeadapters/__init__.py
Normal file
30
lib/bleach/_vendor/html5lib/treeadapters/__init__.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
"""Tree adapters let you convert from one tree structure to another
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import html5lib
|
||||
from html5lib.treeadapters import genshi
|
||||
|
||||
doc = '<html><body>Hi!</body></html>'
|
||||
treebuilder = html5lib.getTreeBuilder('etree')
|
||||
parser = html5lib.HTMLParser(tree=treebuilder)
|
||||
tree = parser.parse(doc)
|
||||
TreeWalker = html5lib.getTreeWalker('etree')
|
||||
|
||||
genshi_tree = genshi.to_genshi(TreeWalker(tree))
|
||||
|
||||
"""
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import sax
|
||||
|
||||
__all__ = ["sax"]
|
||||
|
||||
try:
|
||||
from . import genshi # noqa
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
__all__.append("genshi")
|
54
lib/bleach/_vendor/html5lib/treeadapters/genshi.py
Normal file
54
lib/bleach/_vendor/html5lib/treeadapters/genshi.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from genshi.core import QName, Attrs
|
||||
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
|
||||
|
||||
|
||||
def to_genshi(walker):
|
||||
"""Convert a tree to a genshi tree
|
||||
|
||||
:arg walker: the treewalker to use to walk the tree to convert it
|
||||
|
||||
:returns: generator of genshi nodes
|
||||
|
||||
"""
|
||||
text = []
|
||||
for token in walker:
|
||||
type = token["type"]
|
||||
if type in ("Characters", "SpaceCharacters"):
|
||||
text.append(token["data"])
|
||||
elif text:
|
||||
yield TEXT, "".join(text), (None, -1, -1)
|
||||
text = []
|
||||
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
if token["namespace"]:
|
||||
name = "{%s}%s" % (token["namespace"], token["name"])
|
||||
else:
|
||||
name = token["name"]
|
||||
attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
|
||||
for attr, value in token["data"].items()])
|
||||
yield (START, (QName(name), attrs), (None, -1, -1))
|
||||
if type == "EmptyTag":
|
||||
type = "EndTag"
|
||||
|
||||
if type == "EndTag":
|
||||
if token["namespace"]:
|
||||
name = "{%s}%s" % (token["namespace"], token["name"])
|
||||
else:
|
||||
name = token["name"]
|
||||
|
||||
yield END, QName(name), (None, -1, -1)
|
||||
|
||||
elif type == "Comment":
|
||||
yield COMMENT, token["data"], (None, -1, -1)
|
||||
|
||||
elif type == "Doctype":
|
||||
yield DOCTYPE, (token["name"], token["publicId"],
|
||||
token["systemId"]), (None, -1, -1)
|
||||
|
||||
else:
|
||||
pass # FIXME: What to do?
|
||||
|
||||
if text:
|
||||
yield TEXT, "".join(text), (None, -1, -1)
|
50
lib/bleach/_vendor/html5lib/treeadapters/sax.py
Normal file
50
lib/bleach/_vendor/html5lib/treeadapters/sax.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from xml.sax.xmlreader import AttributesNSImpl
|
||||
|
||||
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
|
||||
|
||||
prefix_mapping = {}
|
||||
for prefix, localName, namespace in adjustForeignAttributes.values():
|
||||
if prefix is not None:
|
||||
prefix_mapping[prefix] = namespace
|
||||
|
||||
|
||||
def to_sax(walker, handler):
|
||||
"""Call SAX-like content handler based on treewalker walker
|
||||
|
||||
:arg walker: the treewalker to use to walk the tree to convert it
|
||||
|
||||
:arg handler: SAX handler to use
|
||||
|
||||
"""
|
||||
handler.startDocument()
|
||||
for prefix, namespace in prefix_mapping.items():
|
||||
handler.startPrefixMapping(prefix, namespace)
|
||||
|
||||
for token in walker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
continue
|
||||
elif type in ("StartTag", "EmptyTag"):
|
||||
attrs = AttributesNSImpl(token["data"],
|
||||
unadjustForeignAttributes)
|
||||
handler.startElementNS((token["namespace"], token["name"]),
|
||||
token["name"],
|
||||
attrs)
|
||||
if type == "EmptyTag":
|
||||
handler.endElementNS((token["namespace"], token["name"]),
|
||||
token["name"])
|
||||
elif type == "EndTag":
|
||||
handler.endElementNS((token["namespace"], token["name"]),
|
||||
token["name"])
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
handler.characters(token["data"])
|
||||
elif type == "Comment":
|
||||
pass
|
||||
else:
|
||||
assert False, "Unknown token type"
|
||||
|
||||
for prefix, namespace in prefix_mapping.items():
|
||||
handler.endPrefixMapping(prefix)
|
||||
handler.endDocument()
|
88
lib/bleach/_vendor/html5lib/treebuilders/__init__.py
Normal file
88
lib/bleach/_vendor/html5lib/treebuilders/__init__.py
Normal file
|
@ -0,0 +1,88 @@
|
|||
"""A collection of modules for building different kinds of trees from HTML
|
||||
documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to do
|
||||
implement several things:
|
||||
|
||||
1. A set of classes for various types of elements: Document, Doctype, Comment,
|
||||
Element. These must implement the interface of ``base.treebuilders.Node``
|
||||
(although comment nodes have a different signature for their constructor,
|
||||
see ``treebuilders.etree.Comment``) Textual content may also be implemented
|
||||
as another node type, or not, as your tree implementation requires.
|
||||
|
||||
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
|
||||
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
|
||||
|
||||
* ``documentClass`` - the class to use for the bottommost node of a document
|
||||
* ``elementClass`` - the class to use for HTML Elements
|
||||
* ``commentClass`` - the class to use for comments
|
||||
* ``doctypeClass`` - the class to use for doctypes
|
||||
|
||||
It also has one required method:
|
||||
|
||||
* ``getDocument`` - Returns the root node of the complete document tree
|
||||
|
||||
3. If you wish to run the unit tests, you must also create a ``testSerializer``
|
||||
method on your treebuilder which accepts a node and returns a string
|
||||
containing Node and its children serialized according to the format used in
|
||||
the unittests
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .._utils import default_etree
|
||||
|
||||
treeBuilderCache = {}
|
||||
|
||||
|
||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeBuilder class for various types of trees with built-in support
|
||||
|
||||
:arg treeType: the name of the tree type required (case-insensitive). Supported
|
||||
values are:
|
||||
|
||||
* "dom" - A generic builder for DOM implementations, defaulting to a
|
||||
xml.dom.minidom based implementation.
|
||||
* "etree" - A generic builder for tree implementations exposing an
|
||||
ElementTree-like interface, defaulting to xml.etree.cElementTree if
|
||||
available and xml.etree.ElementTree if not.
|
||||
* "lxml" - A etree-based builder for lxml.etree, handling limitations
|
||||
of lxml's implementation.
|
||||
|
||||
:arg implementation: (Currently applies to the "etree" and "dom" tree
|
||||
types). A module implementing the tree type e.g. xml.etree.ElementTree
|
||||
or xml.etree.cElementTree.
|
||||
|
||||
:arg kwargs: Any additional options to pass to the TreeBuilder when
|
||||
creating it.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from html5lib.treebuilders import getTreeBuilder
|
||||
>>> builder = getTreeBuilder('etree')
|
||||
|
||||
"""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeBuilderCache:
|
||||
if treeType == "dom":
|
||||
from . import dom
|
||||
# Come up with a sane default (pref. from the stdlib)
|
||||
if implementation is None:
|
||||
from xml.dom import minidom
|
||||
implementation = minidom
|
||||
# NEVER cache here, caching is done in the dom submodule
|
||||
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
||||
elif treeType == "lxml":
|
||||
from . import etree_lxml
|
||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
from . import etree
|
||||
if implementation is None:
|
||||
implementation = default_etree
|
||||
# NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
else:
|
||||
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
|
||||
return treeBuilderCache.get(treeType)
|
417
lib/bleach/_vendor/html5lib/treebuilders/base.py
Normal file
417
lib/bleach/_vendor/html5lib/treebuilders/base.py
Normal file
|
@ -0,0 +1,417 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
from ..constants import scopingElements, tableInsertModeElements, namespaces
|
||||
|
||||
# The scope markers are inserted when entering object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, object elements, and marquees.
|
||||
Marker = None
|
||||
|
||||
listElementsMap = {
|
||||
None: (frozenset(scopingElements), False),
|
||||
"button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
|
||||
"list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
|
||||
(namespaces["html"], "ul")}), False),
|
||||
"table": (frozenset([(namespaces["html"], "html"),
|
||||
(namespaces["html"], "table")]), False),
|
||||
"select": (frozenset([(namespaces["html"], "optgroup"),
|
||||
(namespaces["html"], "option")]), True)
|
||||
}
|
||||
|
||||
|
||||
class Node(object):
|
||||
"""Represents an item in the tree"""
|
||||
def __init__(self, name):
|
||||
"""Creates a Node
|
||||
|
||||
:arg name: The tag name associated with the node
|
||||
|
||||
"""
|
||||
# The tag name associated with the node
|
||||
self.name = name
|
||||
# The parent of the current node (or None for the document node)
|
||||
self.parent = None
|
||||
# The value of the current node (applies to text nodes and comments)
|
||||
self.value = None
|
||||
# A dict holding name -> value pairs for attributes of the node
|
||||
self.attributes = {}
|
||||
# A list of child nodes of the current node. This must include all
|
||||
# elements but not necessarily other node types.
|
||||
self.childNodes = []
|
||||
# A list of miscellaneous flags that can be set on the node.
|
||||
self._flags = []
|
||||
|
||||
def __str__(self):
|
||||
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
|
||||
for name, value in
|
||||
self.attributes.items()])
|
||||
if attributesStr:
|
||||
return "<%s %s>" % (self.name, attributesStr)
|
||||
else:
|
||||
return "<%s>" % (self.name)
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s>" % (self.name)
|
||||
|
||||
def appendChild(self, node):
|
||||
"""Insert node as a child of the current node
|
||||
|
||||
:arg node: the node to insert
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
"""Insert data as text in the current node, positioned before the
|
||||
start of node insertBefore or to the end of the node's text.
|
||||
|
||||
:arg data: the data to insert
|
||||
|
||||
:arg insertBefore: True if you want to insert the text before the node
|
||||
and False if you want to insert it after the node
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
"""Insert node as a child of the current node, before refNode in the
|
||||
list of child nodes. Raises ValueError if refNode is not a child of
|
||||
the current node
|
||||
|
||||
:arg node: the node to insert
|
||||
|
||||
:arg refNode: the child node to insert the node before
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def removeChild(self, node):
|
||||
"""Remove node from the children of the current node
|
||||
|
||||
:arg node: the child node to remove
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
"""Move all the children of the current node to newParent.
|
||||
This is needed so that trees that don't store text as nodes move the
|
||||
text in the correct way
|
||||
|
||||
:arg newParent: the node to move all this node's children to
|
||||
|
||||
"""
|
||||
# XXX - should this method be made more general?
|
||||
for child in self.childNodes:
|
||||
newParent.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def cloneNode(self):
|
||||
"""Return a shallow copy of the current node i.e. a node with the same
|
||||
name and attributes but with no parent or child nodes
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text, false otherwise
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class ActiveFormattingElements(list):
|
||||
def append(self, node):
|
||||
equalCount = 0
|
||||
if node != Marker:
|
||||
for element in self[::-1]:
|
||||
if element == Marker:
|
||||
break
|
||||
if self.nodesEqual(element, node):
|
||||
equalCount += 1
|
||||
if equalCount == 3:
|
||||
self.remove(element)
|
||||
break
|
||||
list.append(self, node)
|
||||
|
||||
def nodesEqual(self, node1, node2):
|
||||
if not node1.nameTuple == node2.nameTuple:
|
||||
return False
|
||||
|
||||
if not node1.attributes == node2.attributes:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Base treebuilder implementation
|
||||
|
||||
* documentClass - the class to use for the bottommost node of a document
|
||||
* elementClass - the class to use for HTML Elements
|
||||
* commentClass - the class to use for comments
|
||||
* doctypeClass - the class to use for doctypes
|
||||
|
||||
"""
|
||||
# pylint:disable=not-callable
|
||||
|
||||
# Document class
|
||||
documentClass = None
|
||||
|
||||
# The class to use for creating a node
|
||||
elementClass = None
|
||||
|
||||
# The class to use for creating comments
|
||||
commentClass = None
|
||||
|
||||
# The class to use for creating doctypes
|
||||
doctypeClass = None
|
||||
|
||||
# Fragment class
|
||||
fragmentClass = None
|
||||
|
||||
def __init__(self, namespaceHTMLElements):
|
||||
"""Create a TreeBuilder
|
||||
|
||||
:arg namespaceHTMLElements: whether or not to namespace HTML elements
|
||||
|
||||
"""
|
||||
if namespaceHTMLElements:
|
||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||
else:
|
||||
self.defaultNamespace = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.openElements = []
|
||||
self.activeFormattingElements = ActiveFormattingElements()
|
||||
|
||||
# XXX - rename these to headElement, formElement
|
||||
self.headPointer = None
|
||||
self.formPointer = None
|
||||
|
||||
self.insertFromTable = False
|
||||
|
||||
self.document = self.documentClass()
|
||||
|
||||
def elementInScope(self, target, variant=None):
|
||||
|
||||
# If we pass a node in we match that. if we pass a string
|
||||
# match any node with that name
|
||||
exactNode = hasattr(target, "nameTuple")
|
||||
if not exactNode:
|
||||
if isinstance(target, text_type):
|
||||
target = (namespaces["html"], target)
|
||||
assert isinstance(target, tuple)
|
||||
|
||||
listElements, invert = listElementsMap[variant]
|
||||
|
||||
for node in reversed(self.openElements):
|
||||
if exactNode and node == target:
|
||||
return True
|
||||
elif not exactNode and node.nameTuple == target:
|
||||
return True
|
||||
elif (invert ^ (node.nameTuple in listElements)):
|
||||
return False
|
||||
|
||||
assert False # We should never reach this point
|
||||
|
||||
def reconstructActiveFormattingElements(self):
|
||||
# Within this algorithm the order of steps described in the
|
||||
# specification is not quite the same as the order of steps in the
|
||||
# code. It should still do the same though.
|
||||
|
||||
# Step 1: stop the algorithm when there's nothing to do.
|
||||
if not self.activeFormattingElements:
|
||||
return
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = len(self.activeFormattingElements) - 1
|
||||
entry = self.activeFormattingElements[i]
|
||||
if entry == Marker or entry in self.openElements:
|
||||
return
|
||||
|
||||
# Step 6
|
||||
while entry != Marker and entry not in self.openElements:
|
||||
if i == 0:
|
||||
# This will be reset to 0 below
|
||||
i = -1
|
||||
break
|
||||
i -= 1
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
entry = self.activeFormattingElements[i]
|
||||
|
||||
while True:
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
entry = self.activeFormattingElements[i]
|
||||
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
|
||||
|
||||
# Step 9
|
||||
element = self.insertElement({"type": "StartTag",
|
||||
"name": clone.name,
|
||||
"namespace": clone.namespace,
|
||||
"data": clone.attributes})
|
||||
|
||||
# Step 10
|
||||
self.activeFormattingElements[i] = element
|
||||
|
||||
# Step 11
|
||||
if element == self.activeFormattingElements[-1]:
|
||||
break
|
||||
|
||||
def clearActiveFormattingElements(self):
|
||||
entry = self.activeFormattingElements.pop()
|
||||
while self.activeFormattingElements and entry != Marker:
|
||||
entry = self.activeFormattingElements.pop()
|
||||
|
||||
def elementInActiveFormattingElements(self, name):
|
||||
"""Check if an element exists between the end of the active
|
||||
formatting elements and the last marker. If it does, return it, else
|
||||
return false"""
|
||||
|
||||
for item in self.activeFormattingElements[::-1]:
|
||||
# Check for Marker first because if it's a Marker it doesn't have a
|
||||
# name attribute.
|
||||
if item == Marker:
|
||||
break
|
||||
elif item.name == name:
|
||||
return item
|
||||
return False
|
||||
|
||||
def insertRoot(self, token):
|
||||
element = self.createElement(token)
|
||||
self.openElements.append(element)
|
||||
self.document.appendChild(element)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.document.appendChild(doctype)
|
||||
|
||||
def insertComment(self, token, parent=None):
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
parent.appendChild(self.commentClass(token["data"]))
|
||||
|
||||
def createElement(self, token):
|
||||
"""Create an element but don't insert it anywhere"""
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
return element
|
||||
|
||||
def _getInsertFromTable(self):
|
||||
return self._insertFromTable
|
||||
|
||||
def _setInsertFromTable(self, value):
|
||||
"""Switch the function used to insert an element from the
|
||||
normal one to the misnested table one and back again"""
|
||||
self._insertFromTable = value
|
||||
if value:
|
||||
self.insertElement = self.insertElementTable
|
||||
else:
|
||||
self.insertElement = self.insertElementNormal
|
||||
|
||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||
|
||||
def insertElementNormal(self, token):
|
||||
name = token["name"]
|
||||
assert isinstance(name, text_type), "Element %s not unicode" % name
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
self.openElements[-1].appendChild(element)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertElementTable(self, token):
|
||||
"""Create an element and insert it into the tree"""
|
||||
element = self.createElement(token)
|
||||
if self.openElements[-1].name not in tableInsertModeElements:
|
||||
return self.insertElementNormal(token)
|
||||
else:
|
||||
# We should be in the InTable mode. This means we want to do
|
||||
# special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
if insertBefore is None:
|
||||
parent.appendChild(element)
|
||||
else:
|
||||
parent.insertBefore(element, insertBefore)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
"""Insert text data."""
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
|
||||
if (not self.insertFromTable or (self.insertFromTable and
|
||||
self.openElements[-1].name
|
||||
not in tableInsertModeElements)):
|
||||
parent.insertText(data)
|
||||
else:
|
||||
# We should be in the InTable mode. This means we want to do
|
||||
# special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
parent.insertText(data, insertBefore)
|
||||
|
||||
def getTableMisnestedNodePosition(self):
|
||||
"""Get the foster parent element, and sibling to insert before
|
||||
(or None) when inserting a misnested table node"""
|
||||
# The foster parent element is the one which comes before the most
|
||||
# recently opened table element
|
||||
# XXX - this is really inelegant
|
||||
lastTable = None
|
||||
fosterParent = None
|
||||
insertBefore = None
|
||||
for elm in self.openElements[::-1]:
|
||||
if elm.name == "table":
|
||||
lastTable = elm
|
||||
break
|
||||
if lastTable:
|
||||
# XXX - we should really check that this parent is actually a
|
||||
# node here
|
||||
if lastTable.parent:
|
||||
fosterParent = lastTable.parent
|
||||
insertBefore = lastTable
|
||||
else:
|
||||
fosterParent = self.openElements[
|
||||
self.openElements.index(lastTable) - 1]
|
||||
else:
|
||||
fosterParent = self.openElements[0]
|
||||
return fosterParent, insertBefore
|
||||
|
||||
def generateImpliedEndTags(self, exclude=None):
|
||||
name = self.openElements[-1].name
|
||||
# XXX td, th and tr are not actually needed
|
||||
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
|
||||
name != exclude):
|
||||
self.openElements.pop()
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
||||
self.generateImpliedEndTags(exclude)
|
||||
|
||||
def getDocument(self):
|
||||
"""Return the final tree"""
|
||||
return self.document
|
||||
|
||||
def getFragment(self):
|
||||
"""Return the final fragment"""
|
||||
# assert self.innerHTML
|
||||
fragment = self.fragmentClass()
|
||||
self.openElements[0].reparentChildren(fragment)
|
||||
return fragment
|
||||
|
||||
def testSerializer(self, node):
|
||||
"""Serialize the subtree of node in the format required by unit tests
|
||||
|
||||
:arg node: the node from which to start serializing
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
239
lib/bleach/_vendor/html5lib/treebuilders/dom.py
Normal file
239
lib/bleach/_vendor/html5lib/treebuilders/dom.py
Normal file
|
@ -0,0 +1,239 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
|
||||
try:
|
||||
from collections.abc import MutableMapping
|
||||
except ImportError: # Python 2.7
|
||||
from collections import MutableMapping
|
||||
from xml.dom import minidom, Node
|
||||
import weakref
|
||||
|
||||
from . import base
|
||||
from .. import constants
|
||||
from ..constants import namespaces
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
|
||||
def getDomBuilder(DomImplementation):
|
||||
Dom = DomImplementation
|
||||
|
||||
class AttrList(MutableMapping):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.element.attributes.keys())
|
||||
|
||||
def __setitem__(self, name, value):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
attr = self.element.ownerDocument.createAttribute(name)
|
||||
attr.value = value
|
||||
self.element.attributes[name] = attr
|
||||
|
||||
def __len__(self):
|
||||
return len(self.element.attributes)
|
||||
|
||||
def items(self):
|
||||
return list(self.element.attributes.items())
|
||||
|
||||
def values(self):
|
||||
return list(self.element.attributes.values())
|
||||
|
||||
def __getitem__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.attributes[name].value
|
||||
|
||||
def __delitem__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
del self.element.attributes[name]
|
||||
|
||||
class NodeBuilder(base.Node):
|
||||
def __init__(self, element):
|
||||
base.Node.__init__(self, element.nodeName)
|
||||
self.element = element
|
||||
|
||||
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
|
||||
self.element.namespaceURI or None)
|
||||
|
||||
def appendChild(self, node):
|
||||
node.parent = self
|
||||
self.element.appendChild(node.element)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = self.element.ownerDocument.createTextNode(data)
|
||||
if insertBefore:
|
||||
self.element.insertBefore(text, insertBefore.element)
|
||||
else:
|
||||
self.element.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
self.element.insertBefore(node.element, refNode.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
if node.element.parentNode == self.element:
|
||||
self.element.removeChild(node.element)
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.hasChildNodes():
|
||||
child = self.element.firstChild
|
||||
self.element.removeChild(child)
|
||||
newParent.element.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
if name[0] is not None:
|
||||
qualifiedName = (name[0] + ":" + name[1])
|
||||
else:
|
||||
qualifiedName = name[1]
|
||||
self.element.setAttributeNS(name[2], qualifiedName,
|
||||
value)
|
||||
else:
|
||||
self.element.setAttribute(
|
||||
name, value)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def cloneNode(self):
|
||||
return NodeBuilder(self.element.cloneNode(False))
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.hasChildNodes()
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace is None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
|
||||
def documentClass(self):
|
||||
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
|
||||
return weakref.proxy(self)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
domimpl = Dom.getDOMImplementation()
|
||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||
self.document.appendChild(NodeBuilder(doctype))
|
||||
if Dom == minidom:
|
||||
doctype.ownerDocument = self.dom
|
||||
|
||||
def elementClass(self, name, namespace=None):
|
||||
if namespace is None and self.defaultNamespace is None:
|
||||
node = self.dom.createElement(name)
|
||||
else:
|
||||
node = self.dom.createElementNS(namespace, name)
|
||||
|
||||
return NodeBuilder(node)
|
||||
|
||||
def commentClass(self, data):
|
||||
return NodeBuilder(self.dom.createComment(data))
|
||||
|
||||
def fragmentClass(self):
|
||||
return NodeBuilder(self.dom.createDocumentFragment())
|
||||
|
||||
def appendChild(self, node):
|
||||
self.dom.appendChild(node.element)
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.dom
|
||||
|
||||
def getFragment(self):
|
||||
return base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data = data
|
||||
if parent != self:
|
||||
base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
# HACK: allow text nodes as children of the document node
|
||||
if hasattr(self.dom, '_child_node_types'):
|
||||
# pylint:disable=protected-access
|
||||
if Node.TEXT_NODE not in self.dom._child_node_types:
|
||||
self.dom._child_node_types = list(self.dom._child_node_types)
|
||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||
self.dom.appendChild(self.dom.createTextNode(data))
|
||||
|
||||
implementation = DomImplementation
|
||||
name = None
|
||||
|
||||
def testSerializer(element):
|
||||
element.normalize()
|
||||
rv = []
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
if element.name:
|
||||
if element.publicId or element.systemId:
|
||||
publicId = element.publicId or ""
|
||||
systemId = element.systemId or ""
|
||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||
(' ' * indent, element.name, publicId, systemId))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
rv.append("#document-fragment")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
|
||||
else:
|
||||
if (hasattr(element, "namespaceURI") and
|
||||
element.namespaceURI is not None):
|
||||
name = "%s %s" % (constants.prefixes[element.namespaceURI],
|
||||
element.nodeName)
|
||||
else:
|
||||
name = element.nodeName
|
||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||
if element.hasAttributes():
|
||||
attributes = []
|
||||
for i in range(len(element.attributes)):
|
||||
attr = element.attributes.item(i)
|
||||
name = attr.nodeName
|
||||
value = attr.value
|
||||
ns = attr.namespaceURI
|
||||
if ns:
|
||||
name = "%s %s" % (constants.prefixes[ns], attr.localName)
|
||||
else:
|
||||
name = attr.nodeName
|
||||
attributes.append((name, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||
indent += 2
|
||||
for child in element.childNodes:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
return locals()
|
||||
|
||||
|
||||
# The actual means to get a module!
|
||||
getDomModule = moduleFactoryFactory(getDomBuilder)
|
343
lib/bleach/_vendor/html5lib/treebuilders/etree.py
Normal file
343
lib/bleach/_vendor/html5lib/treebuilders/etree.py
Normal file
|
@ -0,0 +1,343 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
# pylint:disable=protected-access
|
||||
|
||||
from six import text_type
|
||||
|
||||
import re
|
||||
|
||||
from copy import copy
|
||||
|
||||
from . import base
|
||||
from .. import _ihatexml
|
||||
from .. import constants
|
||||
from ..constants import namespaces
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
ElementTree = ElementTreeImplementation
|
||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||
|
||||
class Element(base.Node):
|
||||
def __init__(self, name, namespace=None):
|
||||
self._name = name
|
||||
self._namespace = namespace
|
||||
self._element = ElementTree.Element(self._getETreeTag(name,
|
||||
namespace))
|
||||
if namespace is None:
|
||||
self.nameTuple = namespaces["html"], self._name
|
||||
else:
|
||||
self.nameTuple = self._namespace, self._name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getETreeTag(self, name, namespace):
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s" % (namespace, name)
|
||||
return etree_tag
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = name
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return self._name
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _setNamespace(self, namespace):
|
||||
self._namespace = namespace
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getNamespace(self):
|
||||
return self._namespace
|
||||
|
||||
namespace = property(_getNamespace, _setNamespace)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
el_attrib = self._element.attrib
|
||||
el_attrib.clear()
|
||||
if attributes:
|
||||
# calling .items _always_ allocates, and the above truthy check is cheaper than the
|
||||
# allocation on average
|
||||
for key, value in attributes.items():
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s" % (key[2], key[1])
|
||||
else:
|
||||
name = key
|
||||
el_attrib[name] = value
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or len(self._element))
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = list(self._element).index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._childNodes.remove(node)
|
||||
self._element.remove(node._element)
|
||||
node.parent = None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
# Insert the text as the tail of the last child element
|
||||
if not self._element[-1].tail:
|
||||
self._element[-1].tail = ""
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
# Insert the text before the specified node
|
||||
children = list(self._element)
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
if not self._element[index - 1].tail:
|
||||
self._element[index - 1].tail = ""
|
||||
self._element[index - 1].tail += data
|
||||
else:
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = type(self)(self.name, self.namespace)
|
||||
if self._element.attrib:
|
||||
element._element.attrib = copy(self._element.attrib)
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
if not newParent._element.text:
|
||||
newParent._element.text = ""
|
||||
if self._element.text is not None:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
# Use the superclass constructor to set all properties on the
|
||||
# wrapper element
|
||||
self._element = ElementTree.Comment(data)
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
Element.__init__(self, "<!DOCTYPE>")
|
||||
self._element.text = name
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
def _getPublicId(self):
|
||||
return self._element.get("publicId", "")
|
||||
|
||||
def _setPublicId(self, value):
|
||||
if value is not None:
|
||||
self._element.set("publicId", value)
|
||||
|
||||
publicId = property(_getPublicId, _setPublicId)
|
||||
|
||||
def _getSystemId(self):
|
||||
return self._element.get("systemId", "")
|
||||
|
||||
def _setSystemId(self, value):
|
||||
if value is not None:
|
||||
self._element.set("systemId", value)
|
||||
|
||||
systemId = property(_getSystemId, _setSystemId)
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "DOCUMENT_ROOT")
|
||||
|
||||
class DocumentFragment(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "DOCUMENT_FRAGMENT")
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if not(hasattr(element, "tag")):
|
||||
element = element.getroot()
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
|
||||
(element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
||||
elif element.tag == "DOCUMENT_ROOT":
|
||||
rv.append("#document")
|
||||
if element.text is not None:
|
||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||
if element.tail is not None:
|
||||
raise TypeError("Document node cannot have tail")
|
||||
if hasattr(element, "attrib") and len(element.attrib):
|
||||
raise TypeError("Document node cannot have attributes")
|
||||
elif element.tag == ElementTreeCommentType:
|
||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
||||
else:
|
||||
assert isinstance(element.tag, text_type), \
|
||||
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
|
||||
nsmatch = tag_regexp.match(element.tag)
|
||||
|
||||
if nsmatch is None:
|
||||
name = element.tag
|
||||
else:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
name = "%s %s" % (prefix, name)
|
||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
attributes = []
|
||||
for name, value in element.attrib.items():
|
||||
nsmatch = tag_regexp.match(name)
|
||||
if nsmatch is not None:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
attr_string = "%s %s" % (prefix, name)
|
||||
else:
|
||||
attr_string = name
|
||||
attributes.append((attr_string, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||
indent += 2
|
||||
for child in element:
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element): # pylint:disable=unused-variable
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
filter = _ihatexml.InfosetFilter()
|
||||
|
||||
def serializeElement(element):
|
||||
if isinstance(element, ElementTree.ElementTree):
|
||||
element = element.getroot()
|
||||
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
|
||||
(element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
||||
elif element.tag == "DOCUMENT_ROOT":
|
||||
if element.text is not None:
|
||||
rv.append(element.text)
|
||||
if element.tail is not None:
|
||||
raise TypeError("Document node cannot have tail")
|
||||
if hasattr(element, "attrib") and len(element.attrib):
|
||||
raise TypeError("Document node cannot have attributes")
|
||||
|
||||
for child in element:
|
||||
serializeElement(child)
|
||||
|
||||
elif element.tag == ElementTreeCommentType:
|
||||
rv.append("<!--%s-->" % (element.text,))
|
||||
else:
|
||||
# This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>" % (filter.fromXmlName(element.tag),))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\"" % (
|
||||
filter.fromXmlName(name), value)
|
||||
for name, value in element.attrib.items()])
|
||||
rv.append("<%s %s>" % (element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element:
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>" % (element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
fragmentClass = DocumentFragment
|
||||
implementation = ElementTreeImplementation
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._element
|
||||
else:
|
||||
if self.defaultNamespace is not None:
|
||||
return self.document._element.find(
|
||||
"{%s}html" % self.defaultNamespace)
|
||||
else:
|
||||
return self.document._element.find("html")
|
||||
|
||||
def getFragment(self):
|
||||
return base.TreeBuilder.getFragment(self)._element
|
||||
|
||||
return locals()
|
||||
|
||||
|
||||
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
392
lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py
Normal file
392
lib/bleach/_vendor/html5lib/treebuilders/etree_lxml.py
Normal file
|
@ -0,0 +1,392 @@
|
|||
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
||||
of the native library as possible, without using fragile hacks like custom element
|
||||
names that break between releases. The downside of this is that we cannot represent
|
||||
all possible trees; specifically the following are known to cause problems:
|
||||
|
||||
Text or comments as siblings of the root element
|
||||
Docypes with no name
|
||||
|
||||
When any of these things occur, we emit a DataLossWarning
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
# pylint:disable=protected-access
|
||||
|
||||
import warnings
|
||||
import re
|
||||
import sys
|
||||
|
||||
try:
|
||||
from collections.abc import MutableMapping
|
||||
except ImportError:
|
||||
from collections import MutableMapping
|
||||
|
||||
from . import base
|
||||
from ..constants import DataLossWarning
|
||||
from .. import constants
|
||||
from . import etree as etree_builders
|
||||
from .. import _ihatexml
|
||||
|
||||
import lxml.etree as etree
|
||||
from six import PY3, binary_type
|
||||
|
||||
|
||||
fullTree = True
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
comment_type = etree.Comment("asd").tag
|
||||
|
||||
|
||||
class DocumentType(object):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
self.name = name
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
|
||||
class Document(object):
|
||||
def __init__(self):
|
||||
self._elementTree = None
|
||||
self._childNodes = []
|
||||
|
||||
def appendChild(self, element):
|
||||
last = self._elementTree.getroot()
|
||||
for last in self._elementTree.getroot().itersiblings():
|
||||
pass
|
||||
|
||||
last.addnext(element._element)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
childNodes = property(_getChildNodes)
|
||||
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if not hasattr(element, "tag"):
|
||||
if hasattr(element, "getroot"):
|
||||
# Full tree case
|
||||
rv.append("#document")
|
||||
if element.docinfo.internalDTD:
|
||||
if not (element.docinfo.public_id or
|
||||
element.docinfo.system_url):
|
||||
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
|
||||
else:
|
||||
dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
|
||||
element.docinfo.root_name,
|
||||
element.docinfo.public_id,
|
||||
element.docinfo.system_url)
|
||||
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
|
||||
next_element = element.getroot()
|
||||
while next_element.getprevious() is not None:
|
||||
next_element = next_element.getprevious()
|
||||
while next_element is not None:
|
||||
serializeElement(next_element, indent + 2)
|
||||
next_element = next_element.getnext()
|
||||
elif isinstance(element, str) or isinstance(element, bytes):
|
||||
# Text in a fragment
|
||||
assert isinstance(element, str) or sys.version_info[0] == 2
|
||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||
else:
|
||||
# Fragment case
|
||||
rv.append("#document-fragment")
|
||||
for next_element in element:
|
||||
serializeElement(next_element, indent + 2)
|
||||
elif element.tag == comment_type:
|
||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
|
||||
else:
|
||||
assert isinstance(element, etree._Element)
|
||||
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
||||
if nsmatch is not None:
|
||||
ns = nsmatch.group(1)
|
||||
tag = nsmatch.group(2)
|
||||
prefix = constants.prefixes[ns]
|
||||
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
|
||||
infosetFilter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>" % (' ' * indent,
|
||||
infosetFilter.fromXmlName(element.tag)))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
attributes = []
|
||||
for name, value in element.attrib.items():
|
||||
nsmatch = tag_regexp.match(name)
|
||||
if nsmatch is not None:
|
||||
ns, name = nsmatch.groups()
|
||||
name = infosetFilter.fromXmlName(name)
|
||||
prefix = constants.prefixes[ns]
|
||||
attr_string = "%s %s" % (prefix, name)
|
||||
else:
|
||||
attr_string = infosetFilter.fromXmlName(name)
|
||||
attributes.append((attr_string, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||
indent += 2
|
||||
for child in element:
|
||||
serializeElement(child, indent)
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
|
||||
def serializeElement(element):
|
||||
if not hasattr(element, "tag"):
|
||||
if element.docinfo.internalDTD:
|
||||
if element.docinfo.doctype:
|
||||
dtd_str = element.docinfo.doctype
|
||||
else:
|
||||
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
|
||||
rv.append(dtd_str)
|
||||
serializeElement(element.getroot())
|
||||
|
||||
elif element.tag == comment_type:
|
||||
rv.append("<!--%s-->" % (element.text,))
|
||||
|
||||
else:
|
||||
# This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>" % (element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\"" % (name, value)
|
||||
for name, value in element.attrib.items()])
|
||||
rv.append("<%s %s>" % (element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element:
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>" % (element.tag,))
|
||||
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
|
||||
class TreeBuilder(base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = None
|
||||
commentClass = None
|
||||
fragmentClass = Document
|
||||
implementation = etree
|
||||
|
||||
def __init__(self, namespaceHTMLElements, fullTree=False):
|
||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
self.namespaceHTMLElements = namespaceHTMLElements
|
||||
|
||||
class Attributes(MutableMapping):
|
||||
def __init__(self, element):
|
||||
self._element = element
|
||||
|
||||
def _coerceKey(self, key):
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
||||
else:
|
||||
name = infosetFilter.coerceAttribute(key)
|
||||
return name
|
||||
|
||||
def __getitem__(self, key):
|
||||
value = self._element._element.attrib[self._coerceKey(key)]
|
||||
if not PY3 and isinstance(value, binary_type):
|
||||
value = value.decode("ascii")
|
||||
return value
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self._element._element.attrib[self._coerceKey(key)] = value
|
||||
|
||||
def __delitem__(self, key):
|
||||
del self._element._element.attrib[self._coerceKey(key)]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._element._element.attrib)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._element._element.attrib)
|
||||
|
||||
def clear(self):
|
||||
return self._element._element.attrib.clear()
|
||||
|
||||
class Element(builder.Element):
|
||||
def __init__(self, name, namespace):
|
||||
name = infosetFilter.coerceElement(name)
|
||||
builder.Element.__init__(self, name, namespace=namespace)
|
||||
self._attributes = Attributes(self)
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = infosetFilter.coerceElement(name)
|
||||
self._element.tag = self._getETreeTag(
|
||||
self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return infosetFilter.fromXmlName(self._name)
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._attributes
|
||||
|
||||
def _setAttributes(self, value):
|
||||
attributes = self.attributes
|
||||
attributes.clear()
|
||||
attributes.update(value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data = infosetFilter.coerceCharacters(data)
|
||||
builder.Element.insertText(self, data, insertBefore)
|
||||
|
||||
def cloneNode(self):
|
||||
element = type(self)(self.name, self.namespace)
|
||||
if self._element.attrib:
|
||||
element._element.attrib.update(self._element.attrib)
|
||||
return element
|
||||
|
||||
class Comment(builder.Comment):
|
||||
def __init__(self, data):
|
||||
data = infosetFilter.coerceComment(data)
|
||||
builder.Comment.__init__(self, data)
|
||||
|
||||
def _setData(self, data):
|
||||
data = infosetFilter.coerceComment(data)
|
||||
self._element.text = data
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
self.elementClass = Element
|
||||
self.commentClass = Comment
|
||||
# self.fragmentClass = builder.DocumentFragment
|
||||
base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def reset(self):
|
||||
base.TreeBuilder.reset(self)
|
||||
self.insertComment = self.insertCommentInitial
|
||||
self.initial_comments = []
|
||||
self.doctype = None
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._elementTree
|
||||
else:
|
||||
return self.document._elementTree.getroot()
|
||||
|
||||
def getFragment(self):
|
||||
fragment = []
|
||||
element = self.openElements[0]._element
|
||||
if element.text:
|
||||
fragment.append(element.text)
|
||||
fragment.extend(list(element))
|
||||
if element.tail:
|
||||
fragment.append(element.tail)
|
||||
return fragment
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if not name:
|
||||
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
|
||||
self.doctype = None
|
||||
else:
|
||||
coercedName = self.infosetFilter.coerceElement(name)
|
||||
if coercedName != name:
|
||||
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
|
||||
|
||||
doctype = self.doctypeClass(coercedName, publicId, systemId)
|
||||
self.doctype = doctype
|
||||
|
||||
def insertCommentInitial(self, data, parent=None):
|
||||
assert parent is None or parent is self.document
|
||||
assert self.document._elementTree is None
|
||||
self.initial_comments.append(data)
|
||||
|
||||
def insertCommentMain(self, data, parent=None):
|
||||
if (parent == self.document and
|
||||
self.document._elementTree.getroot()[-1].tag == comment_type):
|
||||
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
|
||||
super(TreeBuilder, self).insertComment(data, parent)
|
||||
|
||||
def insertRoot(self, token):
|
||||
# Because of the way libxml2 works, it doesn't seem to be possible to
|
||||
# alter information like the doctype after the tree has been parsed.
|
||||
# Therefore we need to use the built-in parser to create our initial
|
||||
# tree, after which we can add elements like normal
|
||||
docStr = ""
|
||||
if self.doctype:
|
||||
assert self.doctype.name
|
||||
docStr += "<!DOCTYPE %s" % self.doctype.name
|
||||
if (self.doctype.publicId is not None or
|
||||
self.doctype.systemId is not None):
|
||||
docStr += (' PUBLIC "%s" ' %
|
||||
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
|
||||
if self.doctype.systemId:
|
||||
sysid = self.doctype.systemId
|
||||
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
|
||||
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
|
||||
sysid = sysid.replace("'", 'U00027')
|
||||
if sysid.find("'") >= 0:
|
||||
docStr += '"%s"' % sysid
|
||||
else:
|
||||
docStr += "'%s'" % sysid
|
||||
else:
|
||||
docStr += "''"
|
||||
docStr += ">"
|
||||
if self.doctype.name != token["name"]:
|
||||
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
|
||||
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
||||
root = etree.fromstring(docStr)
|
||||
|
||||
# Append the initial comments:
|
||||
for comment_token in self.initial_comments:
|
||||
comment = self.commentClass(comment_token["data"])
|
||||
root.addprevious(comment._element)
|
||||
|
||||
# Create the root document and add the ElementTree to it
|
||||
self.document = self.documentClass()
|
||||
self.document._elementTree = root.getroottree()
|
||||
|
||||
# Give the root element the right name
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s" % (namespace, name)
|
||||
root.tag = etree_tag
|
||||
|
||||
# Add the root element to the internal child/open data structures
|
||||
root_element = self.elementClass(name, namespace)
|
||||
root_element._element = root
|
||||
self.document._childNodes.append(root_element)
|
||||
self.openElements.append(root_element)
|
||||
|
||||
# Reset to the default insert comment function
|
||||
self.insertComment = self.insertCommentMain
|
154
lib/bleach/_vendor/html5lib/treewalkers/__init__.py
Normal file
154
lib/bleach/_vendor/html5lib/treewalkers/__init__.py
Normal file
|
@ -0,0 +1,154 @@
|
|||
"""A collection of modules for iterating through different kinds of
|
||||
tree, generating tokens identical to those produced by the tokenizer
|
||||
module.
|
||||
|
||||
To create a tree walker for a new type of tree, you need to
|
||||
implement a tree walker object (called TreeWalker by convention) that
|
||||
implements a 'serialize' method which takes a tree as sole argument and
|
||||
returns an iterator which generates tokens.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .. import constants
|
||||
from .._utils import default_etree
|
||||
|
||||
__all__ = ["getTreeWalker", "pprint"]
|
||||
|
||||
treeWalkerCache = {}
|
||||
|
||||
|
||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeWalker class for various types of tree with built-in support
|
||||
|
||||
:arg str treeType: the name of the tree type required (case-insensitive).
|
||||
Supported values are:
|
||||
|
||||
* "dom": The xml.dom.minidom DOM implementation
|
||||
* "etree": A generic walker for tree implementations exposing an
|
||||
elementtree-like interface (known to work with ElementTree,
|
||||
cElementTree and lxml.etree).
|
||||
* "lxml": Optimized walker for lxml.etree
|
||||
* "genshi": a Genshi stream
|
||||
|
||||
:arg implementation: A module implementing the tree type e.g.
|
||||
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
|
||||
tree type only).
|
||||
|
||||
:arg kwargs: keyword arguments passed to the etree walker--for other
|
||||
walkers, this has no effect
|
||||
|
||||
:returns: a TreeWalker class
|
||||
|
||||
"""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeWalkerCache:
|
||||
if treeType == "dom":
|
||||
from . import dom
|
||||
treeWalkerCache[treeType] = dom.TreeWalker
|
||||
elif treeType == "genshi":
|
||||
from . import genshi
|
||||
treeWalkerCache[treeType] = genshi.TreeWalker
|
||||
elif treeType == "lxml":
|
||||
from . import etree_lxml
|
||||
treeWalkerCache[treeType] = etree_lxml.TreeWalker
|
||||
elif treeType == "etree":
|
||||
from . import etree
|
||||
if implementation is None:
|
||||
implementation = default_etree
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
return treeWalkerCache.get(treeType)
|
||||
|
||||
|
||||
def concatenateCharacterTokens(tokens):
|
||||
pendingCharacters = []
|
||||
for token in tokens:
|
||||
type = token["type"]
|
||||
if type in ("Characters", "SpaceCharacters"):
|
||||
pendingCharacters.append(token["data"])
|
||||
else:
|
||||
if pendingCharacters:
|
||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
||||
pendingCharacters = []
|
||||
yield token
|
||||
if pendingCharacters:
|
||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
||||
|
||||
|
||||
def pprint(walker):
|
||||
"""Pretty printer for tree walkers
|
||||
|
||||
Takes a TreeWalker instance and pretty prints the output of walking the tree.
|
||||
|
||||
:arg walker: a TreeWalker instance
|
||||
|
||||
"""
|
||||
output = []
|
||||
indent = 0
|
||||
for token in concatenateCharacterTokens(walker):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# tag name
|
||||
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
|
||||
if token["namespace"] in constants.prefixes:
|
||||
ns = constants.prefixes[token["namespace"]]
|
||||
else:
|
||||
ns = token["namespace"]
|
||||
name = "%s %s" % (ns, token["name"])
|
||||
else:
|
||||
name = token["name"]
|
||||
output.append("%s<%s>" % (" " * indent, name))
|
||||
indent += 2
|
||||
# attributes (sorted for consistent ordering)
|
||||
attrs = token["data"]
|
||||
for (namespace, localname), value in sorted(attrs.items()):
|
||||
if namespace:
|
||||
if namespace in constants.prefixes:
|
||||
ns = constants.prefixes[namespace]
|
||||
else:
|
||||
ns = namespace
|
||||
name = "%s %s" % (ns, localname)
|
||||
else:
|
||||
name = localname
|
||||
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
|
||||
# self-closing
|
||||
if type == "EmptyTag":
|
||||
indent -= 2
|
||||
|
||||
elif type == "EndTag":
|
||||
indent -= 2
|
||||
|
||||
elif type == "Comment":
|
||||
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
|
||||
|
||||
elif type == "Doctype":
|
||||
if token["name"]:
|
||||
if token["publicId"]:
|
||||
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
|
||||
(" " * indent,
|
||||
token["name"],
|
||||
token["publicId"],
|
||||
token["systemId"] if token["systemId"] else ""))
|
||||
elif token["systemId"]:
|
||||
output.append("""%s<!DOCTYPE %s "" "%s">""" %
|
||||
(" " * indent,
|
||||
token["name"],
|
||||
token["systemId"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE %s>" % (" " * indent,
|
||||
token["name"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE >" % (" " * indent,))
|
||||
|
||||
elif type == "Characters":
|
||||
output.append("%s\"%s\"" % (" " * indent, token["data"]))
|
||||
|
||||
elif type == "SpaceCharacters":
|
||||
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
|
||||
|
||||
else:
|
||||
raise ValueError("Unknown token type, %s" % type)
|
||||
|
||||
return "\n".join(output)
|
252
lib/bleach/_vendor/html5lib/treewalkers/base.py
Normal file
252
lib/bleach/_vendor/html5lib/treewalkers/base.py
Normal file
|
@ -0,0 +1,252 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from xml.dom import Node
|
||||
from ..constants import namespaces, voidElements, spaceCharacters
|
||||
|
||||
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
|
||||
"TreeWalker", "NonRecursiveTreeWalker"]
|
||||
|
||||
DOCUMENT = Node.DOCUMENT_NODE
|
||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||
TEXT = Node.TEXT_NODE
|
||||
ELEMENT = Node.ELEMENT_NODE
|
||||
COMMENT = Node.COMMENT_NODE
|
||||
ENTITY = Node.ENTITY_NODE
|
||||
UNKNOWN = "<#UNKNOWN#>"
|
||||
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
|
||||
class TreeWalker(object):
|
||||
"""Walks a tree yielding tokens
|
||||
|
||||
Tokens are dicts that all have a ``type`` field specifying the type of the
|
||||
token.
|
||||
|
||||
"""
|
||||
def __init__(self, tree):
|
||||
"""Creates a TreeWalker
|
||||
|
||||
:arg tree: the tree to walk
|
||||
|
||||
"""
|
||||
self.tree = tree
|
||||
|
||||
def __iter__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def error(self, msg):
|
||||
"""Generates an error token with the given message
|
||||
|
||||
:arg msg: the error message
|
||||
|
||||
:returns: SerializeError token
|
||||
|
||||
"""
|
||||
return {"type": "SerializeError", "data": msg}
|
||||
|
||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||
"""Generates an EmptyTag token
|
||||
|
||||
:arg namespace: the namespace of the token--can be ``None``
|
||||
|
||||
:arg name: the name of the element
|
||||
|
||||
:arg attrs: the attributes of the element as a dict
|
||||
|
||||
:arg hasChildren: whether or not to yield a SerializationError because
|
||||
this tag shouldn't have children
|
||||
|
||||
:returns: EmptyTag token
|
||||
|
||||
"""
|
||||
yield {"type": "EmptyTag", "name": name,
|
||||
"namespace": namespace,
|
||||
"data": attrs}
|
||||
if hasChildren:
|
||||
yield self.error("Void element has children")
|
||||
|
||||
def startTag(self, namespace, name, attrs):
|
||||
"""Generates a StartTag token
|
||||
|
||||
:arg namespace: the namespace of the token--can be ``None``
|
||||
|
||||
:arg name: the name of the element
|
||||
|
||||
:arg attrs: the attributes of the element as a dict
|
||||
|
||||
:returns: StartTag token
|
||||
|
||||
"""
|
||||
return {"type": "StartTag",
|
||||
"name": name,
|
||||
"namespace": namespace,
|
||||
"data": attrs}
|
||||
|
||||
def endTag(self, namespace, name):
|
||||
"""Generates an EndTag token
|
||||
|
||||
:arg namespace: the namespace of the token--can be ``None``
|
||||
|
||||
:arg name: the name of the element
|
||||
|
||||
:returns: EndTag token
|
||||
|
||||
"""
|
||||
return {"type": "EndTag",
|
||||
"name": name,
|
||||
"namespace": namespace}
|
||||
|
||||
def text(self, data):
|
||||
"""Generates SpaceCharacters and Characters tokens
|
||||
|
||||
Depending on what's in the data, this generates one or more
|
||||
``SpaceCharacters`` and ``Characters`` tokens.
|
||||
|
||||
For example:
|
||||
|
||||
>>> from html5lib.treewalkers.base import TreeWalker
|
||||
>>> # Give it an empty tree just so it instantiates
|
||||
>>> walker = TreeWalker([])
|
||||
>>> list(walker.text(''))
|
||||
[]
|
||||
>>> list(walker.text(' '))
|
||||
[{u'data': ' ', u'type': u'SpaceCharacters'}]
|
||||
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
|
||||
[{u'data': ' ', u'type': u'SpaceCharacters'},
|
||||
{u'data': u'abc', u'type': u'Characters'},
|
||||
{u'data': u' ', u'type': u'SpaceCharacters'}]
|
||||
|
||||
:arg data: the text data
|
||||
|
||||
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
|
||||
|
||||
"""
|
||||
data = data
|
||||
middle = data.lstrip(spaceCharacters)
|
||||
left = data[:len(data) - len(middle)]
|
||||
if left:
|
||||
yield {"type": "SpaceCharacters", "data": left}
|
||||
data = middle
|
||||
middle = data.rstrip(spaceCharacters)
|
||||
right = data[len(middle):]
|
||||
if middle:
|
||||
yield {"type": "Characters", "data": middle}
|
||||
if right:
|
||||
yield {"type": "SpaceCharacters", "data": right}
|
||||
|
||||
def comment(self, data):
|
||||
"""Generates a Comment token
|
||||
|
||||
:arg data: the comment
|
||||
|
||||
:returns: Comment token
|
||||
|
||||
"""
|
||||
return {"type": "Comment", "data": data}
|
||||
|
||||
def doctype(self, name, publicId=None, systemId=None):
|
||||
"""Generates a Doctype token
|
||||
|
||||
:arg name:
|
||||
|
||||
:arg publicId:
|
||||
|
||||
:arg systemId:
|
||||
|
||||
:returns: the Doctype token
|
||||
|
||||
"""
|
||||
return {"type": "Doctype",
|
||||
"name": name,
|
||||
"publicId": publicId,
|
||||
"systemId": systemId}
|
||||
|
||||
def entity(self, name):
|
||||
"""Generates an Entity token
|
||||
|
||||
:arg name: the entity name
|
||||
|
||||
:returns: an Entity token
|
||||
|
||||
"""
|
||||
return {"type": "Entity", "name": name}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
"""Handles unknown node types"""
|
||||
return self.error("Unknown node type: " + nodeType)
|
||||
|
||||
|
||||
class NonRecursiveTreeWalker(TreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getFirstChild(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getNextSibling(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getParentNode(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def __iter__(self):
|
||||
currentNode = self.tree
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
hasChildren = False
|
||||
|
||||
if type == DOCTYPE:
|
||||
yield self.doctype(*details)
|
||||
|
||||
elif type == TEXT:
|
||||
for token in self.text(*details):
|
||||
yield token
|
||||
|
||||
elif type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, attributes,
|
||||
hasChildren):
|
||||
yield token
|
||||
hasChildren = False
|
||||
else:
|
||||
yield self.startTag(namespace, name, attributes)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(details[0])
|
||||
|
||||
elif type == ENTITY:
|
||||
yield self.entity(details[0])
|
||||
|
||||
elif type == DOCUMENT:
|
||||
hasChildren = True
|
||||
|
||||
else:
|
||||
yield self.unknown(details[0])
|
||||
|
||||
if hasChildren:
|
||||
firstChild = self.getFirstChild(currentNode)
|
||||
else:
|
||||
firstChild = None
|
||||
|
||||
if firstChild is not None:
|
||||
currentNode = firstChild
|
||||
else:
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
if type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
||||
break
|
||||
nextSibling = self.getNextSibling(currentNode)
|
||||
if nextSibling is not None:
|
||||
currentNode = nextSibling
|
||||
break
|
||||
else:
|
||||
currentNode = self.getParentNode(currentNode)
|
43
lib/bleach/_vendor/html5lib/treewalkers/dom.py
Normal file
43
lib/bleach/_vendor/html5lib/treewalkers/dom.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
from . import base
|
||||
|
||||
|
||||
class TreeWalker(base.NonRecursiveTreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
return base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
|
||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||
return base.TEXT, node.nodeValue
|
||||
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
attrs = {}
|
||||
for attr in list(node.attributes.keys()):
|
||||
attr = node.getAttributeNode(attr)
|
||||
if attr.namespaceURI:
|
||||
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
||||
else:
|
||||
attrs[(None, attr.name)] = attr.value
|
||||
return (base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||
attrs, node.hasChildNodes())
|
||||
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
return base.COMMENT, node.nodeValue
|
||||
|
||||
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
else:
|
||||
return base.UNKNOWN, node.nodeType
|
||||
|
||||
def getFirstChild(self, node):
|
||||
return node.firstChild
|
||||
|
||||
def getNextSibling(self, node):
|
||||
return node.nextSibling
|
||||
|
||||
def getParentNode(self, node):
|
||||
return node.parentNode
|
131
lib/bleach/_vendor/html5lib/treewalkers/etree.py
Normal file
131
lib/bleach/_vendor/html5lib/treewalkers/etree.py
Normal file
|
@ -0,0 +1,131 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from collections import OrderedDict
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from . import base
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation):
|
||||
ElementTree = ElementTreeImplementation
|
||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||
|
||||
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
|
||||
"""Given the particular ElementTree representation, this implementation,
|
||||
to avoid using recursion, returns "nodes" as tuples with the following
|
||||
content:
|
||||
|
||||
1. The current element
|
||||
|
||||
2. The index of the element relative to its parent
|
||||
|
||||
3. A stack of ancestor elements
|
||||
|
||||
4. A flag "text", "tail" or None to indicate if the current node is a
|
||||
text node; either the text or tail of the current element (1)
|
||||
"""
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, _, _, flag = node
|
||||
if flag in ("text", "tail"):
|
||||
return base.TEXT, getattr(elt, flag)
|
||||
else:
|
||||
node = elt
|
||||
|
||||
if not(hasattr(node, "tag")):
|
||||
node = node.getroot()
|
||||
|
||||
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
elif node.tag == "<!DOCTYPE>":
|
||||
return (base.DOCTYPE, node.text,
|
||||
node.get("publicId"), node.get("systemId"))
|
||||
|
||||
elif node.tag == ElementTreeCommentType:
|
||||
return base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
assert isinstance(node.tag, string_types), type(node.tag)
|
||||
# This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(node.tag)
|
||||
if match:
|
||||
namespace, tag = match.groups()
|
||||
else:
|
||||
namespace = None
|
||||
tag = node.tag
|
||||
attrs = OrderedDict()
|
||||
for name, value in list(node.attrib.items()):
|
||||
match = tag_regexp.match(name)
|
||||
if match:
|
||||
attrs[(match.group(1), match.group(2))] = value
|
||||
else:
|
||||
attrs[(None, name)] = value
|
||||
return (base.ELEMENT, namespace, tag,
|
||||
attrs, len(node) or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
element, key, parents, flag = node, None, [], None
|
||||
|
||||
if flag in ("text", "tail"):
|
||||
return None
|
||||
else:
|
||||
if element.text:
|
||||
return element, key, parents, "text"
|
||||
elif len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getNextSibling(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
return None
|
||||
|
||||
if flag == "text":
|
||||
if len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
if element.tail and flag != "tail":
|
||||
return element, key, parents, "tail"
|
||||
elif key < len(parents[-1]) - 1:
|
||||
return parents[-1][key + 1], key + 1, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getParentNode(self, node):
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
return None
|
||||
|
||||
if flag == "text":
|
||||
if not parents:
|
||||
return element
|
||||
else:
|
||||
return element, key, parents, None
|
||||
else:
|
||||
parent = parents.pop()
|
||||
if not parents:
|
||||
return parent
|
||||
else:
|
||||
assert list(parents[-1]).count(parent) == 1
|
||||
return parent, list(parents[-1]).index(parent), parents, None
|
||||
|
||||
return locals()
|
||||
|
||||
|
||||
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
215
lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py
Normal file
215
lib/bleach/_vendor/html5lib/treewalkers/etree_lxml.py
Normal file
|
@ -0,0 +1,215 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from lxml import etree
|
||||
from ..treebuilders.etree import tag_regexp
|
||||
|
||||
from . import base
|
||||
|
||||
from .. import _ihatexml
|
||||
|
||||
|
||||
def ensure_str(s):
|
||||
if s is None:
|
||||
return None
|
||||
elif isinstance(s, text_type):
|
||||
return s
|
||||
else:
|
||||
return s.decode("ascii", "strict")
|
||||
|
||||
|
||||
class Root(object):
|
||||
def __init__(self, et):
|
||||
self.elementtree = et
|
||||
self.children = []
|
||||
|
||||
try:
|
||||
if et.docinfo.internalDTD:
|
||||
self.children.append(Doctype(self,
|
||||
ensure_str(et.docinfo.root_name),
|
||||
ensure_str(et.docinfo.public_id),
|
||||
ensure_str(et.docinfo.system_url)))
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
node = et.getroot()
|
||||
except AttributeError:
|
||||
node = et
|
||||
|
||||
while node.getprevious() is not None:
|
||||
node = node.getprevious()
|
||||
while node is not None:
|
||||
self.children.append(node)
|
||||
node = node.getnext()
|
||||
|
||||
self.text = None
|
||||
self.tail = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.children[key]
|
||||
|
||||
def getnext(self):
|
||||
return None
|
||||
|
||||
def __len__(self):
|
||||
return 1
|
||||
|
||||
|
||||
class Doctype(object):
|
||||
def __init__(self, root_node, name, public_id, system_id):
|
||||
self.root_node = root_node
|
||||
self.name = name
|
||||
self.public_id = public_id
|
||||
self.system_id = system_id
|
||||
|
||||
self.text = None
|
||||
self.tail = None
|
||||
|
||||
def getnext(self):
|
||||
return self.root_node.children[1]
|
||||
|
||||
|
||||
class FragmentRoot(Root):
|
||||
def __init__(self, children):
|
||||
self.children = [FragmentWrapper(self, child) for child in children]
|
||||
self.text = self.tail = None
|
||||
|
||||
def getnext(self):
|
||||
return None
|
||||
|
||||
|
||||
class FragmentWrapper(object):
|
||||
def __init__(self, fragment_root, obj):
|
||||
self.root_node = fragment_root
|
||||
self.obj = obj
|
||||
if hasattr(self.obj, 'text'):
|
||||
self.text = ensure_str(self.obj.text)
|
||||
else:
|
||||
self.text = None
|
||||
if hasattr(self.obj, 'tail'):
|
||||
self.tail = ensure_str(self.obj.tail)
|
||||
else:
|
||||
self.tail = None
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.obj, name)
|
||||
|
||||
def getnext(self):
|
||||
siblings = self.root_node.children
|
||||
idx = siblings.index(self)
|
||||
if idx < len(siblings) - 1:
|
||||
return siblings[idx + 1]
|
||||
else:
|
||||
return None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.obj[key]
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.obj)
|
||||
|
||||
def getparent(self):
|
||||
return None
|
||||
|
||||
def __str__(self):
|
||||
return str(self.obj)
|
||||
|
||||
def __unicode__(self):
|
||||
return str(self.obj)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.obj)
|
||||
|
||||
|
||||
class TreeWalker(base.NonRecursiveTreeWalker):
|
||||
def __init__(self, tree):
|
||||
# pylint:disable=redefined-variable-type
|
||||
if isinstance(tree, list):
|
||||
self.fragmentChildren = set(tree)
|
||||
tree = FragmentRoot(tree)
|
||||
else:
|
||||
self.fragmentChildren = set()
|
||||
tree = Root(tree)
|
||||
base.NonRecursiveTreeWalker.__init__(self, tree)
|
||||
self.filter = _ihatexml.InfosetFilter()
|
||||
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
return base.TEXT, ensure_str(getattr(node, key))
|
||||
|
||||
elif isinstance(node, Root):
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
elif isinstance(node, Doctype):
|
||||
return base.DOCTYPE, node.name, node.public_id, node.system_id
|
||||
|
||||
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
|
||||
return base.TEXT, ensure_str(node.obj)
|
||||
|
||||
elif node.tag == etree.Comment:
|
||||
return base.COMMENT, ensure_str(node.text)
|
||||
|
||||
elif node.tag == etree.Entity:
|
||||
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
|
||||
|
||||
else:
|
||||
# This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(ensure_str(node.tag))
|
||||
if match:
|
||||
namespace, tag = match.groups()
|
||||
else:
|
||||
namespace = None
|
||||
tag = ensure_str(node.tag)
|
||||
attrs = OrderedDict()
|
||||
for name, value in list(node.attrib.items()):
|
||||
name = ensure_str(name)
|
||||
value = ensure_str(value)
|
||||
match = tag_regexp.match(name)
|
||||
if match:
|
||||
attrs[(match.group(1), match.group(2))] = value
|
||||
else:
|
||||
attrs[(None, name)] = value
|
||||
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
||||
attrs, len(node) > 0 or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
assert not isinstance(node, tuple), "Text nodes have no children"
|
||||
|
||||
assert len(node) or node.text, "Node has no children"
|
||||
if node.text:
|
||||
return (node, "text")
|
||||
else:
|
||||
return node[0]
|
||||
|
||||
def getNextSibling(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
if key == "text":
|
||||
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
||||
# because node[0] might evaluate to False if it has no child element
|
||||
if len(node):
|
||||
return node[0]
|
||||
else:
|
||||
return None
|
||||
else: # tail
|
||||
return node.getnext()
|
||||
|
||||
return (node, "tail") if node.tail else node.getnext()
|
||||
|
||||
def getParentNode(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
if key == "text":
|
||||
return node
|
||||
# else: fallback to "normal" processing
|
||||
elif node in self.fragmentChildren:
|
||||
return None
|
||||
|
||||
return node.getparent()
|
69
lib/bleach/_vendor/html5lib/treewalkers/genshi.py
Normal file
69
lib/bleach/_vendor/html5lib/treewalkers/genshi.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from genshi.core import QName
|
||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
|
||||
from . import base
|
||||
|
||||
from ..constants import voidElements, namespaces
|
||||
|
||||
|
||||
class TreeWalker(base.TreeWalker):
|
||||
def __iter__(self):
|
||||
# Buffer the events so we can pass in the following one
|
||||
previous = None
|
||||
for event in self.tree:
|
||||
if previous is not None:
|
||||
for token in self.tokens(previous, event):
|
||||
yield token
|
||||
previous = event
|
||||
|
||||
# Don't forget the final event!
|
||||
if previous is not None:
|
||||
for token in self.tokens(previous, None):
|
||||
yield token
|
||||
|
||||
def tokens(self, event, next):
|
||||
kind, data, _ = event
|
||||
if kind == START:
|
||||
tag, attribs = data
|
||||
name = tag.localname
|
||||
namespace = tag.namespace
|
||||
converted_attribs = {}
|
||||
for k, v in attribs:
|
||||
if isinstance(k, QName):
|
||||
converted_attribs[(k.namespace, k.localname)] = v
|
||||
else:
|
||||
converted_attribs[(None, k)] = v
|
||||
|
||||
if namespace == namespaces["html"] and name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, converted_attribs,
|
||||
not next or next[0] != END or
|
||||
next[1] != tag):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(namespace, name, converted_attribs)
|
||||
|
||||
elif kind == END:
|
||||
name = data.localname
|
||||
namespace = data.namespace
|
||||
if namespace != namespaces["html"] or name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif kind == COMMENT:
|
||||
yield self.comment(data)
|
||||
|
||||
elif kind == TEXT:
|
||||
for token in self.text(data):
|
||||
yield token
|
||||
|
||||
elif kind == DOCTYPE:
|
||||
yield self.doctype(*data)
|
||||
|
||||
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
|
||||
START_CDATA, END_CDATA, PI):
|
||||
pass
|
||||
|
||||
else:
|
||||
yield self.unknown(kind)
|
1078
lib/bleach/_vendor/parse.py
Normal file
1078
lib/bleach/_vendor/parse.py
Normal file
File diff suppressed because it is too large
Load diff
3
lib/bleach/_vendor/vendor.txt
Normal file
3
lib/bleach/_vendor/vendor.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
html5lib==1.1 \
|
||||
--hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
|
||||
--hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f
|
14
lib/bleach/_vendor/vendor_install.sh
Normal file
14
lib/bleach/_vendor/vendor_install.sh
Normal file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
set -u
|
||||
set -o pipefail
|
||||
|
||||
BLEACH_VENDOR_DIR=${BLEACH_VENDOR_DIR:-"."}
|
||||
DEST=${DEST:-"."}
|
||||
|
||||
pip install --no-binary all --no-compile --no-deps -r "${BLEACH_VENDOR_DIR}/vendor.txt" --target "${DEST}"
|
||||
|
||||
# install Python 3.6.14 urllib.urlparse for #536
|
||||
curl --proto '=https' --tlsv1.2 -o "${DEST}/parse.py" https://raw.githubusercontent.com/python/cpython/v3.6.14/Lib/urllib/parse.py
|
||||
(cd "${DEST}" && sha256sum parse.py > parse.py.SHA256SUM)
|
|
@ -1,20 +1,32 @@
|
|||
"""A set of basic callbacks for bleach.linkify."""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def nofollow(attrs, new=False):
|
||||
if attrs['href'].startswith('mailto:'):
|
||||
href_key = (None, "href")
|
||||
|
||||
if href_key not in attrs:
|
||||
return attrs
|
||||
rel = [x for x in attrs.get('rel', '').split(' ') if x]
|
||||
if 'nofollow' not in [x.lower() for x in rel]:
|
||||
rel.append('nofollow')
|
||||
attrs['rel'] = ' '.join(rel)
|
||||
|
||||
if attrs[href_key].startswith("mailto:"):
|
||||
return attrs
|
||||
|
||||
rel_key = (None, "rel")
|
||||
rel_values = [val for val in attrs.get(rel_key, "").split(" ") if val]
|
||||
if "nofollow" not in [rel_val.lower() for rel_val in rel_values]:
|
||||
rel_values.append("nofollow")
|
||||
attrs[rel_key] = " ".join(rel_values)
|
||||
|
||||
return attrs
|
||||
|
||||
|
||||
def target_blank(attrs, new=False):
|
||||
if attrs['href'].startswith('mailto:'):
|
||||
href_key = (None, "href")
|
||||
|
||||
if href_key not in attrs:
|
||||
return attrs
|
||||
attrs['target'] = '_blank'
|
||||
|
||||
if attrs[href_key].startswith("mailto:"):
|
||||
return attrs
|
||||
|
||||
attrs[(None, "target")] = "_blank"
|
||||
return attrs
|
||||
|
|
|
@ -1,62 +0,0 @@
|
|||
import datetime
|
||||
from decimal import Decimal
|
||||
import types
|
||||
import six
|
||||
|
||||
|
||||
def is_protected_type(obj):
|
||||
"""Determine if the object instance is of a protected type.
|
||||
|
||||
Objects of protected types are preserved as-is when passed to
|
||||
force_unicode(strings_only=True).
|
||||
"""
|
||||
return isinstance(obj, (
|
||||
six.integer_types +
|
||||
(types.NoneType,
|
||||
datetime.datetime, datetime.date, datetime.time,
|
||||
float, Decimal))
|
||||
)
|
||||
|
||||
|
||||
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
|
||||
"""
|
||||
Similar to smart_text, except that lazy instances are resolved to
|
||||
strings, rather than kept as lazy objects.
|
||||
|
||||
If strings_only is True, don't convert (some) non-string-like objects.
|
||||
"""
|
||||
# Handle the common case first, saves 30-40% when s is an instance of
|
||||
# six.text_type. This function gets called often in that setting.
|
||||
if isinstance(s, six.text_type):
|
||||
return s
|
||||
if strings_only and is_protected_type(s):
|
||||
return s
|
||||
try:
|
||||
if not isinstance(s, six.string_types):
|
||||
if hasattr(s, '__unicode__'):
|
||||
s = s.__unicode__()
|
||||
else:
|
||||
if six.PY3:
|
||||
if isinstance(s, bytes):
|
||||
s = six.text_type(s, encoding, errors)
|
||||
else:
|
||||
s = six.text_type(s)
|
||||
else:
|
||||
s = six.text_type(bytes(s), encoding, errors)
|
||||
else:
|
||||
# Note: We use .decode() here, instead of six.text_type(s,
|
||||
# encoding, errors), so that if s is a SafeBytes, it ends up being
|
||||
# a SafeText at the end.
|
||||
s = s.decode(encoding, errors)
|
||||
except UnicodeDecodeError as e:
|
||||
if not isinstance(s, Exception):
|
||||
raise UnicodeDecodeError(*e.args)
|
||||
else:
|
||||
# If we get to here, the caller has passed in an Exception
|
||||
# subclass populated with non-ASCII bytestring data without a
|
||||
# working unicode method. Try to handle this without raising a
|
||||
# further exception by individually forcing the exception args
|
||||
# to unicode.
|
||||
s = ' '.join([force_unicode(arg, encoding, strings_only,
|
||||
errors) for arg in s])
|
||||
return s
|
665
lib/bleach/html5lib_shim.py
Normal file
665
lib/bleach/html5lib_shim.py
Normal file
|
@ -0,0 +1,665 @@
|
|||
# flake8: noqa
|
||||
"""
|
||||
Shim module between Bleach and html5lib. This makes it easier to upgrade the
|
||||
html5lib library without having to change a lot of code.
|
||||
"""
|
||||
|
||||
import re
|
||||
import string
|
||||
import warnings
|
||||
|
||||
# ignore html5lib deprecation warnings to use bleach; we are bleach
|
||||
# apply before we import submodules that import html5lib
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message="html5lib's sanitizer is deprecated",
|
||||
category=DeprecationWarning,
|
||||
module="bleach._vendor.html5lib",
|
||||
)
|
||||
|
||||
from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file
|
||||
HTMLParser,
|
||||
getTreeWalker,
|
||||
)
|
||||
from bleach._vendor.html5lib import (
|
||||
constants,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file
|
||||
namespaces,
|
||||
prefixes,
|
||||
)
|
||||
from bleach._vendor.html5lib.constants import (
|
||||
_ReparseException as ReparseException,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib.filters.base import (
|
||||
Filter,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib.filters.sanitizer import (
|
||||
allowed_protocols,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib.filters.sanitizer import (
|
||||
Filter as SanitizerFilter,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib._inputstream import (
|
||||
HTMLInputStream,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib.serializer import (
|
||||
escape,
|
||||
HTMLSerializer,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib._tokenizer import (
|
||||
attributeMap,
|
||||
HTMLTokenizer,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
from bleach._vendor.html5lib._trie import (
|
||||
Trie,
|
||||
) # noqa: E402 module level import not at top of file
|
||||
|
||||
|
||||
#: Map of entity name to expanded entity
|
||||
ENTITIES = constants.entities
|
||||
|
||||
#: Trie of html entity string -> character representation
|
||||
ENTITIES_TRIE = Trie(ENTITIES)
|
||||
|
||||
#: Token type constants--these never change
|
||||
TAG_TOKEN_TYPES = {
|
||||
constants.tokenTypes["StartTag"],
|
||||
constants.tokenTypes["EndTag"],
|
||||
constants.tokenTypes["EmptyTag"],
|
||||
}
|
||||
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
|
||||
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
|
||||
|
||||
|
||||
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
|
||||
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
|
||||
HTML_TAGS = [
|
||||
"a",
|
||||
"abbr",
|
||||
"address",
|
||||
"area",
|
||||
"article",
|
||||
"aside",
|
||||
"audio",
|
||||
"b",
|
||||
"base",
|
||||
"bdi",
|
||||
"bdo",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"button",
|
||||
"canvas",
|
||||
"caption",
|
||||
"cite",
|
||||
"code",
|
||||
"col",
|
||||
"colgroup",
|
||||
"data",
|
||||
"datalist",
|
||||
"dd",
|
||||
"del",
|
||||
"details",
|
||||
"dfn",
|
||||
"dialog",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"em",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"figcaption",
|
||||
"figure",
|
||||
"footer",
|
||||
"form",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"header",
|
||||
"hgroup",
|
||||
"hr",
|
||||
"html",
|
||||
"i",
|
||||
"iframe",
|
||||
"img",
|
||||
"input",
|
||||
"ins",
|
||||
"kbd",
|
||||
"keygen",
|
||||
"label",
|
||||
"legend",
|
||||
"li",
|
||||
"link",
|
||||
"map",
|
||||
"mark",
|
||||
"menu",
|
||||
"meta",
|
||||
"meter",
|
||||
"nav",
|
||||
"noscript",
|
||||
"object",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"output",
|
||||
"p",
|
||||
"param",
|
||||
"picture",
|
||||
"pre",
|
||||
"progress",
|
||||
"q",
|
||||
"rp",
|
||||
"rt",
|
||||
"ruby",
|
||||
"s",
|
||||
"samp",
|
||||
"script",
|
||||
"section",
|
||||
"select",
|
||||
"slot",
|
||||
"small",
|
||||
"source",
|
||||
"span",
|
||||
"strong",
|
||||
"style",
|
||||
"sub",
|
||||
"summary",
|
||||
"sup",
|
||||
"table",
|
||||
"tbody",
|
||||
"td",
|
||||
"template",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"th",
|
||||
"thead",
|
||||
"time",
|
||||
"title",
|
||||
"tr",
|
||||
"track",
|
||||
"u",
|
||||
"ul",
|
||||
"var",
|
||||
"video",
|
||||
"wbr",
|
||||
]
|
||||
|
||||
|
||||
class InputStreamWithMemory:
|
||||
"""Wraps an HTMLInputStream to remember characters since last <
|
||||
|
||||
This wraps existing HTMLInputStream classes to keep track of the stream
|
||||
since the last < which marked an open tag state.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, inner_stream):
|
||||
self._inner_stream = inner_stream
|
||||
self.reset = self._inner_stream.reset
|
||||
self.position = self._inner_stream.position
|
||||
self._buffer = []
|
||||
|
||||
@property
|
||||
def errors(self):
|
||||
return self._inner_stream.errors
|
||||
|
||||
@property
|
||||
def charEncoding(self):
|
||||
return self._inner_stream.charEncoding
|
||||
|
||||
@property
|
||||
def changeEncoding(self):
|
||||
return self._inner_stream.changeEncoding
|
||||
|
||||
def char(self):
|
||||
c = self._inner_stream.char()
|
||||
# char() can return None if EOF, so ignore that
|
||||
if c:
|
||||
self._buffer.append(c)
|
||||
return c
|
||||
|
||||
def charsUntil(self, characters, opposite=False):
|
||||
chars = self._inner_stream.charsUntil(characters, opposite=opposite)
|
||||
self._buffer.extend(list(chars))
|
||||
return chars
|
||||
|
||||
def unget(self, char):
|
||||
if self._buffer:
|
||||
self._buffer.pop(-1)
|
||||
return self._inner_stream.unget(char)
|
||||
|
||||
def get_tag(self):
|
||||
"""Returns the stream history since last '<'
|
||||
|
||||
Since the buffer starts at the last '<' as as seen by tagOpenState(),
|
||||
we know that everything from that point to when this method is called
|
||||
is the "tag" that is being tokenized.
|
||||
|
||||
"""
|
||||
return "".join(self._buffer)
|
||||
|
||||
def start_tag(self):
|
||||
"""Resets stream history to just '<'
|
||||
|
||||
This gets called by tagOpenState() which marks a '<' that denotes an
|
||||
open tag. Any time we see that, we reset the buffer.
|
||||
|
||||
"""
|
||||
self._buffer = ["<"]
|
||||
|
||||
|
||||
class BleachHTMLTokenizer(HTMLTokenizer):
|
||||
"""Tokenizer that doesn't consume character entities"""
|
||||
|
||||
def __init__(self, consume_entities=False, **kwargs):
|
||||
super(BleachHTMLTokenizer, self).__init__(**kwargs)
|
||||
|
||||
self.consume_entities = consume_entities
|
||||
|
||||
# Wrap the stream with one that remembers the history
|
||||
self.stream = InputStreamWithMemory(self.stream)
|
||||
|
||||
def __iter__(self):
|
||||
last_error_token = None
|
||||
|
||||
for token in super(BleachHTMLTokenizer, self).__iter__():
|
||||
if last_error_token is not None:
|
||||
if (
|
||||
last_error_token["data"] == "invalid-character-in-attribute-name"
|
||||
and token["type"] in TAG_TOKEN_TYPES
|
||||
and token.get("data")
|
||||
):
|
||||
# token["data"] is an html5lib attributeMap
|
||||
# (OrderedDict 3.7+ and dict otherwise)
|
||||
# of attr name to attr value
|
||||
#
|
||||
# Remove attribute names that have ', " or < in them
|
||||
# because those characters are invalid for attribute names.
|
||||
token["data"] = attributeMap(
|
||||
(attr_name, attr_value)
|
||||
for attr_name, attr_value in token["data"].items()
|
||||
if (
|
||||
'"' not in attr_name
|
||||
and "'" not in attr_name
|
||||
and "<" not in attr_name
|
||||
)
|
||||
)
|
||||
last_error_token = None
|
||||
yield token
|
||||
|
||||
elif (
|
||||
last_error_token["data"] == "expected-closing-tag-but-got-char"
|
||||
and self.parser.tags is not None
|
||||
and token["data"].lower().strip() not in self.parser.tags
|
||||
):
|
||||
# We've got either a malformed tag or a pseudo-tag or
|
||||
# something that html5lib wants to turn into a malformed
|
||||
# comment which Bleach clean() will drop so we interfere
|
||||
# with the token stream to handle it more correctly.
|
||||
#
|
||||
# If this is an allowed tag, it's malformed and we just let
|
||||
# the html5lib parser deal with it--we don't enter into this
|
||||
# block.
|
||||
#
|
||||
# If this is not an allowed tag, then we convert it to
|
||||
# characters and it'll get escaped in the sanitizer.
|
||||
token["data"] = self.stream.get_tag()
|
||||
token["type"] = CHARACTERS_TYPE
|
||||
|
||||
last_error_token = None
|
||||
yield token
|
||||
|
||||
elif token["type"] == PARSEERROR_TYPE:
|
||||
# If the token is a parse error, then let the last_error_token
|
||||
# go, and make token the new last_error_token
|
||||
yield last_error_token
|
||||
last_error_token = token
|
||||
|
||||
else:
|
||||
yield last_error_token
|
||||
yield token
|
||||
last_error_token = None
|
||||
|
||||
continue
|
||||
|
||||
# If the token is a ParseError, we hold on to it so we can get the
|
||||
# next token and potentially fix it.
|
||||
if token["type"] == PARSEERROR_TYPE:
|
||||
last_error_token = token
|
||||
continue
|
||||
|
||||
yield token
|
||||
|
||||
if last_error_token:
|
||||
yield last_error_token
|
||||
|
||||
def consumeEntity(self, allowedChar=None, fromAttribute=False):
|
||||
# If this tokenizer is set to consume entities, then we can let the
|
||||
# superclass do its thing.
|
||||
if self.consume_entities:
|
||||
return super(BleachHTMLTokenizer, self).consumeEntity(
|
||||
allowedChar, fromAttribute
|
||||
)
|
||||
|
||||
# If this tokenizer is set to not consume entities, then we don't want
|
||||
# to consume and convert them, so this overrides the html5lib tokenizer's
|
||||
# consumeEntity so that it's now a no-op.
|
||||
#
|
||||
# However, when that gets called, it's consumed an &, so we put that back in
|
||||
# the stream.
|
||||
if fromAttribute:
|
||||
self.currentToken["data"][-1][1] += "&"
|
||||
|
||||
else:
|
||||
self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
|
||||
|
||||
def tagOpenState(self):
|
||||
# This state marks a < that is either a StartTag, EndTag, EmptyTag,
|
||||
# or ParseError. In all cases, we want to drop any stream history
|
||||
# we've collected so far and we do that by calling start_tag() on
|
||||
# the input stream wrapper.
|
||||
self.stream.start_tag()
|
||||
return super(BleachHTMLTokenizer, self).tagOpenState()
|
||||
|
||||
def emitCurrentToken(self):
|
||||
token = self.currentToken
|
||||
|
||||
if (
|
||||
self.parser.tags is not None
|
||||
and token["type"] in TAG_TOKEN_TYPES
|
||||
and token["name"].lower() not in self.parser.tags
|
||||
):
|
||||
# If this is a start/end/empty tag for a tag that's not in our
|
||||
# allowed list, then it gets stripped or escaped. In both of these
|
||||
# cases it gets converted to a Characters token.
|
||||
if self.parser.strip:
|
||||
# If we're stripping the token, we just throw in an empty
|
||||
# string token.
|
||||
new_data = ""
|
||||
|
||||
else:
|
||||
# If we're escaping the token, we want to escape the exact
|
||||
# original string. Since tokenizing also normalizes data
|
||||
# and this is a tag-like thing, we've lost some information.
|
||||
# So we go back through the stream to get the original
|
||||
# string and use that.
|
||||
new_data = self.stream.get_tag()
|
||||
|
||||
new_token = {"type": CHARACTERS_TYPE, "data": new_data}
|
||||
|
||||
self.currentToken = new_token
|
||||
self.tokenQueue.append(new_token)
|
||||
self.state = self.dataState
|
||||
return
|
||||
|
||||
super(BleachHTMLTokenizer, self).emitCurrentToken()
|
||||
|
||||
|
||||
class BleachHTMLParser(HTMLParser):
|
||||
"""Parser that uses BleachHTMLTokenizer"""
|
||||
|
||||
def __init__(self, tags, strip, consume_entities, **kwargs):
|
||||
"""
|
||||
:arg tags: list of allowed tags--everything else is either stripped or
|
||||
escaped; if None, then this doesn't look at tags at all
|
||||
:arg strip: whether to strip disallowed tags (True) or escape them (False);
|
||||
if tags=None, then this doesn't have any effect
|
||||
:arg consume_entities: whether to consume entities (default behavior) or
|
||||
leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)
|
||||
|
||||
"""
|
||||
self.tags = [tag.lower() for tag in tags] if tags is not None else None
|
||||
self.strip = strip
|
||||
self.consume_entities = consume_entities
|
||||
super(BleachHTMLParser, self).__init__(**kwargs)
|
||||
|
||||
def _parse(
|
||||
self, stream, innerHTML=False, container="div", scripting=True, **kwargs
|
||||
):
|
||||
# set scripting=True to parse <noscript> as though JS is enabled to
|
||||
# match the expected context in browsers
|
||||
#
|
||||
# https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
|
||||
#
|
||||
# Override HTMLParser so we can swap out the tokenizer for our own.
|
||||
self.innerHTMLMode = innerHTML
|
||||
self.container = container
|
||||
self.scripting = scripting
|
||||
self.tokenizer = BleachHTMLTokenizer(
|
||||
stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
|
||||
)
|
||||
self.reset()
|
||||
|
||||
try:
|
||||
self.mainLoop()
|
||||
except ReparseException:
|
||||
self.reset()
|
||||
self.mainLoop()
|
||||
|
||||
|
||||
def convert_entity(value):
|
||||
"""Convert an entity (minus the & and ; part) into what it represents
|
||||
|
||||
This handles numeric, hex, and text entities.
|
||||
|
||||
:arg value: the string (minus the ``&`` and ``;`` part) to convert
|
||||
|
||||
:returns: unicode character or None if it's an ambiguous ampersand that
|
||||
doesn't match a character entity
|
||||
|
||||
"""
|
||||
if value[0] == "#":
|
||||
if len(value) < 2:
|
||||
return None
|
||||
|
||||
if value[1] in ("x", "X"):
|
||||
# hex-encoded code point
|
||||
int_as_string, base = value[2:], 16
|
||||
else:
|
||||
# decimal code point
|
||||
int_as_string, base = value[1:], 10
|
||||
|
||||
if int_as_string == "":
|
||||
return None
|
||||
|
||||
code_point = int(int_as_string, base)
|
||||
if 0 < code_point < 0x110000:
|
||||
return chr(code_point)
|
||||
else:
|
||||
return None
|
||||
|
||||
return ENTITIES.get(value, None)
|
||||
|
||||
|
||||
def convert_entities(text):
|
||||
"""Converts all found entities in the text
|
||||
|
||||
:arg text: the text to convert entities in
|
||||
|
||||
:returns: unicode text with converted entities
|
||||
|
||||
"""
|
||||
if "&" not in text:
|
||||
return text
|
||||
|
||||
new_text = []
|
||||
for part in next_possible_entity(text):
|
||||
if not part:
|
||||
continue
|
||||
|
||||
if part.startswith("&"):
|
||||
entity = match_entity(part)
|
||||
if entity is not None:
|
||||
converted = convert_entity(entity)
|
||||
|
||||
# If it's not an ambiguous ampersand, then replace with the
|
||||
# unicode character. Otherwise, we leave the entity in.
|
||||
if converted is not None:
|
||||
new_text.append(converted)
|
||||
remainder = part[len(entity) + 2 :]
|
||||
if part:
|
||||
new_text.append(remainder)
|
||||
continue
|
||||
|
||||
new_text.append(part)
|
||||
|
||||
return "".join(new_text)
|
||||
|
||||
|
||||
def match_entity(stream):
|
||||
"""Returns first entity in stream or None if no entity exists
|
||||
|
||||
Note: For Bleach purposes, entities must start with a "&" and end with
|
||||
a ";". This ignoresambiguous character entities that have no ";" at the
|
||||
end.
|
||||
|
||||
:arg stream: the character stream
|
||||
|
||||
:returns: ``None`` or the entity string without "&" or ";"
|
||||
|
||||
"""
|
||||
# Nix the & at the beginning
|
||||
if stream[0] != "&":
|
||||
raise ValueError('Stream should begin with "&"')
|
||||
|
||||
stream = stream[1:]
|
||||
|
||||
stream = list(stream)
|
||||
possible_entity = ""
|
||||
end_characters = "<&=;" + string.whitespace
|
||||
|
||||
# Handle number entities
|
||||
if stream and stream[0] == "#":
|
||||
possible_entity = "#"
|
||||
stream.pop(0)
|
||||
|
||||
if stream and stream[0] in ("x", "X"):
|
||||
allowed = "0123456789abcdefABCDEF"
|
||||
possible_entity += stream.pop(0)
|
||||
else:
|
||||
allowed = "0123456789"
|
||||
|
||||
# FIXME(willkg): Do we want to make sure these are valid number
|
||||
# entities? This doesn't do that currently.
|
||||
while stream and stream[0] not in end_characters:
|
||||
c = stream.pop(0)
|
||||
if c not in allowed:
|
||||
break
|
||||
possible_entity += c
|
||||
|
||||
if possible_entity and stream and stream[0] == ";":
|
||||
return possible_entity
|
||||
return None
|
||||
|
||||
# Handle character entities
|
||||
while stream and stream[0] not in end_characters:
|
||||
c = stream.pop(0)
|
||||
if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
|
||||
break
|
||||
possible_entity += c
|
||||
|
||||
if possible_entity and stream and stream[0] == ";":
|
||||
return possible_entity
|
||||
|
||||
return None
|
||||
|
||||
|
||||
AMP_SPLIT_RE = re.compile("(&)")
|
||||
|
||||
|
||||
def next_possible_entity(text):
|
||||
"""Takes a text and generates a list of possible entities
|
||||
|
||||
:arg text: the text to look at
|
||||
|
||||
:returns: generator where each part (except the first) starts with an
|
||||
"&"
|
||||
|
||||
"""
|
||||
for i, part in enumerate(AMP_SPLIT_RE.split(text)):
|
||||
if i == 0:
|
||||
yield part
|
||||
elif i % 2 == 0:
|
||||
yield "&" + part
|
||||
|
||||
|
||||
class BleachHTMLSerializer(HTMLSerializer):
|
||||
"""HTMLSerializer that undoes & -> & in attributes and sets
|
||||
escape_rcdata to True
|
||||
"""
|
||||
|
||||
# per the HTMLSerializer.__init__ docstring:
|
||||
#
|
||||
# Whether to escape characters that need to be
|
||||
# escaped within normal elements within rcdata elements such as
|
||||
# style.
|
||||
#
|
||||
escape_rcdata = True
|
||||
|
||||
def escape_base_amp(self, stoken):
|
||||
"""Escapes just bare & in HTML attribute values"""
|
||||
# First, undo escaping of &. We need to do this because html5lib's
|
||||
# HTMLSerializer expected the tokenizer to consume all the character
|
||||
# entities and convert them to their respective characters, but the
|
||||
# BleachHTMLTokenizer doesn't do that. For example, this fixes
|
||||
# &entity; back to &entity; .
|
||||
stoken = stoken.replace("&", "&")
|
||||
|
||||
# However, we do want all bare & that are not marking character
|
||||
# entities to be changed to &, so let's do that carefully here.
|
||||
for part in next_possible_entity(stoken):
|
||||
if not part:
|
||||
continue
|
||||
|
||||
if part.startswith("&"):
|
||||
entity = match_entity(part)
|
||||
# Only leave entities in that are not ambiguous. If they're
|
||||
# ambiguous, then we escape the ampersand.
|
||||
if entity is not None and convert_entity(entity) is not None:
|
||||
yield "&" + entity + ";"
|
||||
|
||||
# Length of the entity plus 2--one for & at the beginning
|
||||
# and one for ; at the end
|
||||
part = part[len(entity) + 2 :]
|
||||
if part:
|
||||
yield part
|
||||
continue
|
||||
|
||||
yield part.replace("&", "&")
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
"""Wrap HTMLSerializer.serialize and conver & to & in attribute values
|
||||
|
||||
Note that this converts & to & in attribute values where the & isn't
|
||||
already part of an unambiguous character entity.
|
||||
|
||||
"""
|
||||
in_tag = False
|
||||
after_equals = False
|
||||
|
||||
for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
|
||||
if in_tag:
|
||||
if stoken == ">":
|
||||
in_tag = False
|
||||
|
||||
elif after_equals:
|
||||
if stoken != '"':
|
||||
for part in self.escape_base_amp(stoken):
|
||||
yield part
|
||||
|
||||
after_equals = False
|
||||
continue
|
||||
|
||||
elif stoken == "=":
|
||||
after_equals = True
|
||||
|
||||
yield stoken
|
||||
else:
|
||||
if stoken.startswith("<"):
|
||||
in_tag = True
|
||||
yield stoken
|
574
lib/bleach/linkifier.py
Normal file
574
lib/bleach/linkifier.py
Normal file
|
@ -0,0 +1,574 @@
|
|||
import re
|
||||
|
||||
from bleach import callbacks as linkify_callbacks
|
||||
from bleach import html5lib_shim
|
||||
from bleach.utils import alphabetize_attributes
|
||||
|
||||
|
||||
#: List of default callbacks
|
||||
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
|
||||
|
||||
|
||||
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
|
||||
ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
|
||||
cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
|
||||
dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
|
||||
gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
|
||||
im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
|
||||
kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
|
||||
ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
|
||||
net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
|
||||
pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
|
||||
sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
|
||||
tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
|
||||
xn xxx ye yt yu za zm zw""".split()
|
||||
|
||||
# Make sure that .com doesn't get matched by .co first
|
||||
TLDS.reverse()
|
||||
|
||||
|
||||
def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
|
||||
"""Builds the url regex used by linkifier
|
||||
|
||||
If you want a different set of tlds or allowed protocols, pass those in
|
||||
and stomp on the existing ``url_re``::
|
||||
|
||||
from bleach import linkifier
|
||||
|
||||
my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
|
||||
|
||||
linker = LinkifyFilter(url_re=my_url_re)
|
||||
|
||||
"""
|
||||
return re.compile(
|
||||
r"""\(* # Match any opening parentheses.
|
||||
\b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)? # http://
|
||||
([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
||||
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
|
||||
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
||||
# except for # and ~, which happen in practice)
|
||||
""".format(
|
||||
"|".join(sorted(protocols)), "|".join(sorted(tlds))
|
||||
),
|
||||
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
||||
)
|
||||
|
||||
|
||||
URL_RE = build_url_re()
|
||||
|
||||
|
||||
PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
|
||||
|
||||
|
||||
def build_email_re(tlds=TLDS):
|
||||
"""Builds the email regex used by linkifier
|
||||
|
||||
If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
|
||||
|
||||
from bleach import linkifier
|
||||
|
||||
my_email_re = linkifier.build_email_re(my_tlds_list)
|
||||
|
||||
linker = LinkifyFilter(email_re=my_url_re)
|
||||
|
||||
"""
|
||||
# open and closing braces doubled below for format string
|
||||
return re.compile(
|
||||
r"""(?<!//)
|
||||
(([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
|
||||
(\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)* # dot-atom
|
||||
|^"([\001-\010\013\014\016-\037!#-\[\]-\177]
|
||||
|\\[\001-\011\013\014\016-\177])*" # quoted-string
|
||||
)@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0})) # domain
|
||||
""".format(
|
||||
"|".join(tlds)
|
||||
),
|
||||
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
|
||||
)
|
||||
|
||||
|
||||
EMAIL_RE = build_email_re()
|
||||
|
||||
|
||||
class Linker:
|
||||
"""Convert URL-like strings in an HTML fragment to links
|
||||
|
||||
This function converts strings that look like URLs, domain names and email
|
||||
addresses in text that may be an HTML fragment to links, while preserving:
|
||||
|
||||
1. links already in the string
|
||||
2. urls found in attributes
|
||||
3. email addresses
|
||||
|
||||
linkify does a best-effort approach and tries to recover from bad
|
||||
situations due to crazy text.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
callbacks=DEFAULT_CALLBACKS,
|
||||
skip_tags=None,
|
||||
parse_email=False,
|
||||
url_re=URL_RE,
|
||||
email_re=EMAIL_RE,
|
||||
recognized_tags=html5lib_shim.HTML_TAGS,
|
||||
):
|
||||
"""Creates a Linker instance
|
||||
|
||||
:arg list callbacks: list of callbacks to run when adjusting tag attributes;
|
||||
defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
|
||||
|
||||
:arg list skip_tags: list of tags that you don't want to linkify the
|
||||
contents of; for example, you could set this to ``['pre']`` to skip
|
||||
linkifying contents of ``pre`` tags
|
||||
|
||||
:arg bool parse_email: whether or not to linkify email addresses
|
||||
|
||||
:arg re url_re: url matching regex
|
||||
|
||||
:arg re email_re: email matching regex
|
||||
|
||||
:arg list-of-strings recognized_tags: the list of tags that linkify knows about;
|
||||
everything else gets escaped
|
||||
|
||||
:returns: linkified text as unicode
|
||||
|
||||
"""
|
||||
self.callbacks = callbacks
|
||||
self.skip_tags = skip_tags
|
||||
self.parse_email = parse_email
|
||||
self.url_re = url_re
|
||||
self.email_re = email_re
|
||||
|
||||
# Create a parser/tokenizer that allows all HTML tags and escapes
|
||||
# anything not in that list.
|
||||
self.parser = html5lib_shim.BleachHTMLParser(
|
||||
tags=recognized_tags,
|
||||
strip=False,
|
||||
consume_entities=True,
|
||||
namespaceHTMLElements=False,
|
||||
)
|
||||
self.walker = html5lib_shim.getTreeWalker("etree")
|
||||
self.serializer = html5lib_shim.BleachHTMLSerializer(
|
||||
quote_attr_values="always",
|
||||
omit_optional_tags=False,
|
||||
# linkify does not sanitize
|
||||
sanitize=False,
|
||||
# linkify alphabetizes
|
||||
alphabetical_attributes=False,
|
||||
)
|
||||
|
||||
def linkify(self, text):
|
||||
"""Linkify specified text
|
||||
|
||||
:arg str text: the text to add links to
|
||||
|
||||
:returns: linkified text as unicode
|
||||
|
||||
:raises TypeError: if ``text`` is not a text type
|
||||
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
raise TypeError("argument must be of text type")
|
||||
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
dom = self.parser.parseFragment(text)
|
||||
filtered = LinkifyFilter(
|
||||
source=self.walker(dom),
|
||||
callbacks=self.callbacks,
|
||||
skip_tags=self.skip_tags,
|
||||
parse_email=self.parse_email,
|
||||
url_re=self.url_re,
|
||||
email_re=self.email_re,
|
||||
)
|
||||
return self.serializer.render(filtered)
|
||||
|
||||
|
||||
class LinkifyFilter(html5lib_shim.Filter):
|
||||
"""html5lib filter that linkifies text
|
||||
|
||||
This will do the following:
|
||||
|
||||
* convert email addresses into links
|
||||
* convert urls into links
|
||||
* edit existing links by running them through callbacks--the default is to
|
||||
add a ``rel="nofollow"``
|
||||
|
||||
This filter can be used anywhere html5lib filters can be used.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
source,
|
||||
callbacks=DEFAULT_CALLBACKS,
|
||||
skip_tags=None,
|
||||
parse_email=False,
|
||||
url_re=URL_RE,
|
||||
email_re=EMAIL_RE,
|
||||
):
|
||||
"""Creates a LinkifyFilter instance
|
||||
|
||||
:arg TreeWalker source: stream
|
||||
|
||||
:arg list callbacks: list of callbacks to run when adjusting tag attributes;
|
||||
defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
|
||||
|
||||
:arg list skip_tags: list of tags that you don't want to linkify the
|
||||
contents of; for example, you could set this to ``['pre']`` to skip
|
||||
linkifying contents of ``pre`` tags
|
||||
|
||||
:arg bool parse_email: whether or not to linkify email addresses
|
||||
|
||||
:arg re url_re: url matching regex
|
||||
|
||||
:arg re email_re: email matching regex
|
||||
|
||||
"""
|
||||
super(LinkifyFilter, self).__init__(source)
|
||||
|
||||
self.callbacks = callbacks or []
|
||||
self.skip_tags = skip_tags or []
|
||||
self.parse_email = parse_email
|
||||
|
||||
self.url_re = url_re
|
||||
self.email_re = email_re
|
||||
|
||||
def apply_callbacks(self, attrs, is_new):
|
||||
"""Given an attrs dict and an is_new bool, runs through callbacks
|
||||
|
||||
Callbacks can return an adjusted attrs dict or ``None``. In the case of
|
||||
``None``, we stop going through callbacks and return that and the link
|
||||
gets dropped.
|
||||
|
||||
:arg dict attrs: map of ``(namespace, name)`` -> ``value``
|
||||
|
||||
:arg bool is_new: whether or not this link was added by linkify
|
||||
|
||||
:returns: adjusted attrs dict or ``None``
|
||||
|
||||
"""
|
||||
for cb in self.callbacks:
|
||||
attrs = cb(attrs, is_new)
|
||||
if attrs is None:
|
||||
return None
|
||||
return attrs
|
||||
|
||||
def extract_character_data(self, token_list):
|
||||
"""Extracts and squashes character sequences in a token stream"""
|
||||
# FIXME(willkg): This is a terrible idea. What it does is drop all the
|
||||
# tags from the token list and merge the Characters and SpaceCharacters
|
||||
# tokens into a single text.
|
||||
#
|
||||
# So something like this::
|
||||
#
|
||||
# "<span>" "<b>" "some text" "</b>" "</span>"
|
||||
#
|
||||
# gets converted to "some text".
|
||||
#
|
||||
# This gets used to figure out the ``_text`` fauxttribute value for
|
||||
# linkify callables.
|
||||
#
|
||||
# I'm not really sure how else to support that ``_text`` fauxttribute and
|
||||
# maintain some modicum of backwards compatibility with previous versions
|
||||
# of Bleach.
|
||||
|
||||
out = []
|
||||
for token in token_list:
|
||||
token_type = token["type"]
|
||||
if token_type in ["Characters", "SpaceCharacters"]:
|
||||
out.append(token["data"])
|
||||
|
||||
return "".join(out)
|
||||
|
||||
def handle_email_addresses(self, src_iter):
|
||||
"""Handle email addresses in character tokens"""
|
||||
for token in src_iter:
|
||||
if token["type"] == "Characters":
|
||||
text = token["data"]
|
||||
new_tokens = []
|
||||
end = 0
|
||||
|
||||
# For each email address we find in the text
|
||||
for match in self.email_re.finditer(text):
|
||||
if match.start() > end:
|
||||
new_tokens.append(
|
||||
{"type": "Characters", "data": text[end : match.start()]}
|
||||
)
|
||||
|
||||
# Run attributes through the callbacks to see what we
|
||||
# should do with this match
|
||||
attrs = {
|
||||
(None, "href"): "mailto:%s" % match.group(0),
|
||||
"_text": match.group(0),
|
||||
}
|
||||
attrs = self.apply_callbacks(attrs, True)
|
||||
|
||||
if attrs is None:
|
||||
# Just add the text--but not as a link
|
||||
new_tokens.append(
|
||||
{"type": "Characters", "data": match.group(0)}
|
||||
)
|
||||
|
||||
else:
|
||||
# Add an "a" tag for the new link
|
||||
_text = attrs.pop("_text", "")
|
||||
attrs = alphabetize_attributes(attrs)
|
||||
new_tokens.extend(
|
||||
[
|
||||
{"type": "StartTag", "name": "a", "data": attrs},
|
||||
{"type": "Characters", "data": str(_text)},
|
||||
{"type": "EndTag", "name": "a"},
|
||||
]
|
||||
)
|
||||
end = match.end()
|
||||
|
||||
if new_tokens:
|
||||
# Yield the adjusted set of tokens and then continue
|
||||
# through the loop
|
||||
if end < len(text):
|
||||
new_tokens.append({"type": "Characters", "data": text[end:]})
|
||||
|
||||
for new_token in new_tokens:
|
||||
yield new_token
|
||||
|
||||
continue
|
||||
|
||||
yield token
|
||||
|
||||
def strip_non_url_bits(self, fragment):
|
||||
"""Strips non-url bits from the url
|
||||
|
||||
This accounts for over-eager matching by the regex.
|
||||
|
||||
"""
|
||||
prefix = suffix = ""
|
||||
|
||||
while fragment:
|
||||
# Try removing ( from the beginning and, if it's balanced, from the
|
||||
# end, too
|
||||
if fragment.startswith("("):
|
||||
prefix = prefix + "("
|
||||
fragment = fragment[1:]
|
||||
|
||||
if fragment.endswith(")"):
|
||||
suffix = ")" + suffix
|
||||
fragment = fragment[:-1]
|
||||
continue
|
||||
|
||||
# Now try extraneous things from the end. For example, sometimes we
|
||||
# pick up ) at the end of a url, but the url is in a parenthesized
|
||||
# phrase like:
|
||||
#
|
||||
# "i looked at the site (at http://example.com)"
|
||||
|
||||
if fragment.endswith(")") and "(" not in fragment:
|
||||
fragment = fragment[:-1]
|
||||
suffix = ")" + suffix
|
||||
continue
|
||||
|
||||
# Handle commas
|
||||
if fragment.endswith(","):
|
||||
fragment = fragment[:-1]
|
||||
suffix = "," + suffix
|
||||
continue
|
||||
|
||||
# Handle periods
|
||||
if fragment.endswith("."):
|
||||
fragment = fragment[:-1]
|
||||
suffix = "." + suffix
|
||||
continue
|
||||
|
||||
# Nothing matched, so we're done
|
||||
break
|
||||
|
||||
return fragment, prefix, suffix
|
||||
|
||||
def handle_links(self, src_iter):
|
||||
"""Handle links in character tokens"""
|
||||
in_a = False # happens, if parse_email=True and if a mail was found
|
||||
for token in src_iter:
|
||||
if in_a:
|
||||
if token["type"] == "EndTag" and token["name"] == "a":
|
||||
in_a = False
|
||||
yield token
|
||||
continue
|
||||
elif token["type"] == "StartTag" and token["name"] == "a":
|
||||
in_a = True
|
||||
yield token
|
||||
continue
|
||||
if token["type"] == "Characters":
|
||||
text = token["data"]
|
||||
new_tokens = []
|
||||
end = 0
|
||||
|
||||
for match in self.url_re.finditer(text):
|
||||
if match.start() > end:
|
||||
new_tokens.append(
|
||||
{"type": "Characters", "data": text[end : match.start()]}
|
||||
)
|
||||
|
||||
url = match.group(0)
|
||||
prefix = suffix = ""
|
||||
|
||||
# Sometimes we pick up too much in the url match, so look for
|
||||
# bits we should drop and remove them from the match
|
||||
url, prefix, suffix = self.strip_non_url_bits(url)
|
||||
|
||||
# If there's no protocol, add one
|
||||
if PROTO_RE.search(url):
|
||||
href = url
|
||||
else:
|
||||
href = "http://%s" % url
|
||||
|
||||
attrs = {(None, "href"): href, "_text": url}
|
||||
attrs = self.apply_callbacks(attrs, True)
|
||||
|
||||
if attrs is None:
|
||||
# Just add the text
|
||||
new_tokens.append(
|
||||
{"type": "Characters", "data": prefix + url + suffix}
|
||||
)
|
||||
|
||||
else:
|
||||
# Add the "a" tag!
|
||||
if prefix:
|
||||
new_tokens.append({"type": "Characters", "data": prefix})
|
||||
|
||||
_text = attrs.pop("_text", "")
|
||||
attrs = alphabetize_attributes(attrs)
|
||||
|
||||
new_tokens.extend(
|
||||
[
|
||||
{"type": "StartTag", "name": "a", "data": attrs},
|
||||
{"type": "Characters", "data": str(_text)},
|
||||
{"type": "EndTag", "name": "a"},
|
||||
]
|
||||
)
|
||||
|
||||
if suffix:
|
||||
new_tokens.append({"type": "Characters", "data": suffix})
|
||||
|
||||
end = match.end()
|
||||
|
||||
if new_tokens:
|
||||
# Yield the adjusted set of tokens and then continue
|
||||
# through the loop
|
||||
if end < len(text):
|
||||
new_tokens.append({"type": "Characters", "data": text[end:]})
|
||||
|
||||
for new_token in new_tokens:
|
||||
yield new_token
|
||||
|
||||
continue
|
||||
|
||||
yield token
|
||||
|
||||
def handle_a_tag(self, token_buffer):
|
||||
"""Handle the "a" tag
|
||||
|
||||
This could adjust the link or drop it altogether depending on what the
|
||||
callbacks return.
|
||||
|
||||
This yields the new set of tokens.
|
||||
|
||||
"""
|
||||
a_token = token_buffer[0]
|
||||
if a_token["data"]:
|
||||
attrs = a_token["data"]
|
||||
else:
|
||||
attrs = {}
|
||||
text = self.extract_character_data(token_buffer)
|
||||
attrs["_text"] = text
|
||||
|
||||
attrs = self.apply_callbacks(attrs, False)
|
||||
|
||||
if attrs is None:
|
||||
# We're dropping the "a" tag and everything else and replacing
|
||||
# it with character data. So emit that token.
|
||||
yield {"type": "Characters", "data": text}
|
||||
|
||||
else:
|
||||
new_text = attrs.pop("_text", "")
|
||||
a_token["data"] = alphabetize_attributes(attrs)
|
||||
|
||||
if text == new_text:
|
||||
# The callbacks didn't change the text, so we yield the new "a"
|
||||
# token, then whatever else was there, then the end "a" token
|
||||
yield a_token
|
||||
for mem in token_buffer[1:]:
|
||||
yield mem
|
||||
|
||||
else:
|
||||
# If the callbacks changed the text, then we're going to drop
|
||||
# all the tokens between the start and end "a" tags and replace
|
||||
# it with the new text
|
||||
yield a_token
|
||||
yield {"type": "Characters", "data": str(new_text)}
|
||||
yield token_buffer[-1]
|
||||
|
||||
def __iter__(self):
|
||||
in_a = False
|
||||
in_skip_tag = None
|
||||
|
||||
token_buffer = []
|
||||
|
||||
for token in super(LinkifyFilter, self).__iter__():
|
||||
if in_a:
|
||||
# Handle the case where we're in an "a" tag--we want to buffer tokens
|
||||
# until we hit an end "a" tag.
|
||||
if token["type"] == "EndTag" and token["name"] == "a":
|
||||
# Add the end tag to the token buffer and then handle them
|
||||
# and yield anything returned
|
||||
token_buffer.append(token)
|
||||
for new_token in self.handle_a_tag(token_buffer):
|
||||
yield new_token
|
||||
|
||||
# Clear "a" related state and continue since we've yielded all
|
||||
# the tokens we're going to yield
|
||||
in_a = False
|
||||
token_buffer = []
|
||||
else:
|
||||
token_buffer.append(token)
|
||||
continue
|
||||
|
||||
if token["type"] in ["StartTag", "EmptyTag"]:
|
||||
if token["name"] in self.skip_tags:
|
||||
# Skip tags start a "special mode" where we don't linkify
|
||||
# anything until the end tag.
|
||||
in_skip_tag = token["name"]
|
||||
|
||||
elif token["name"] == "a":
|
||||
# The "a" tag is special--we switch to a slurp mode and
|
||||
# slurp all the tokens until the end "a" tag and then
|
||||
# figure out what to do with them there.
|
||||
in_a = True
|
||||
token_buffer.append(token)
|
||||
|
||||
# We buffer the start tag, so we don't want to yield it,
|
||||
# yet
|
||||
continue
|
||||
|
||||
elif in_skip_tag and self.skip_tags:
|
||||
# NOTE(willkg): We put this clause here since in_a and
|
||||
# switching in and out of in_a takes precedence.
|
||||
if token["type"] == "EndTag" and token["name"] == in_skip_tag:
|
||||
in_skip_tag = None
|
||||
|
||||
elif not in_a and not in_skip_tag and token["type"] == "Characters":
|
||||
new_stream = iter([token])
|
||||
if self.parse_email:
|
||||
new_stream = self.handle_email_addresses(new_stream)
|
||||
|
||||
new_stream = self.handle_links(new_stream)
|
||||
|
||||
for token in new_stream:
|
||||
yield token
|
||||
|
||||
# We've already yielded this token, so continue
|
||||
continue
|
||||
|
||||
yield token
|
|
@ -1,148 +1,645 @@
|
|||
from __future__ import unicode_literals
|
||||
from itertools import chain
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
import warnings
|
||||
|
||||
from html5lib.constants import tokenTypes
|
||||
from html5lib.sanitizer import HTMLSanitizerMixin
|
||||
from html5lib.tokenizer import HTMLTokenizer
|
||||
from bleach._vendor.parse import urlparse
|
||||
from xml.sax.saxutils import unescape
|
||||
|
||||
from bleach import html5lib_shim
|
||||
from bleach.utils import alphabetize_attributes
|
||||
|
||||
|
||||
PROTOS = HTMLSanitizerMixin.acceptable_protocols
|
||||
PROTOS.remove('feed')
|
||||
#: List of allowed tags
|
||||
ALLOWED_TAGS = [
|
||||
"a",
|
||||
"abbr",
|
||||
"acronym",
|
||||
"b",
|
||||
"blockquote",
|
||||
"code",
|
||||
"em",
|
||||
"i",
|
||||
"li",
|
||||
"ol",
|
||||
"strong",
|
||||
"ul",
|
||||
]
|
||||
|
||||
|
||||
class BleachSanitizerMixin(HTMLSanitizerMixin):
|
||||
"""Mixin to replace sanitize_token() and sanitize_css()."""
|
||||
#: Map of allowed attributes by tag
|
||||
ALLOWED_ATTRIBUTES = {
|
||||
"a": ["href", "title"],
|
||||
"abbr": ["title"],
|
||||
"acronym": ["title"],
|
||||
}
|
||||
|
||||
allowed_svg_properties = []
|
||||
#: List of allowed styles
|
||||
ALLOWED_STYLES = []
|
||||
|
||||
#: List of allowed protocols
|
||||
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
|
||||
|
||||
#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
|
||||
INVISIBLE_CHARACTERS = "".join(
|
||||
[chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
|
||||
)
|
||||
|
||||
#: Regexp for characters that are invisible
|
||||
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
|
||||
|
||||
#: String to replace invisible characters with. This can be a character, a
|
||||
#: string, or even a function that takes a Python re matchobj
|
||||
INVISIBLE_REPLACEMENT_CHAR = "?"
|
||||
|
||||
|
||||
class Cleaner:
|
||||
"""Cleaner for cleaning HTML fragments of malicious content
|
||||
|
||||
This cleaner is a security-focused function whose sole purpose is to remove
|
||||
malicious content from a string such that it can be displayed as content in
|
||||
a web page.
|
||||
|
||||
To use::
|
||||
|
||||
from bleach.sanitizer import Cleaner
|
||||
|
||||
cleaner = Cleaner()
|
||||
|
||||
for text in all_the_yucky_things:
|
||||
sanitized = cleaner.clean(text)
|
||||
|
||||
.. Note::
|
||||
|
||||
This cleaner is not designed to use to transform content to be used in
|
||||
non-web-page contexts.
|
||||
|
||||
.. Warning::
|
||||
|
||||
This cleaner is not thread-safe--the html parser has internal state.
|
||||
Create a separate cleaner per thread!
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tags=ALLOWED_TAGS,
|
||||
attributes=ALLOWED_ATTRIBUTES,
|
||||
styles=ALLOWED_STYLES,
|
||||
protocols=ALLOWED_PROTOCOLS,
|
||||
strip=False,
|
||||
strip_comments=True,
|
||||
filters=None,
|
||||
):
|
||||
"""Initializes a Cleaner
|
||||
|
||||
:arg list tags: allowed list of tags; defaults to
|
||||
``bleach.sanitizer.ALLOWED_TAGS``
|
||||
|
||||
:arg dict attributes: allowed attributes; can be a callable, list or dict;
|
||||
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
|
||||
|
||||
:arg list styles: allowed list of css styles; defaults to
|
||||
``bleach.sanitizer.ALLOWED_STYLES``
|
||||
|
||||
:arg list protocols: allowed list of protocols for links; defaults
|
||||
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
|
||||
|
||||
:arg bool strip: whether or not to strip disallowed elements
|
||||
|
||||
:arg bool strip_comments: whether or not to strip HTML comments
|
||||
|
||||
:arg list filters: list of html5lib Filter classes to pass streamed content through
|
||||
|
||||
.. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
|
||||
|
||||
.. Warning::
|
||||
|
||||
Using filters changes the output of ``bleach.Cleaner.clean``.
|
||||
Make sure the way the filters change the output are secure.
|
||||
|
||||
"""
|
||||
self.tags = tags
|
||||
self.attributes = attributes
|
||||
self.styles = styles
|
||||
self.protocols = protocols
|
||||
self.strip = strip
|
||||
self.strip_comments = strip_comments
|
||||
self.filters = filters or []
|
||||
|
||||
self.parser = html5lib_shim.BleachHTMLParser(
|
||||
tags=self.tags,
|
||||
strip=self.strip,
|
||||
consume_entities=False,
|
||||
namespaceHTMLElements=False,
|
||||
)
|
||||
self.walker = html5lib_shim.getTreeWalker("etree")
|
||||
self.serializer = html5lib_shim.BleachHTMLSerializer(
|
||||
quote_attr_values="always",
|
||||
omit_optional_tags=False,
|
||||
escape_lt_in_attrs=True,
|
||||
# We want to leave entities as they are without escaping or
|
||||
# resolving or expanding
|
||||
resolve_entities=False,
|
||||
# Bleach has its own sanitizer, so don't use the html5lib one
|
||||
sanitize=False,
|
||||
# Bleach sanitizer alphabetizes already, so don't use the html5lib one
|
||||
alphabetical_attributes=False,
|
||||
)
|
||||
|
||||
def clean(self, text):
|
||||
"""Cleans text and returns sanitized result as unicode
|
||||
|
||||
:arg str text: text to be cleaned
|
||||
|
||||
:returns: sanitized text as unicode
|
||||
|
||||
:raises TypeError: if ``text`` is not a text type
|
||||
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
message = (
|
||||
"argument cannot be of '{name}' type, must be of text type".format(
|
||||
name=text.__class__.__name__
|
||||
)
|
||||
)
|
||||
raise TypeError(message)
|
||||
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
dom = self.parser.parseFragment(text)
|
||||
filtered = BleachSanitizerFilter(
|
||||
source=self.walker(dom),
|
||||
# Bleach-sanitizer-specific things
|
||||
attributes=self.attributes,
|
||||
strip_disallowed_elements=self.strip,
|
||||
strip_html_comments=self.strip_comments,
|
||||
# html5lib-sanitizer things
|
||||
allowed_elements=self.tags,
|
||||
allowed_css_properties=self.styles,
|
||||
allowed_protocols=self.protocols,
|
||||
allowed_svg_properties=[],
|
||||
)
|
||||
|
||||
# Apply any filters after the BleachSanitizerFilter
|
||||
for filter_class in self.filters:
|
||||
filtered = filter_class(source=filtered)
|
||||
|
||||
return self.serializer.render(filtered)
|
||||
|
||||
|
||||
def attribute_filter_factory(attributes):
|
||||
"""Generates attribute filter function for the given attributes value
|
||||
|
||||
The attributes value can take one of several shapes. This returns a filter
|
||||
function appropriate to the attributes value. One nice thing about this is
|
||||
that there's less if/then shenanigans in the ``allow_token`` method.
|
||||
|
||||
"""
|
||||
if callable(attributes):
|
||||
return attributes
|
||||
|
||||
if isinstance(attributes, dict):
|
||||
|
||||
def _attr_filter(tag, attr, value):
|
||||
if tag in attributes:
|
||||
attr_val = attributes[tag]
|
||||
if callable(attr_val):
|
||||
return attr_val(tag, attr, value)
|
||||
|
||||
if attr in attr_val:
|
||||
return True
|
||||
|
||||
if "*" in attributes:
|
||||
attr_val = attributes["*"]
|
||||
if callable(attr_val):
|
||||
return attr_val(tag, attr, value)
|
||||
|
||||
return attr in attr_val
|
||||
|
||||
return False
|
||||
|
||||
return _attr_filter
|
||||
|
||||
if isinstance(attributes, list):
|
||||
|
||||
def _attr_filter(tag, attr, value):
|
||||
return attr in attributes
|
||||
|
||||
return _attr_filter
|
||||
|
||||
raise ValueError("attributes needs to be a callable, a list or a dict")
|
||||
|
||||
|
||||
class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
|
||||
"""html5lib Filter that sanitizes text
|
||||
|
||||
This filter can be used anywhere html5lib filters can be used.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
source,
|
||||
attributes=ALLOWED_ATTRIBUTES,
|
||||
strip_disallowed_elements=False,
|
||||
strip_html_comments=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Creates a BleachSanitizerFilter instance
|
||||
|
||||
:arg Treewalker source: stream
|
||||
|
||||
:arg list tags: allowed list of tags; defaults to
|
||||
``bleach.sanitizer.ALLOWED_TAGS``
|
||||
|
||||
:arg dict attributes: allowed attributes; can be a callable, list or dict;
|
||||
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
|
||||
|
||||
:arg list styles: allowed list of css styles; defaults to
|
||||
``bleach.sanitizer.ALLOWED_STYLES``
|
||||
|
||||
:arg list protocols: allowed list of protocols for links; defaults
|
||||
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
|
||||
|
||||
:arg bool strip_disallowed_elements: whether or not to strip disallowed
|
||||
elements
|
||||
|
||||
:arg bool strip_html_comments: whether or not to strip HTML comments
|
||||
|
||||
"""
|
||||
self.attr_filter = attribute_filter_factory(attributes)
|
||||
self.strip_disallowed_elements = strip_disallowed_elements
|
||||
self.strip_html_comments = strip_html_comments
|
||||
|
||||
# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message="html5lib's sanitizer is deprecated",
|
||||
category=DeprecationWarning,
|
||||
module="bleach._vendor.html5lib",
|
||||
)
|
||||
return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
|
||||
|
||||
def sanitize_stream(self, token_iterator):
|
||||
for token in token_iterator:
|
||||
ret = self.sanitize_token(token)
|
||||
|
||||
if not ret:
|
||||
continue
|
||||
|
||||
if isinstance(ret, list):
|
||||
for subtoken in ret:
|
||||
yield subtoken
|
||||
else:
|
||||
yield ret
|
||||
|
||||
def merge_characters(self, token_iterator):
|
||||
"""Merge consecutive Characters tokens in a stream"""
|
||||
characters_buffer = []
|
||||
|
||||
for token in token_iterator:
|
||||
if characters_buffer:
|
||||
if token["type"] == "Characters":
|
||||
characters_buffer.append(token)
|
||||
continue
|
||||
else:
|
||||
# Merge all the characters tokens together into one and then
|
||||
# operate on it.
|
||||
new_token = {
|
||||
"data": "".join(
|
||||
[char_token["data"] for char_token in characters_buffer]
|
||||
),
|
||||
"type": "Characters",
|
||||
}
|
||||
characters_buffer = []
|
||||
yield new_token
|
||||
|
||||
elif token["type"] == "Characters":
|
||||
characters_buffer.append(token)
|
||||
continue
|
||||
|
||||
yield token
|
||||
|
||||
new_token = {
|
||||
"data": "".join([char_token["data"] for char_token in characters_buffer]),
|
||||
"type": "Characters",
|
||||
}
|
||||
yield new_token
|
||||
|
||||
def __iter__(self):
|
||||
return self.merge_characters(
|
||||
self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
|
||||
)
|
||||
|
||||
def sanitize_token(self, token):
|
||||
"""Sanitize a token either by HTML-encoding or dropping.
|
||||
|
||||
Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
|
||||
a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
|
||||
Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
|
||||
['attribute', 'pairs'], 'tag': callable}.
|
||||
|
||||
Here callable is a function with two arguments of attribute name
|
||||
and value. It should return true of false.
|
||||
Here callable is a function with two arguments of attribute name and
|
||||
value. It should return true of false.
|
||||
|
||||
Also gives the option to strip tags instead of encoding.
|
||||
|
||||
"""
|
||||
if (getattr(self, 'wildcard_attributes', None) is None and
|
||||
isinstance(self.allowed_attributes, dict)):
|
||||
self.wildcard_attributes = self.allowed_attributes.get('*', [])
|
||||
:arg dict token: token to sanitize
|
||||
|
||||
:returns: token or list of tokens
|
||||
|
||||
"""
|
||||
token_type = token["type"]
|
||||
if token_type in ["StartTag", "EndTag", "EmptyTag"]:
|
||||
if token["name"] in self.allowed_elements:
|
||||
return self.allow_token(token)
|
||||
|
||||
if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
|
||||
tokenTypes['EmptyTag']):
|
||||
if token['name'] in self.allowed_elements:
|
||||
if 'data' in token:
|
||||
if isinstance(self.allowed_attributes, dict):
|
||||
allowed_attributes = self.allowed_attributes.get(
|
||||
token['name'], [])
|
||||
#print callable(allowed_attributes)
|
||||
if not callable(allowed_attributes):
|
||||
allowed_attributes += self.wildcard_attributes
|
||||
else:
|
||||
allowed_attributes = self.allowed_attributes
|
||||
attrs = dict([(name, val) for name, val in
|
||||
token['data'][::-1]
|
||||
if (allowed_attributes(name, val)
|
||||
if callable(allowed_attributes)
|
||||
else name in allowed_attributes)])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if attr not in attrs:
|
||||
continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
# Remove replacement characters from unescaped
|
||||
# characters.
|
||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||
if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
|
||||
and (val_unescaped.split(':')[0] not in
|
||||
self.allowed_protocols)):
|
||||
del attrs[attr]
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token['name'] in self.svg_allow_local_href and
|
||||
'xlink:href' in attrs and
|
||||
re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
|
||||
del attrs['xlink:href']
|
||||
if 'style' in attrs:
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token['data'] = [(name, val) for name, val in
|
||||
attrs.items()]
|
||||
return token
|
||||
elif self.strip_disallowed_elements:
|
||||
pass
|
||||
return None
|
||||
|
||||
else:
|
||||
if token['type'] == tokenTypes['EndTag']:
|
||||
token['data'] = '</{0!s}>'.format(token['name'])
|
||||
elif token['data']:
|
||||
attr = ' {0!s}="{1!s}"'
|
||||
attrs = ''.join([attr.format(k, escape(v)) for k, v in
|
||||
token['data']])
|
||||
token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
|
||||
else:
|
||||
token['data'] = '<{0!s}>'.format(token['name'])
|
||||
if token['selfClosing']:
|
||||
token['data'] = token['data'][:-1] + '/>'
|
||||
token['type'] = tokenTypes['Characters']
|
||||
del token["name"]
|
||||
return token
|
||||
elif token['type'] == tokenTypes['Comment']:
|
||||
if "data" in token:
|
||||
# Alphabetize the attributes before calling .disallowed_token()
|
||||
# so that the resulting string is stable
|
||||
token["data"] = alphabetize_attributes(token["data"])
|
||||
return self.disallowed_token(token)
|
||||
|
||||
elif token_type == "Comment":
|
||||
if not self.strip_html_comments:
|
||||
# call lxml.sax.saxutils to escape &, <, and > in addition to " and '
|
||||
token["data"] = html5lib_shim.escape(
|
||||
token["data"], entities={'"': """, "'": "'"}
|
||||
)
|
||||
return token
|
||||
else:
|
||||
return None
|
||||
|
||||
elif token_type == "Characters":
|
||||
return self.sanitize_characters(token)
|
||||
|
||||
else:
|
||||
return token
|
||||
|
||||
def sanitize_characters(self, token):
|
||||
"""Handles Characters tokens
|
||||
|
||||
Our overridden tokenizer doesn't do anything with entities. However,
|
||||
that means that the serializer will convert all ``&`` in Characters
|
||||
tokens to ``&``.
|
||||
|
||||
Since we don't want that, we extract entities here and convert them to
|
||||
Entity tokens so the serializer will let them be.
|
||||
|
||||
:arg token: the Characters token to work on
|
||||
|
||||
:returns: a list of tokens
|
||||
|
||||
"""
|
||||
data = token.get("data", "")
|
||||
|
||||
if not data:
|
||||
return token
|
||||
|
||||
data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
|
||||
token["data"] = data
|
||||
|
||||
# If there isn't a & in the data, we can return now
|
||||
if "&" not in data:
|
||||
return token
|
||||
|
||||
new_tokens = []
|
||||
|
||||
# For each possible entity that starts with a "&", we try to extract an
|
||||
# actual entity and re-tokenize accordingly
|
||||
for part in html5lib_shim.next_possible_entity(data):
|
||||
if not part:
|
||||
continue
|
||||
|
||||
if part.startswith("&"):
|
||||
entity = html5lib_shim.match_entity(part)
|
||||
if entity is not None:
|
||||
if entity == "amp":
|
||||
# LinkifyFilter can't match urls across token boundaries
|
||||
# which is problematic with & since that shows up in
|
||||
# querystrings all the time. This special-cases &
|
||||
# and converts it to a & and sticks it in as a
|
||||
# Characters token. It'll get merged with surrounding
|
||||
# tokens in the BleachSanitizerfilter.__iter__ and
|
||||
# escaped in the serializer.
|
||||
new_tokens.append({"type": "Characters", "data": "&"})
|
||||
else:
|
||||
new_tokens.append({"type": "Entity", "name": entity})
|
||||
|
||||
# Length of the entity plus 2--one for & at the beginning
|
||||
# and one for ; at the end
|
||||
remainder = part[len(entity) + 2 :]
|
||||
if remainder:
|
||||
new_tokens.append({"type": "Characters", "data": remainder})
|
||||
continue
|
||||
|
||||
new_tokens.append({"type": "Characters", "data": part})
|
||||
|
||||
return new_tokens
|
||||
|
||||
def sanitize_uri_value(self, value, allowed_protocols):
|
||||
"""Checks a uri value to see if it's allowed
|
||||
|
||||
:arg value: the uri value to sanitize
|
||||
:arg allowed_protocols: list of allowed protocols
|
||||
|
||||
:returns: allowed value or None
|
||||
|
||||
"""
|
||||
# NOTE(willkg): This transforms the value into one that's easier to
|
||||
# match and verify, but shouldn't get returned since it's vastly
|
||||
# different than the original value.
|
||||
|
||||
# Convert all character entities in the value
|
||||
new_value = html5lib_shim.convert_entities(value)
|
||||
|
||||
# Nix backtick, space characters, and control characters
|
||||
new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
|
||||
|
||||
# Remove REPLACEMENT characters
|
||||
new_value = new_value.replace("\ufffd", "")
|
||||
|
||||
# Lowercase it--this breaks the value, but makes it easier to match
|
||||
# against
|
||||
new_value = new_value.lower()
|
||||
|
||||
try:
|
||||
# Drop attributes with uri values that have protocols that aren't
|
||||
# allowed
|
||||
parsed = urlparse(new_value)
|
||||
except ValueError:
|
||||
# URI is impossible to parse, therefore it's not allowed
|
||||
return None
|
||||
|
||||
if parsed.scheme:
|
||||
# If urlparse found a scheme, check that
|
||||
if parsed.scheme in allowed_protocols:
|
||||
return value
|
||||
|
||||
else:
|
||||
# Allow uris that are just an anchor
|
||||
if new_value.startswith("#"):
|
||||
return value
|
||||
|
||||
# Handle protocols that urlparse doesn't recognize like "myprotocol"
|
||||
if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
|
||||
return value
|
||||
|
||||
# If there's no protocol/scheme specified, then assume it's "http"
|
||||
# and see if that's allowed
|
||||
if "http" in allowed_protocols:
|
||||
return value
|
||||
|
||||
return None
|
||||
|
||||
def allow_token(self, token):
|
||||
"""Handles the case where we're allowing the tag"""
|
||||
if "data" in token:
|
||||
# Loop through all the attributes and drop the ones that are not
|
||||
# allowed, are unsafe or break other rules. Additionally, fix
|
||||
# attribute values that need fixing.
|
||||
#
|
||||
# At the end of this loop, we have the final set of attributes
|
||||
# we're keeping.
|
||||
attrs = {}
|
||||
for namespaced_name, val in token["data"].items():
|
||||
namespace, name = namespaced_name
|
||||
|
||||
# Drop attributes that are not explicitly allowed
|
||||
#
|
||||
# NOTE(willkg): We pass in the attribute name--not a namespaced
|
||||
# name.
|
||||
if not self.attr_filter(token["name"], name, val):
|
||||
continue
|
||||
|
||||
# Drop attributes with uri values that use a disallowed protocol
|
||||
# Sanitize attributes with uri values
|
||||
if namespaced_name in self.attr_val_is_uri:
|
||||
new_value = self.sanitize_uri_value(val, self.allowed_protocols)
|
||||
if new_value is None:
|
||||
continue
|
||||
val = new_value
|
||||
|
||||
# Drop values in svg attrs with non-local IRIs
|
||||
if namespaced_name in self.svg_attr_val_allows_ref:
|
||||
new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
|
||||
new_val = new_val.strip()
|
||||
if not new_val:
|
||||
continue
|
||||
|
||||
else:
|
||||
# Replace the val with the unescaped version because
|
||||
# it's a iri
|
||||
val = new_val
|
||||
|
||||
# Drop href and xlink:href attr for svg elements with non-local IRIs
|
||||
if (None, token["name"]) in self.svg_allow_local_href:
|
||||
if namespaced_name in [
|
||||
(None, "href"),
|
||||
(html5lib_shim.namespaces["xlink"], "href"),
|
||||
]:
|
||||
if re.search(r"^\s*[^#\s]", val):
|
||||
continue
|
||||
|
||||
# If it's a style attribute, sanitize it
|
||||
if namespaced_name == (None, "style"):
|
||||
val = self.sanitize_css(val)
|
||||
|
||||
# At this point, we want to keep the attribute, so add it in
|
||||
attrs[namespaced_name] = val
|
||||
|
||||
token["data"] = alphabetize_attributes(attrs)
|
||||
|
||||
return token
|
||||
|
||||
def disallowed_token(self, token):
|
||||
token_type = token["type"]
|
||||
if token_type == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
|
||||
elif token["data"]:
|
||||
assert token_type in ("StartTag", "EmptyTag")
|
||||
attrs = []
|
||||
for (ns, name), v in token["data"].items():
|
||||
# If we end up with a namespace, but no name, switch them so we
|
||||
# have a valid name to use.
|
||||
if ns and not name:
|
||||
ns, name = name, ns
|
||||
|
||||
# Figure out namespaced name if the namespace is appropriate
|
||||
# and exists; if the ns isn't in prefixes, then drop it.
|
||||
if ns is None or ns not in html5lib_shim.prefixes:
|
||||
namespaced_name = name
|
||||
else:
|
||||
namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)
|
||||
|
||||
attrs.append(
|
||||
' %s="%s"'
|
||||
% (
|
||||
namespaced_name,
|
||||
# NOTE(willkg): HTMLSerializer escapes attribute values
|
||||
# already, so if we do it here (like HTMLSerializer does),
|
||||
# then we end up double-escaping.
|
||||
v,
|
||||
)
|
||||
)
|
||||
token["data"] = "<%s%s>" % (token["name"], "".join(attrs))
|
||||
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
|
||||
if token.get("selfClosing"):
|
||||
token["data"] = token["data"][:-1] + "/>"
|
||||
|
||||
token["type"] = "Characters"
|
||||
|
||||
del token["name"]
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
"""HTMLSanitizerMixin.sanitize_css replacement.
|
||||
"""Sanitizes css in style tags"""
|
||||
# Convert entities in the style so that it can be parsed as CSS
|
||||
style = html5lib_shim.convert_entities(style)
|
||||
|
||||
HTMLSanitizerMixin.sanitize_css always whitelists background-*,
|
||||
border-*, margin-*, and padding-*. We only whitelist what's in
|
||||
the whitelist.
|
||||
# Drop any url values before we do anything else
|
||||
style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)
|
||||
|
||||
"""
|
||||
# disallow urls
|
||||
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||
# The gauntlet of sanitization
|
||||
|
||||
# Validate the css in the style tag and if it's not valid, then drop
|
||||
# the whole thing.
|
||||
parts = style.split(";")
|
||||
gauntlet = re.compile(
|
||||
r"""^( # consider a style attribute value as composed of:
|
||||
[/:,#%!.\s\w] # a non-newline character
|
||||
|\w-\w # 3 characters in the form \w-\w
|
||||
|'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space
|
||||
|"[\s\w]+" # a double quoted string of [\s\w]+
|
||||
|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
|
||||
)*$""", # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
|
||||
flags=re.U | re.VERBOSE,
|
||||
)
|
||||
|
||||
# gauntlet
|
||||
# TODO: Make sure this does what it's meant to - I *think* it wants to
|
||||
# validate style attribute contents.
|
||||
parts = style.split(';')
|
||||
gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
|
||||
"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
|
||||
for part in parts:
|
||||
if not gauntlet.match(part):
|
||||
return ''
|
||||
return ""
|
||||
|
||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||
return ''
|
||||
if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||
return ""
|
||||
|
||||
clean = []
|
||||
for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
|
||||
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
|
||||
if not value:
|
||||
continue
|
||||
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
clean.append(prop + ": " + value + ";")
|
||||
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
clean.append(prop + ": " + value + ";")
|
||||
|
||||
return ' '.join(clean)
|
||||
|
||||
|
||||
class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||
lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||
lowercaseElementName, lowercaseAttrName,
|
||||
**kwargs)
|
||||
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
|
||||
return " ".join(clean)
|
||||
|
|
21
lib/bleach/utils.py
Normal file
21
lib/bleach/utils.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from collections import OrderedDict
|
||||
|
||||
|
||||
def _attr_key(attr):
|
||||
"""Returns appropriate key for sorting attribute names
|
||||
|
||||
Attribute names are a tuple of ``(namespace, name)`` where namespace can be
|
||||
``None`` or a string. These can't be compared in Python 3, so we conver the
|
||||
``None`` to an empty string.
|
||||
|
||||
"""
|
||||
key = (attr[0][0] or ""), attr[0][1]
|
||||
return key
|
||||
|
||||
|
||||
def alphabetize_attributes(attrs):
|
||||
"""Takes a dict of attributes (or None) and returns them alphabetized"""
|
||||
if not attrs:
|
||||
return attrs
|
||||
|
||||
return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)])
|
Loading…
Add table
Add a link
Reference in a new issue