Update html5lib-1.1

This commit is contained in:
JonnyWong16 2021-10-14 22:49:47 -07:00
parent 3a116486e7
commit 586fd15464
No known key found for this signature in database
GPG key ID: B1F1F9807184697A
142 changed files with 90234 additions and 2393 deletions

View file

@ -1,56 +1,68 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
"""A collection of modules for building different kinds of trees from HTML
documents.
To create a treebuilder for a new type of tree, you need to implement
several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
1. A set of classes for various types of elements: Document, Doctype, Comment,
Element. These must implement the interface of ``treebuilders.base.Node``
(although comment nodes have a different signature for their constructor,
see ``treebuilders.etree.Comment``). Textual content may also be implemented
as another node type, or not, as your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
* ``documentClass`` - the class to use for the bottommost node of a document
* ``elementClass`` - the class to use for HTML Elements
* ``commentClass`` - the class to use for comments
* ``doctypeClass`` - the class to use for doctypes
It also has one required method:
* ``getDocument`` - Returns the root node of the complete document tree
3. If you wish to run the unit tests, you must also create a ``testSerializer``
method on your treebuilder which accepts a node and returns a string
containing Node and its children serialized according to the format used in
the unittests
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
"""
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
from .._utils import default_etree
treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support
"""Get a TreeBuilder class for various types of trees with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are:
:arg treeType: the name of the tree type required (case-insensitive). Supported
values are:
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation.
"etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to
xml.etree.cElementTree if available and
xml.etree.ElementTree if not.
"lxml" - A etree-based builder for lxml.etree, handling
limitations of lxml's implementation.
* "dom" - A generic builder for DOM implementations, defaulting to a
xml.dom.minidom based implementation.
* "etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to xml.etree.cElementTree if
available and xml.etree.ElementTree if not.
* "lxml" - A etree-based builder for lxml.etree, handling limitations
of lxml's implementation.
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or xml.etree.cElementTree."""
:arg implementation: (Currently applies to the "etree" and "dom" tree
types). A module implementing the tree type e.g. xml.etree.ElementTree
or xml.etree.cElementTree.
:arg kwargs: Any additional options to pass to the TreeBuilder when
creating it.
Example:
>>> from html5lib.treebuilders import getTreeBuilder
>>> builder = getTreeBuilder('etree')
"""
treeType = treeType.lower()
if treeType not in treeBuilderCache:

View file

@ -10,9 +10,9 @@ Marker = None
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
"list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
(namespaces["html"], "ul")}), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
@ -21,22 +21,25 @@ listElementsMap = {
class Node(object):
"""Represents an item in the tree"""
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and
comments
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""Creates a Node
:arg name: The tag name associated with the node
"""
# The tag name associated with the node
self.name = name
# The parent of the current node (or None for the document node)
self.parent = None
# The value of the current node (applies to text nodes and comments)
self.value = None
# A dict holding name -> value pairs for attributes of the node
self.attributes = {}
# A list of child nodes of the current node. This must include all
# elements but not necessarily other node types.
self.childNodes = []
# A list of miscellaneous flags that can be set on the node.
self._flags = []
def __str__(self):
@ -53,23 +56,41 @@ class Node(object):
def appendChild(self, node):
"""Insert node as a child of the current node
:arg node: the node to insert
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
:arg data: the data to insert
:arg insertBefore: the node to insert the text before; if None, the
text is added to the end of the current node's text
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node"""
the current node
:arg node: the node to insert
:arg refNode: the child node to insert the node before
"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
:arg node: the child node to remove
"""
raise NotImplementedError
@ -77,6 +98,9 @@ class Node(object):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
:arg newParent: the node to move all this node's children to
"""
# XXX - should this method be made more general?
for child in self.childNodes:
@ -121,11 +145,14 @@ class ActiveFormattingElements(list):
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
* documentClass - the class to use for the bottommost node of a document
* elementClass - the class to use for HTML Elements
* commentClass - the class to use for comments
* doctypeClass - the class to use for doctypes
"""
# pylint:disable=not-callable
# Document class
documentClass = None
@ -143,6 +170,11 @@ class TreeBuilder(object):
fragmentClass = None
def __init__(self, namespaceHTMLElements):
"""Create a TreeBuilder
:arg namespaceHTMLElements: whether or not to namespace HTML elements
"""
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
@ -166,12 +198,17 @@ class TreeBuilder(object):
# If we pass a node in we match that. If we pass a string
# we match any node with that name
exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
node == target and exactNode):
if exactNode and node == target:
return True
elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
@ -353,19 +390,19 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"Return the final tree"
"""Return the final tree"""
return self.document
def getFragment(self):
"Return the final fragment"
"""Return the final fragment"""
# assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
@ -373,5 +410,8 @@ class TreeBuilder(object):
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
:arg node: the node from which to start serializing
"""
raise NotImplementedError

View file

@ -1,54 +1,65 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections.abc import MutableMapping
except ImportError: # Python 2.7
from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
from . import _base
from . import base
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(object):
class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
return list(self.element.attributes.items()).__iter__()
return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
list(self.element.attributes.items())]
def keys(self):
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
class NodeBuilder(_base.Node):
def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
@ -109,7 +120,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
@ -149,16 +160,17 @@ def getDomBuilder(DomImplementation):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data = data
if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))

View file

@ -1,13 +1,17 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type
import re
from . import _base
from .. import ihatexml
from copy import copy
from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@ -16,7 +20,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node):
class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
@ -59,16 +63,17 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self._element.attrib
def _setAttributes(self, attributes):
# Delete existing attributes first
# XXX - there may be a better way to do this...
for key in list(self._element.attrib.keys()):
del self._element.attrib[key]
for key, value in attributes.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], key[1])
else:
name = key
self._element.set(name, value)
el_attrib = self._element.attrib
el_attrib.clear()
if attributes:
# calling .items _always_ allocates, and the above truthy check is cheaper than the
# allocation on average
for key, value in attributes.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], key[1])
else:
name = key
el_attrib[name] = value
attributes = property(_getAttributes, _setAttributes)
@ -98,6 +103,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self
def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
@ -126,8 +132,8 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
def cloneNode(self):
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.items():
element.attributes[name] = value
if self._element.attrib:
element._element.attrib = copy(self._element.attrib)
return element
def reparentChildren(self, newParent):
@ -139,7 +145,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
@ -253,10 +259,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv)
def tostring(element):
def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = ihatexml.InfosetFilter()
filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
@ -307,7 +313,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
@ -329,7 +335,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return base.TreeBuilder.getFragment(self)._element
return locals()

View file

@ -10,18 +10,25 @@ When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import _base
try:
from collections.abc import MutableMapping
except ImportError:
from collections import MutableMapping
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
from .. import _ihatexml
import lxml.etree as etree
from six import PY3, binary_type
fullTree = True
@ -43,7 +50,11 @@ class Document(object):
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
last = self._elementTree.getroot()
for last in self._elementTree.getroot().itersiblings():
pass
last.addnext(element._element)
def _getChildNodes(self):
return self._childNodes
@ -53,8 +64,7 @@ class Document(object):
def testSerializer(element):
rv = []
finalText = None
infosetFilter = ihatexml.InfosetFilter()
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
@ -79,7 +89,7 @@ def testSerializer(element):
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info.major == 2
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
@ -128,16 +138,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
@ -173,13 +179,10 @@ def tostring(element):
serializeElement(element)
if finalText is not None:
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
@ -189,27 +192,40 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
class Attributes(MutableMapping):
def __init__(self, element):
self._element = element
dict.__init__(self, value)
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
def _coerceKey(self, key):
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
return name
def __getitem__(self, key):
value = self._element._element.attrib[self._coerceKey(key)]
if not PY3 and isinstance(value, binary_type):
value = value.decode("ascii")
return value
def __setitem__(self, key, value):
self._element._element.attrib[self._coerceKey(key)] = value
def __delitem__(self, key):
del self._element._element.attrib[self._coerceKey(key)]
def __iter__(self):
return iter(self._element._element.attrib)
def __len__(self):
return len(self._element._element.attrib)
def clear(self):
return self._element._element.attrib.clear()
class Element(builder.Element):
def __init__(self, name, namespace):
@ -230,8 +246,10 @@ class TreeBuilder(_base.TreeBuilder):
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
def _setAttributes(self, value):
attributes = self.attributes
attributes.clear()
attributes.update(value)
attributes = property(_getAttributes, _setAttributes)
@ -239,8 +257,11 @@ class TreeBuilder(_base.TreeBuilder):
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
def cloneNode(self):
element = type(self)(self.name, self.namespace)
if self._element.attrib:
element._element.attrib.update(self._element.attrib)
return element
class Comment(builder.Comment):
def __init__(self, data):
@ -257,12 +278,12 @@ class TreeBuilder(_base.TreeBuilder):
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = builder.Comment
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
@ -303,19 +324,20 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
"""Create the document root"""
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our iniial
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
@ -344,7 +366,8 @@ class TreeBuilder(_base.TreeBuilder):
# Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()