Update html5lib-1.1

2025-07-07 21:51:14 -07:00 · 2021-10-14 22:49:47 -07:00 · 2021-10-14 22:49:47 -07:00 · 586fd15464
commit 586fd15464
parent 3a116486e7
142 changed files with 90234 additions and 2393 deletions
--- a/lib/html5lib/treewalkers/init.py
+++ b/lib/html5lib/treewalkers/init.py
@ -2,17 +2,18 @@
 tree, generating tokens identical to those produced by the tokenizer
 module.

-To create a tree walker for a new type of tree, you need to do
+To create a tree walker for a new type of tree, you need to
 implement a tree walker object (called TreeWalker by convention) that
-implements a 'serialize' method taking a tree as sole argument and
-returning an iterator generating tokens.
+implements a 'serialize' method which takes a tree as sole argument and
+returns an iterator which generates tokens.
 """

 from __future__ import absolute_import, division, unicode_literals

-import sys
+from .. import constants
+from .._utils import default_etree

-from ..utils import default_etree
+__all__ = ["getTreeWalker", "pprint"]

 treeWalkerCache = {}

@ -20,34 +21,38 @@ treeWalkerCache = {}
 def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

-    treeType - the name of the tree type required (case-insensitive). Supported
-               values are:
+    :arg str treeType: the name of the tree type required (case-insensitive).
+        Supported values are:

-                "dom" - The xml.dom.minidom DOM implementation
-                "pulldom" - The xml.dom.pulldom event stream
-                "etree" - A generic walker for tree implementations exposing an
-                          elementtree-like interface (known to work with
-                          ElementTree, cElementTree and lxml.etree).
-                "lxml" - Optimized walker for lxml.etree
-                "genshi" - a Genshi stream
+        * "dom": The xml.dom.minidom DOM implementation
+        * "etree": A generic walker for tree implementations exposing an
+          elementtree-like interface (known to work with ElementTree,
+          cElementTree and lxml.etree).
+        * "lxml": Optimized walker for lxml.etree
+        * "genshi": a Genshi stream

-    implementation - (Currently applies to the "etree" tree type only). A module
-                      implementing the tree type e.g. xml.etree.ElementTree or
-                      cElementTree."""
+    :arg implementation: A module implementing the tree type e.g.
+        xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
+        tree type only).
+
+    :arg kwargs: keyword arguments passed to the etree walker--for other
+        walkers, this has no effect
+
+    :returns: a TreeWalker class
+
+    """

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
-        if treeType in ("dom", "pulldom"):
-            name = "%s.%s" % (__name__, treeType)
-            __import__(name)
-            mod = sys.modules[name]
-            treeWalkerCache[treeType] = mod.TreeWalker
+        if treeType == "dom":
+            from . import dom
+            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
-            from . import genshistream
-            treeWalkerCache[treeType] = genshistream.TreeWalker
+            from . import genshi
+            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
-            from . import lxmletree
-            treeWalkerCache[treeType] = lxmletree.TreeWalker
+            from . import etree_lxml
+            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
@ -55,3 +60,95 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    pendingCharacters = []
+    for token in tokens:
+        type = token["type"]
+        if type in ("Characters", "SpaceCharacters"):
+            pendingCharacters.append(token["data"])
+        else:
+            if pendingCharacters:
+                yield {"type": "Characters", "data": "".join(pendingCharacters)}
+                pendingCharacters = []
+            yield token
+    if pendingCharacters:
+        yield {"type": "Characters", "data": "".join(pendingCharacters)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers
+
+    Takes a TreeWalker instance and pretty prints the output of walking the tree.
+
+    :arg walker: a TreeWalker instance
+
+    """
+    output = []
+    indent = 0
+    for token in concatenateCharacterTokens(walker):
+        type = token["type"]
+        if type in ("StartTag", "EmptyTag"):
+            # tag name
+            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
+                if token["namespace"] in constants.prefixes:
+                    ns = constants.prefixes[token["namespace"]]
+                else:
+                    ns = token["namespace"]
+                name = "%s %s" % (ns, token["name"])
+            else:
+                name = token["name"]
+            output.append("%s<%s>" % (" " * indent, name))
+            indent += 2
+            # attributes (sorted for consistent ordering)
+            attrs = token["data"]
+            for (namespace, localname), value in sorted(attrs.items()):
+                if namespace:
+                    if namespace in constants.prefixes:
+                        ns = constants.prefixes[namespace]
+                    else:
+                        ns = namespace
+                    name = "%s %s" % (ns, localname)
+                else:
+                    name = localname
+                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
+            # self-closing
+            if type == "EmptyTag":
+                indent -= 2
+
+        elif type == "EndTag":
+            indent -= 2
+
+        elif type == "Comment":
+            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
+
+        elif type == "Doctype":
+            if token["name"]:
+                if token["publicId"]:
+                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["publicId"],
+                                   token["systemId"] if token["systemId"] else ""))
+                elif token["systemId"]:
+                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["systemId"]))
+                else:
+                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
+                                                       token["name"]))
+            else:
+                output.append("%s<!DOCTYPE >" % (" " * indent,))
+
+        elif type == "Characters":
+            output.append("%s\"%s\"" % (" " * indent, token["data"]))
+
+        elif type == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % type)
+
+    return "\n".join(output)