Update html5lib-1.1

This commit is contained in:
JonnyWong16 2021-10-14 22:49:47 -07:00
parent 3a116486e7
commit 586fd15464
No known key found for this signature in database
GPG key ID: B1F1F9807184697A
142 changed files with 90234 additions and 2393 deletions

View file

@ -1,14 +1,23 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage:
Example usage::
import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
import html5lib
with open("my_document.html", "rb") as f:
tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""
from __future__ import absolute_import, division, unicode_literals
@ -20,4 +29,7 @@ from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
__version__ = "0.999"
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"

View file

@ -136,6 +136,7 @@ def normaliseCharList(charList):
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
@ -175,18 +176,18 @@ def escapeRegexp(string):
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars=None,
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
@ -217,7 +218,7 @@ class InfosetFilter(object):
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
@ -225,11 +226,14 @@ class InfosetFilter(object):
while "--" in data:
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
data = data.replace("--", "- -")
if data.endswith("-"):
warnings.warn("Comments cannot end in a dash", DataLossWarning)
data += " "
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for i in range(data.count("\x0C")):
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
@ -251,7 +255,7 @@ class InfosetFilter(object):
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
@ -259,7 +263,7 @@ class InfosetFilter(object):
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput

View file

@ -1,26 +1,17 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six.moves import http_client
from six.moves import http_client, urllib
import codecs
import re
from io import BytesIO, StringIO
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
from . import utils
from io import StringIO
try:
from io import BytesIO
except ImportError:
BytesIO = StringIO
try:
from io import BufferedIOBase
except ImportError:
class BufferedIOBase(object):
pass
from .constants import _ReparseException
from . import _utils
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
@ -28,17 +19,30 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF}
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
@ -118,10 +122,13 @@ class BufferedStream(object):
return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
if isinstance(source, http_client.HTTPResponse):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
@ -129,12 +136,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)
if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
@ -160,22 +168,21 @@ class HTMLUnicodeInputStream(object):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Craziness
if len("\U0010FFFF") == 1:
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
# List of where new lines occur
self.newLines = [0]
self.charEncoding = ("utf-8", "certain")
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
@ -265,12 +272,10 @@ class HTMLUnicodeInputStream(object):
self._bufferedCharacter = data[-1]
data = data[:-1]
self.reportCharacterErrors(data)
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
@ -280,7 +285,7 @@ class HTMLUnicodeInputStream(object):
return True
def characterErrorsUCS4(self, data):
for i in range(len(invalid_unicode_re.findall(data))):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
@ -293,9 +298,9 @@ class HTMLUnicodeInputStream(object):
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]):
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
@ -356,7 +361,7 @@ class HTMLUnicodeInputStream(object):
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not None:
if char is not EOF:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
@ -378,7 +383,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -391,8 +398,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@ -400,27 +405,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
HTMLUnicodeInputStream.__init__(self, self.rawStream)
self.charEncoding = (codecName(encoding), "certain")
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 512
self.numBytesMeta = 1024
# Number of bytes to use when detecting the encoding with chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
@ -437,29 +443,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try:
stream.seek(stream.tell())
except:
except Exception:
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
# Guess with chardet, if avaliable
if encoding is None and chardet:
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if chardet:
try:
try:
from charade.universaldetector import UniversalDetector
except ImportError:
from chardet.universaldetector import UniversalDetector
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
@ -470,37 +497,34 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = self.defaultEncoding
if encoding is not None:
return encoding, "tentative"
# Substitute for equivalent encodings:
encodingSub = {"iso-8859-1": "windows-1252"}
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
self.reset()
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
"""Attempts to detect a BOM at the start of the stream. If
@ -508,8 +532,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
@ -529,9 +553,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
@ -542,8 +569,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
@ -557,6 +584,7 @@ class EncodingBytes(bytes):
return bytes.__new__(self, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
@ -630,9 +658,7 @@ class EncodingBytes(bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p + len(bytes)]
rv = data.startswith(bytes)
rv = self.startswith(bytes, self.position)
if rv:
self.position += len(bytes)
return rv
@ -640,15 +666,11 @@ class EncodingBytes(bytes):
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes) - 1)
return True
else:
try:
self._position = self.index(bytes, self.position) + len(bytes) - 1
except ValueError:
raise StopIteration
return True
class EncodingParser(object):
@ -660,6 +682,9 @@ class EncodingParser(object):
self.encoding = None
def getEncoding(self):
if b"<meta" not in self.data:
return None
methodDispatch = (
(b"<!--", self.handleComment),
(b"<meta", self.handleMeta),
@ -667,8 +692,12 @@ class EncodingParser(object):
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for byte in self.data:
for _ in self.data:
keepParsing = True
try:
self.data.jumpTo(b"<")
except StopIteration:
break
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
@ -706,7 +735,7 @@ class EncodingParser(object):
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
@ -714,7 +743,7 @@ class EncodingParser(object):
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
@ -871,7 +900,7 @@ class ContentAttrParser(object):
return None
def codecName(encoding):
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes):
@ -879,8 +908,11 @@ def codecName(encoding):
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding:
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None

View file

@ -1,11 +1,9 @@
from __future__ import absolute_import, division, unicode_literals
try:
chr = unichr # flake8: noqa
except NameError:
pass
from six import unichr as chr
from collections import deque
from collections import deque, OrderedDict
from sys import version_info
from .constants import spaceCharacters
from .constants import entities
@ -14,12 +12,17 @@ from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters
from .inputstream import HTMLInputStream
from ._inputstream import HTMLInputStream
from .trie import Trie
from ._trie import Trie
entitiesTrie = Trie(entities)
if version_info >= (3, 7):
attributeMap = dict
else:
attributeMap = OrderedDict
class HTMLTokenizer(object):
""" This class takes care of tokenizing HTML.
@ -34,16 +37,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object.
"""
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
def __init__(self, stream, parser=None, **kwargs):
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser
# Perform case conversions?
self.lowercaseElementName = lowercaseElementName
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
self.escapeFlag = False
self.lastFourChars = []
@ -147,8 +145,8 @@ class HTMLTokenizer(object):
output = "&"
charStack = [self.stream.char()]
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
or (allowedChar is not None and allowedChar == charStack[0])):
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
(allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == "#":
@ -235,8 +233,15 @@ class HTMLTokenizer(object):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
if self.lowercaseElementName:
token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
if len(raw) > len(data):
# we had some duplicated attribute, fix so first wins
data.update(raw[::-1])
token["data"] = data
if token["type"] == tokenTypes["EndTag"]:
if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"],
@ -921,10 +926,9 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
if self.lowercaseAttrName:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, value in self.currentToken["data"][:-1]:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"})
@ -1716,11 +1720,11 @@ class HTMLTokenizer(object):
else:
data.append(char)
data = "".join(data)
data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser
nullCount = data.count("\u0000")
if nullCount > 0:
for i in range(nullCount):
for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD")

View file

@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie
__all__ = ["Trie"]

View file

@ -1,19 +1,22 @@
from __future__ import absolute_import, division, unicode_literals
from collections import Mapping
try:
from collections.abc import Mapping
except ImportError: # Python 2.7
from collections import Mapping
class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
keys = super().keys()
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
# Python 2.6: no set comprehensions
return set([x for x in keys if x.startswith(prefix)])
return {x for x in keys if x.startswith(prefix)}
def has_keys_with_prefix(self, prefix):
for key in self.keys():

159
lib/html5lib/_utils.py Normal file
View file

@ -0,0 +1,159 @@
from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
try:
from collections.abc import Mapping
except ImportError:
from collections import Mapping
from six import text_type, PY3
if PY3:
import xml.etree.ElementTree as default_etree
else:
try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
# Probe once, at import time, whether a lone surrogate can be built as a text
# string. Any exception raised inside the try body sets the flag to False.
# NOTE(review): presumably eval() is used so the literal is parsed at runtime
# rather than when this module is compiled - confirm.
try:
_x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') # pylint:disable=eval-used
# A failed assert raises AssertionError, which the handler below catches.
# Under "python -O" asserts are stripped, so this check is skipped there.
assert isinstance(_x, text_type)
except Exception:
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.

    `items` may be an iterable of ``(key, value)`` pairs or a mapping.
    Subscripting a key that is not present returns ``self.default``
    (initially ``None``) instead of raising ``KeyError``.
    """

    def __init__(self, items=()):
        if isinstance(items, Mapping):
            # Iterating a mapping yields bare keys, so a tuple key such as
            # ("foo", "bar") would be unpacked as a (name, value) pair.
            items = items.items()
        _dictEntries = []
        for name, value in items:
            if isinstance(name, (list, tuple, frozenset, set)):
                # Fan a list-like key out into one entry per member.
                _dictEntries.extend((item, value) for item in name)
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        # Invariant: no key may appear twice once the keys are expanded.
        assert len(self) == len(_dictEntries)
        self.default = None

    def __getitem__(self, key):
        # Missing keys resolve to the configurable default.
        return dict.get(self, key, self.default)

    def __get__(self, instance, owner=None):
        # Descriptor hook: when a MethodDispatcher is stored as a class
        # attribute, attribute access yields a view bound to `instance`.
        return BoundMethodDispatcher(instance, self)
class BoundMethodDispatcher(Mapping):
    """Wraps a MethodDispatcher, binding its return values to `instance`

    Iteration, length and membership are delegated to the wrapped
    dispatcher; only value lookup adds the binding step.
    """

    def __init__(self, instance, dispatcher):
        self.instance = instance
        self.dispatcher = dispatcher

    def __getitem__(self, key):
        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
        # on a function, __get__ is used to bind a function to an instance as a bound method
        return self.dispatcher[key].__get__(self.instance)

    def get(self, key, default=None):
        """Return the bound value for `key`, or `default` if `key` is absent.

        `default` is optional (``None``), matching ``Mapping.get``.
        """
        if key in self.dispatcher:
            return self[key]
        return default

    def __iter__(self):
        return iter(self.dispatcher)

    def __len__(self):
        return len(self.dispatcher)

    def __contains__(self, key):
        return key in self.dispatcher
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def isSurrogatePair(data):
    """Return True if `data` is two items long and is a UTF-16 surrogate pair.

    The first item must be a high surrogate (U+D800..U+DBFF) and the second
    a low surrogate (U+DC00..U+DFFF).
    """
    if len(data) != 2:
        return False
    if not 0xD800 <= ord(data[0]) <= 0xDBFF:
        # The second item is only inspected once the first one qualifies.
        return False
    return 0xDC00 <= ord(data[1]) <= 0xDFFF
def surrogatePairToCodepoint(data):
    """Combine a (high, low) surrogate pair into the code point it encodes.

    No validation is done here; `data[0]` and `data[1]` are used as given.
    """
    high_offset = ord(data[0]) - 0xD800
    low_offset = ord(data[1]) - 0xDC00
    # Each high surrogate covers a run of 0x400 code points above the BMP.
    return 0x10000 + high_offset * 0x400 + low_offset
# Module Factory Factory (no, this isn't Java, I know)
# Here to stop this being duplicated all over the place.
def moduleFactoryFactory(factory):
    """Wrap `factory` so that its results are packaged as cached modules.

    `factory(baseModule, *args, **kwargs)` must return a dict of names. The
    returned `moduleFactory` copies that dict into a fresh module named
    ``_<baseModule.__name__>_factory`` and caches the module by name,
    positional arguments and keyword arguments. Repeated calls with the same
    arguments return the same module object.

    The arguments must be hashable. Keyword order is part of the cache key.
    """
    moduleCache = {}

    def moduleFactory(baseModule, *args, **kwargs):
        # Module names are native strings: text on Python 3, bytes on
        # Python 2 when unicode_literals is in effect.
        if isinstance(ModuleType.__name__, type("")):
            name = "_%s_factory" % baseModule.__name__
        else:
            name = b"_%s_factory" % baseModule.__name__

        kwargs_tuple = tuple(kwargs.items())

        try:
            return moduleCache[name][args][kwargs_tuple]
        except KeyError:
            mod = ModuleType(name)
            objs = factory(baseModule, *args, **kwargs)
            mod.__dict__.update(objs)
            # Create only the cache levels that are missing, so modules
            # already cached for other argument sets are kept.
            by_args = moduleCache.setdefault(name, {})
            by_args.setdefault(args, {})[kwargs_tuple] = mod
            return mod

    return moduleFactory
def memoize(func):
    """Return a wrapper around `func` that remembers results per argument set.

    The cache key is built from the positional arguments and the keyword
    items in the order given, so all arguments must be hashable. The cache
    is never cleared.
    """
    cache = {}

    def wrapped(*args, **kwargs):
        key = (tuple(args), tuple(kwargs.items()))
        if key in cache:
            return cache[key]
        # First call with these arguments: compute once and remember.
        result = cache[key] = func(*args, **kwargs)
        return result

    return wrapped

View file

@ -1,292 +1,296 @@
from __future__ import absolute_import, division, unicode_literals
import string
import gettext
_ = gettext.gettext
EOF = None
E = {
"null-character":
_("Null character in input stream, replaced with U+FFFD."),
"Null character in input stream, replaced with U+FFFD.",
"invalid-codepoint":
_("Invalid codepoint in stream."),
"Invalid codepoint in stream.",
"incorrectly-placed-solidus":
_("Solidus (/) incorrectly placed in tag."),
"Solidus (/) incorrectly placed in tag.",
"incorrect-cr-newline-entity":
_("Incorrect CR newline entity, replaced with LF."),
"Incorrect CR newline entity, replaced with LF.",
"illegal-windows-1252-entity":
_("Entity used with illegal number (windows-1252 reference)."),
"Entity used with illegal number (windows-1252 reference).",
"cant-convert-numeric-entity":
_("Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x)."),
"Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x).",
"illegal-codepoint-for-numeric-entity":
_("Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x."),
"Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x.",
"numeric-entity-without-semicolon":
_("Numeric entity didn't end with ';'."),
"Numeric entity didn't end with ';'.",
"expected-numeric-entity-but-got-eof":
_("Numeric entity expected. Got end of file instead."),
"Numeric entity expected. Got end of file instead.",
"expected-numeric-entity":
_("Numeric entity expected but none found."),
"Numeric entity expected but none found.",
"named-entity-without-semicolon":
_("Named entity didn't end with ';'."),
"Named entity didn't end with ';'.",
"expected-named-entity":
_("Named entity expected. Got none."),
"Named entity expected. Got none.",
"attributes-in-end-tag":
_("End tag contains unexpected attributes."),
"End tag contains unexpected attributes.",
'self-closing-flag-on-end-tag':
_("End tag contains unexpected self-closing flag."),
"End tag contains unexpected self-closing flag.",
"expected-tag-name-but-got-right-bracket":
_("Expected tag name. Got '>' instead."),
"Expected tag name. Got '>' instead.",
"expected-tag-name-but-got-question-mark":
_("Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)"),
"Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)",
"expected-tag-name":
_("Expected tag name. Got something else instead"),
"Expected tag name. Got something else instead",
"expected-closing-tag-but-got-right-bracket":
_("Expected closing tag. Got '>' instead. Ignoring '</>'."),
"Expected closing tag. Got '>' instead. Ignoring '</>'.",
"expected-closing-tag-but-got-eof":
_("Expected closing tag. Unexpected end of file."),
"Expected closing tag. Unexpected end of file.",
"expected-closing-tag-but-got-char":
_("Expected closing tag. Unexpected character '%(data)s' found."),
"Expected closing tag. Unexpected character '%(data)s' found.",
"eof-in-tag-name":
_("Unexpected end of file in the tag name."),
"Unexpected end of file in the tag name.",
"expected-attribute-name-but-got-eof":
_("Unexpected end of file. Expected attribute name instead."),
"Unexpected end of file. Expected attribute name instead.",
"eof-in-attribute-name":
_("Unexpected end of file in attribute name."),
"Unexpected end of file in attribute name.",
"invalid-character-in-attribute-name":
_("Invalid character in attribute name"),
"Invalid character in attribute name",
"duplicate-attribute":
_("Dropped duplicate attribute on tag."),
"Dropped duplicate attribute on tag.",
"expected-end-of-tag-name-but-got-eof":
_("Unexpected end of file. Expected = or end of tag."),
"Unexpected end of file. Expected = or end of tag.",
"expected-attribute-value-but-got-eof":
_("Unexpected end of file. Expected attribute value."),
"Unexpected end of file. Expected attribute value.",
"expected-attribute-value-but-got-right-bracket":
_("Expected attribute value. Got '>' instead."),
"Expected attribute value. Got '>' instead.",
'equals-in-unquoted-attribute-value':
_("Unexpected = in unquoted attribute"),
"Unexpected = in unquoted attribute",
'unexpected-character-in-unquoted-attribute-value':
_("Unexpected character in unquoted attribute"),
"Unexpected character in unquoted attribute",
"invalid-character-after-attribute-name":
_("Unexpected character after attribute name."),
"Unexpected character after attribute name.",
"unexpected-character-after-attribute-value":
_("Unexpected character after attribute value."),
"Unexpected character after attribute value.",
"eof-in-attribute-value-double-quote":
_("Unexpected end of file in attribute value (\")."),
"Unexpected end of file in attribute value (\").",
"eof-in-attribute-value-single-quote":
_("Unexpected end of file in attribute value (')."),
"Unexpected end of file in attribute value (').",
"eof-in-attribute-value-no-quotes":
_("Unexpected end of file in attribute value."),
"Unexpected end of file in attribute value.",
"unexpected-EOF-after-solidus-in-tag":
_("Unexpected end of file in tag. Expected >"),
"Unexpected end of file in tag. Expected >",
"unexpected-character-after-solidus-in-tag":
_("Unexpected character after / in tag. Expected >"),
"Unexpected character after / in tag. Expected >",
"expected-dashes-or-doctype":
_("Expected '--' or 'DOCTYPE'. Not found."),
"Expected '--' or 'DOCTYPE'. Not found.",
"unexpected-bang-after-double-dash-in-comment":
_("Unexpected ! after -- in comment"),
"Unexpected ! after -- in comment",
"unexpected-space-after-double-dash-in-comment":
_("Unexpected space after -- in comment"),
"Unexpected space after -- in comment",
"incorrect-comment":
_("Incorrect comment."),
"Incorrect comment.",
"eof-in-comment":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"eof-in-comment-end-dash":
_("Unexpected end of file in comment (-)"),
"Unexpected end of file in comment (-)",
"unexpected-dash-after-double-dash-in-comment":
_("Unexpected '-' after '--' found in comment."),
"Unexpected '-' after '--' found in comment.",
"eof-in-comment-double-dash":
_("Unexpected end of file in comment (--)."),
"Unexpected end of file in comment (--).",
"eof-in-comment-end-space-state":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"eof-in-comment-end-bang-state":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"unexpected-char-in-comment":
_("Unexpected character in comment found."),
"Unexpected character in comment found.",
"need-space-after-doctype":
_("No space after literal string 'DOCTYPE'."),
"No space after literal string 'DOCTYPE'.",
"expected-doctype-name-but-got-right-bracket":
_("Unexpected > character. Expected DOCTYPE name."),
"Unexpected > character. Expected DOCTYPE name.",
"expected-doctype-name-but-got-eof":
_("Unexpected end of file. Expected DOCTYPE name."),
"Unexpected end of file. Expected DOCTYPE name.",
"eof-in-doctype-name":
_("Unexpected end of file in DOCTYPE name."),
"Unexpected end of file in DOCTYPE name.",
"eof-in-doctype":
_("Unexpected end of file in DOCTYPE."),
"Unexpected end of file in DOCTYPE.",
"expected-space-or-right-bracket-in-doctype":
_("Expected space or '>'. Got '%(data)s'"),
"Expected space or '>'. Got '%(data)s'",
"unexpected-end-of-doctype":
_("Unexpected end of DOCTYPE."),
"Unexpected end of DOCTYPE.",
"unexpected-char-in-doctype":
_("Unexpected character in DOCTYPE."),
"Unexpected character in DOCTYPE.",
"eof-in-innerhtml":
_("XXX innerHTML EOF"),
"XXX innerHTML EOF",
"unexpected-doctype":
_("Unexpected DOCTYPE. Ignored."),
"Unexpected DOCTYPE. Ignored.",
"non-html-root":
_("html needs to be the first start tag."),
"html needs to be the first start tag.",
"expected-doctype-but-got-eof":
_("Unexpected End of file. Expected DOCTYPE."),
"Unexpected End of file. Expected DOCTYPE.",
"unknown-doctype":
_("Erroneous DOCTYPE."),
"Erroneous DOCTYPE.",
"expected-doctype-but-got-chars":
_("Unexpected non-space characters. Expected DOCTYPE."),
"Unexpected non-space characters. Expected DOCTYPE.",
"expected-doctype-but-got-start-tag":
_("Unexpected start tag (%(name)s). Expected DOCTYPE."),
"Unexpected start tag (%(name)s). Expected DOCTYPE.",
"expected-doctype-but-got-end-tag":
_("Unexpected end tag (%(name)s). Expected DOCTYPE."),
"Unexpected end tag (%(name)s). Expected DOCTYPE.",
"end-tag-after-implied-root":
_("Unexpected end tag (%(name)s) after the (implied) root element."),
"Unexpected end tag (%(name)s) after the (implied) root element.",
"expected-named-closing-tag-but-got-eof":
_("Unexpected end of file. Expected end tag (%(name)s)."),
"Unexpected end of file. Expected end tag (%(name)s).",
"two-heads-are-not-better-than-one":
_("Unexpected start tag head in existing head. Ignored."),
"Unexpected start tag head in existing head. Ignored.",
"unexpected-end-tag":
_("Unexpected end tag (%(name)s). Ignored."),
"Unexpected end tag (%(name)s). Ignored.",
"unexpected-start-tag-out-of-my-head":
_("Unexpected start tag (%(name)s) that can be in head. Moved."),
"Unexpected start tag (%(name)s) that can be in head. Moved.",
"unexpected-start-tag":
_("Unexpected start tag (%(name)s)."),
"Unexpected start tag (%(name)s).",
"missing-end-tag":
_("Missing end tag (%(name)s)."),
"Missing end tag (%(name)s).",
"missing-end-tags":
_("Missing end tags (%(name)s)."),
"Missing end tags (%(name)s).",
"unexpected-start-tag-implies-end-tag":
_("Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s)."),
"Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s).",
"unexpected-start-tag-treated-as":
_("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
"deprecated-tag":
_("Unexpected start tag %(name)s. Don't use it!"),
"Unexpected start tag %(name)s. Don't use it!",
"unexpected-start-tag-ignored":
_("Unexpected start tag %(name)s. Ignored."),
"Unexpected start tag %(name)s. Ignored.",
"expected-one-end-tag-but-got-another":
_("Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s)."),
"Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s).",
"end-tag-too-early":
_("End tag (%(name)s) seen too early. Expected other end tag."),
"End tag (%(name)s) seen too early. Expected other end tag.",
"end-tag-too-early-named":
_("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
"end-tag-too-early-ignored":
_("End tag (%(name)s) seen too early. Ignored."),
"End tag (%(name)s) seen too early. Ignored.",
"adoption-agency-1.1":
_("End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm.",
"adoption-agency-1.2":
_("End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm.",
"adoption-agency-1.3":
_("End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm.",
"adoption-agency-4.4":
_("End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm.",
"unexpected-end-tag-treated-as":
_("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
"no-end-tag":
_("This element (%(name)s) has no end tag."),
"This element (%(name)s) has no end tag.",
"unexpected-implied-end-tag-in-table":
_("Unexpected implied end tag (%(name)s) in the table phase."),
"Unexpected implied end tag (%(name)s) in the table phase.",
"unexpected-implied-end-tag-in-table-body":
_("Unexpected implied end tag (%(name)s) in the table body phase."),
"Unexpected implied end tag (%(name)s) in the table body phase.",
"unexpected-char-implies-table-voodoo":
_("Unexpected non-space characters in "
"table context caused voodoo mode."),
"Unexpected non-space characters in "
"table context caused voodoo mode.",
"unexpected-hidden-input-in-table":
_("Unexpected input with type hidden in table context."),
"Unexpected input with type hidden in table context.",
"unexpected-form-in-table":
_("Unexpected form in table context."),
"Unexpected form in table context.",
"unexpected-start-tag-implies-table-voodoo":
_("Unexpected start tag (%(name)s) in "
"table context caused voodoo mode."),
"Unexpected start tag (%(name)s) in "
"table context caused voodoo mode.",
"unexpected-end-tag-implies-table-voodoo":
_("Unexpected end tag (%(name)s) in "
"table context caused voodoo mode."),
"Unexpected end tag (%(name)s) in "
"table context caused voodoo mode.",
"unexpected-cell-in-table-body":
_("Unexpected table cell start tag (%(name)s) "
"in the table body phase."),
"Unexpected table cell start tag (%(name)s) "
"in the table body phase.",
"unexpected-cell-end-tag":
_("Got table cell end tag (%(name)s) "
"while required end tags are missing."),
"Got table cell end tag (%(name)s) "
"while required end tags are missing.",
"unexpected-end-tag-in-table-body":
_("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"Unexpected end tag (%(name)s) in the table body phase. Ignored.",
"unexpected-implied-end-tag-in-table-row":
_("Unexpected implied end tag (%(name)s) in the table row phase."),
"Unexpected implied end tag (%(name)s) in the table row phase.",
"unexpected-end-tag-in-table-row":
_("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"Unexpected end tag (%(name)s) in the table row phase. Ignored.",
"unexpected-select-in-select":
_("Unexpected select start tag in the select phase "
"treated as select end tag."),
"Unexpected select start tag in the select phase "
"treated as select end tag.",
"unexpected-input-in-select":
_("Unexpected input start tag in the select phase."),
"Unexpected input start tag in the select phase.",
"unexpected-start-tag-in-select":
_("Unexpected start tag token (%(name)s in the select phase. "
"Ignored."),
"Unexpected start tag token (%(name)s in the select phase. "
"Ignored.",
"unexpected-end-tag-in-select":
_("Unexpected end tag (%(name)s) in the select phase. Ignored."),
"Unexpected end tag (%(name)s) in the select phase. Ignored.",
"unexpected-table-element-start-tag-in-select-in-table":
_("Unexpected table element start tag (%(name)s) in the select in table phase."),
"Unexpected table element start tag (%(name)s) in the select in table phase.",
"unexpected-table-element-end-tag-in-select-in-table":
_("Unexpected table element end tag (%(name)s) in the select in table phase."),
"Unexpected table element end tag (%(name)s) in the select in table phase.",
"unexpected-char-after-body":
_("Unexpected non-space characters in the after body phase."),
"Unexpected non-space characters in the after body phase.",
"unexpected-start-tag-after-body":
_("Unexpected start tag token (%(name)s)"
" in the after body phase."),
"Unexpected start tag token (%(name)s)"
" in the after body phase.",
"unexpected-end-tag-after-body":
_("Unexpected end tag token (%(name)s)"
" in the after body phase."),
"Unexpected end tag token (%(name)s)"
" in the after body phase.",
"unexpected-char-in-frameset":
_("Unexpected characters in the frameset phase. Characters ignored."),
"Unexpected characters in the frameset phase. Characters ignored.",
"unexpected-start-tag-in-frameset":
_("Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored."),
"Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored.",
"unexpected-frameset-in-frameset-innerhtml":
_("Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML)."),
"Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML).",
"unexpected-end-tag-in-frameset":
_("Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored."),
"Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored.",
"unexpected-char-after-frameset":
_("Unexpected non-space characters in the "
"after frameset phase. Ignored."),
"Unexpected non-space characters in the "
"after frameset phase. Ignored.",
"unexpected-start-tag-after-frameset":
_("Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored."),
"Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored.",
"unexpected-end-tag-after-frameset":
_("Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored."),
"Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored.",
"unexpected-end-tag-after-body-innerhtml":
_("Unexpected end tag after body(innerHtml)"),
"Unexpected end tag after body(innerHtml)",
"expected-eof-but-got-char":
_("Unexpected non-space characters. Expected end of file."),
"Unexpected non-space characters. Expected end of file.",
"expected-eof-but-got-start-tag":
_("Unexpected start tag (%(name)s)"
". Expected end of file."),
"Unexpected start tag (%(name)s)"
". Expected end of file.",
"expected-eof-but-got-end-tag":
_("Unexpected end tag (%(name)s)"
". Expected end of file."),
"Unexpected end tag (%(name)s)"
". Expected end of file.",
"eof-in-table":
_("Unexpected end of file. Expected table content."),
"Unexpected end of file. Expected table content.",
"eof-in-select":
_("Unexpected end of file. Expected select content."),
"Unexpected end of file. Expected select content.",
"eof-in-frameset":
_("Unexpected end of file. Expected frameset content."),
"Unexpected end of file. Expected frameset content.",
"eof-in-script-in-script":
_("Unexpected end of file. Expected script content."),
"Unexpected end of file. Expected script content.",
"eof-in-foreign-lands":
_("Unexpected end of file. Expected foreign content"),
"Unexpected end of file. Expected foreign content",
"non-void-element-with-trailing-solidus":
_("Trailing solidus not allowed on element %(name)s"),
"Trailing solidus not allowed on element %(name)s",
"unexpected-html-element-in-foreign-content":
_("Element %(name)s not allowed in a non-html context"),
"Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html":
_("Unexpected end tag (%(name)s) before html."),
"Unexpected end tag (%(name)s) before html.",
"unexpected-inhead-noscript-tag":
"Element %(name)s not allowed in a inhead-noscript context",
"eof-in-head-noscript":
"Unexpected end of file. Expected inhead-noscript content",
"char-in-head-noscript":
"Unexpected non-space character. Expected inhead-noscript content",
"XXX-undefined-error":
_("Undefined error (this sucks and should be fixed)"),
"Undefined error (this sucks and should be fixed)",
}
namespaces = {
@ -298,7 +302,7 @@ namespaces = {
"xmlns": "http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
scopingElements = frozenset([
(namespaces["html"], "applet"),
(namespaces["html"], "caption"),
(namespaces["html"], "html"),
@ -316,9 +320,9 @@ scopingElements = frozenset((
(namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"),
(namespaces["svg"], "title"),
))
])
formattingElements = frozenset((
formattingElements = frozenset([
(namespaces["html"], "a"),
(namespaces["html"], "b"),
(namespaces["html"], "big"),
@ -333,9 +337,9 @@ formattingElements = frozenset((
(namespaces["html"], "strong"),
(namespaces["html"], "tt"),
(namespaces["html"], "u")
))
])
specialElements = frozenset((
specialElements = frozenset([
(namespaces["html"], "address"),
(namespaces["html"], "applet"),
(namespaces["html"], "area"),
@ -416,22 +420,89 @@ specialElements = frozenset((
(namespaces["html"], "wbr"),
(namespaces["html"], "xmp"),
(namespaces["svg"], "foreignObject")
))
])
htmlIntegrationPointElements = frozenset((
(namespaces["mathml"], "annotaion-xml"),
htmlIntegrationPointElements = frozenset([
(namespaces["mathml"], "annotation-xml"),
(namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"),
(namespaces["svg"], "title")
))
])
mathmlTextIntegrationPointElements = frozenset((
mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mi"),
(namespaces["mathml"], "mo"),
(namespaces["mathml"], "mn"),
(namespaces["mathml"], "ms"),
(namespaces["mathml"], "mtext")
))
])
# Maps the all-lowercase spelling of an SVG attribute name (key) to its
# mixed-case spelling (value).
# NOTE(review): presumably used to restore the case of attribute names on SVG
# elements after the tokenizer has lowercased them - confirm against the parser.
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
# Maps the all-lowercase spelling of a MathML attribute name to its
# mixed-case spelling; this table has a single entry.
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
@ -448,24 +519,24 @@ adjustForeignAttributes = {
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()])
unadjustForeignAttributes = {(ns, local): qname for qname, (prefix, local, ns) in
adjustForeignAttributes.items()}
spaceCharacters = frozenset((
spaceCharacters = frozenset([
"\t",
"\n",
"\u000C",
" ",
"\r"
))
])
tableInsertModeElements = frozenset((
tableInsertModeElements = frozenset([
"table",
"tbody",
"tfoot",
"thead",
"tr"
))
])
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
@ -473,8 +544,7 @@ asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
asciiUpper2Lower = dict([(ord(c), ord(c.lower()))
for c in string.ascii_uppercase])
asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}
# Heading elements need to be ordered
headingElements = (
@ -486,7 +556,7 @@ headingElements = (
"h6"
)
voidElements = frozenset((
voidElements = frozenset([
"base",
"command",
"event-source",
@ -502,11 +572,11 @@ voidElements = frozenset((
"input",
"source",
"track"
))
])
cdataElements = frozenset(('title', 'textarea'))
cdataElements = frozenset(['title', 'textarea'])
rcdataElements = frozenset((
rcdataElements = frozenset([
'style',
'script',
'xmp',
@ -514,27 +584,28 @@ rcdataElements = frozenset((
'noembed',
'noframes',
'noscript'
))
])
# Boolean attribute names, keyed by the element name they apply to.
# NOTE(review): the "" key presumably holds attributes valid on any element --
# confirm against the serializer that consumes this table.
# Each value is built from a list so that a single-name entry such as "hr"
# yields a one-element set rather than a set of the name's characters.
booleanAttributes = {
    "": frozenset(["irrelevant", "itemscope"]),
    "style": frozenset(["scoped"]),
    "img": frozenset(["ismap"]),
    "audio": frozenset(["autoplay", "controls"]),
    "video": frozenset(["autoplay", "controls"]),
    "script": frozenset(["defer", "async"]),
    "details": frozenset(["open"]),
    "datagrid": frozenset(["multiple", "disabled"]),
    "command": frozenset(["hidden", "disabled", "checked", "default"]),
    "hr": frozenset(["noshade"]),
    "menu": frozenset(["autosubmit"]),
    "fieldset": frozenset(["disabled", "readonly"]),
    "option": frozenset(["disabled", "readonly", "selected"]),
    "optgroup": frozenset(["disabled", "readonly"]),
    "button": frozenset(["disabled", "autofocus"]),
    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
    "output": frozenset(["disabled", "readonly"]),
    "iframe": frozenset(["seamless"]),
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@ -574,7 +645,7 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
# The five predefined XML entity names, each including its trailing ";".
xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
entities = {
"AElig": "\xc6",
@ -2815,7 +2886,6 @@ replacementCharacters = {
0x0d: "\u000D",
0x80: "\u20AC",
0x81: "\u0081",
0x81: "\u0081",
0x82: "\u201A",
0x83: "\u0192",
0x84: "\u201E",
@ -2848,235 +2918,6 @@ replacementCharacters = {
0x9F: "\u0178",
}
# Maps an encoding label string to the Python codec name used for it.
# NOTE(review): the enclosing diff hunk (-2848,235 +2918,6) indicates this
# table is dropped by the commit being shown -- confirm before relying on it.
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype": 0,
"Characters": 1,
@ -3088,17 +2929,18 @@ tokenTypes = {
"ParseError": 7
}
# Numeric token-type codes of the three tag token kinds (start, end, empty).
tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]])
# Reverse lookup of ``namespaces``: namespace URI -> prefix.
prefixes = {v: k for k, v in namespaces.items()}
# The MathML namespace is always reported with the "math" prefix, whatever
# key it is stored under in ``namespaces``.
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
    """Raised when the current tree is unable to represent the input data"""
class _ReparseException(Exception):
    """Private exception type; the leading underscore marks it as internal.

    NOTE(review): presumably raised to request that parsing restart (for
    example after an encoding change) -- confirm against the parser code.
    """

View file

@ -1,20 +1,29 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from collections import OrderedDict
class Filter(_base.Filter):
def _attr_key(attr):
    """Build the sort key for one ``((namespace, name), value)`` attribute item.

    The namespace part is either ``None`` or a string; those two types cannot
    be ordered against each other, so ``None`` (or any falsy namespace) is
    replaced by an empty string before comparison.

    :arg attr: an item from an attribute dict, whose first element is the
        ``(namespace, name)`` pair

    :returns: a ``(namespace_or_empty_string, name)`` tuple

    """
    qualified_name = attr[0]
    namespace = qualified_name[0] or ''
    return namespace, qualified_name[1]
class Filter(base.Filter):
    """Alphabetizes attributes for elements"""

    def __iter__(self):
        """Yield every token, with tag attributes re-ordered by sort key.

        For ``StartTag`` and ``EmptyTag`` tokens, ``token["data"]`` is
        replaced by an ``OrderedDict`` holding the same items sorted with
        ``_attr_key``. All other tokens pass through unchanged.
        """
        for token in base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                # _attr_key turns a None namespace into "" so that keys
                # mixing None and string namespaces can be ordered.
                token["data"] = OrderedDict(
                    sorted(token["data"].items(), key=_attr_key))
            yield token

View file

@ -1,11 +1,19 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
def __init__(self, source, encoding):
    """Creates a Filter

    :arg source: the source token stream

    :arg encoding: the encoding to set

    """
    base.Filter.__init__(self, source)
    self.encoding = encoding
def __iter__(self):
@ -13,7 +21,7 @@ class Filter(_base.Filter):
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":

View file

@ -1,93 +1,93 @@
from __future__ import absolute_import, division, unicode_literals
from gettext import gettext
_ = gettext
from six import text_type
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class LintError(Exception):
    """Error signalling a problem found while linting a token stream."""
class Filter(base.Filter):
    """Lints the token stream for errors

    If it finds any errors, it'll raise an ``AssertionError``.

    """
    def __init__(self, source, require_matching_tags=True):
        """Creates a Filter

        :arg source: the source token stream

        :arg require_matching_tags: whether or not to require matching tags

        """
        super(Filter, self).__init__(source)
        self.require_matching_tags = require_matching_tags

    def __iter__(self):
        """Validate each token from the source stream, then yield it unchanged.

        :raises AssertionError: if a token is malformed, has an unknown type,
            or (with ``require_matching_tags``) an end tag does not match the
            most recently opened start tag

        """
        # Stack of (namespace, name) pairs for start tags not yet closed.
        open_elements = []
        for token in base.Filter.__iter__(self):
            token_type = token["type"]
            if token_type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                assert isinstance(token["data"], dict)
                # Void HTML elements must be reported as EmptyTag; every other
                # element must be reported as StartTag.
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert token_type == "EmptyTag"
                else:
                    assert token_type == "StartTag"
                if token_type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (attr_namespace, attr_name), value in token["data"].items():
                    assert attr_namespace is None or isinstance(attr_namespace, text_type)
                    assert attr_namespace != ""
                    assert isinstance(attr_name, text_type)
                    assert attr_name != ""
                    assert isinstance(value, text_type)

            elif token_type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
                elif self.require_matching_tags:
                    start = open_elements.pop()
                    assert start == (namespace, name)

            elif token_type == "Comment":
                data = token["data"]
                assert isinstance(data, text_type)

            elif token_type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data, text_type)
                assert data != ""
                if token_type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif token_type == "Doctype":
                name = token["name"]
                public_id = token["publicId"]
                system_id = token["systemId"]
                assert name is None or isinstance(name, text_type)
                # Check each identifier itself; testing ``name`` here would
                # let a non-text publicId/systemId pass unnoticed.
                assert public_id is None or isinstance(public_id, text_type)
                assert system_id is None or isinstance(system_id, text_type)

            elif token_type == "Entity":
                assert isinstance(token["name"], text_type)

            elif token_type in ("SerializeError", "SerializerError"):
                # NOTE(review): both spellings occur in this file's history;
                # accept either until the producer's spelling is confirmed.
                assert isinstance(token["data"], text_type)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": token_type}

            yield token

View file

@ -1,9 +1,10 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
"""Removes optional tags from the token stream"""
def slider(self):
previous1 = previous2 = None
for token in self.source:
@ -11,7 +12,8 @@ class Filter(_base.Filter):
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
if previous1 is not None:
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
@ -58,7 +60,7 @@ class Filter(_base.Filter):
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
@ -70,7 +72,7 @@ class Filter(_base.Filter):
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are

View file

@ -1,12 +1,916 @@
"""Deprecated from html5lib 1.1.
See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.
"""
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..sanitizer import HTMLSanitizerMixin
import re
import warnings
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
class Filter(_base.Filter, HTMLSanitizerMixin):
# Text of the deprecation warning; used by the module-level warning below and
# again in Filter.__init__.
_deprecation_msg = (
"html5lib's sanitizer is deprecated; see " +
"https://github.com/html5lib/html5lib-python/issues/443 and please let " +
"us know if Bleach is unsuitable for your needs"
)
# Module-level statement: runs when this module is executed on import.
warnings.warn(_deprecation_msg, DeprecationWarning)
# (namespace URI, local name) pairs for the HTML, MathML and SVG elements in
# the default allow-list. Default for Filter's ``allowed_elements`` argument;
# per that argument's documentation, everything else is escaped.
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
# (namespace, local name) pairs for attributes in the default allow-list;
# a namespace of None means the attribute has no namespace. Default for
# Filter's ``allowed_attributes`` argument; per that argument's documentation,
# everything else is stripped.
# Some names recur across the HTML / MathML / SVG sections below; that is
# harmless because the result is a set.
allowed_attributes = frozenset((
    # HTML attributes
    (None, 'abbr'),
    (None, 'accept'),
    (None, 'accept-charset'),
    (None, 'accesskey'),
    (None, 'action'),
    (None, 'align'),
    (None, 'alt'),
    (None, 'autocomplete'),
    (None, 'autofocus'),
    (None, 'axis'),
    (None, 'background'),
    (None, 'balance'),
    (None, 'bgcolor'),
    (None, 'bgproperties'),
    (None, 'border'),
    (None, 'bordercolor'),
    (None, 'bordercolordark'),
    (None, 'bordercolorlight'),
    (None, 'bottompadding'),
    (None, 'cellpadding'),
    (None, 'cellspacing'),
    (None, 'ch'),
    (None, 'challenge'),
    (None, 'char'),
    (None, 'charoff'),
    (None, 'choff'),
    (None, 'charset'),
    (None, 'checked'),
    (None, 'cite'),
    (None, 'class'),
    (None, 'clear'),
    (None, 'color'),
    (None, 'cols'),
    (None, 'colspan'),
    (None, 'compact'),
    (None, 'contenteditable'),
    (None, 'controls'),
    (None, 'coords'),
    (None, 'data'),
    (None, 'datafld'),
    (None, 'datapagesize'),
    (None, 'datasrc'),
    (None, 'datetime'),
    (None, 'default'),
    (None, 'delay'),
    (None, 'dir'),
    (None, 'disabled'),
    (None, 'draggable'),
    (None, 'dynsrc'),
    (None, 'enctype'),
    (None, 'end'),
    (None, 'face'),
    (None, 'for'),
    (None, 'form'),
    (None, 'frame'),
    (None, 'galleryimg'),
    (None, 'gutter'),
    (None, 'headers'),
    (None, 'height'),
    (None, 'hidefocus'),
    (None, 'hidden'),
    (None, 'high'),
    (None, 'href'),
    (None, 'hreflang'),
    (None, 'hspace'),
    (None, 'icon'),
    (None, 'id'),
    (None, 'inputmode'),
    (None, 'ismap'),
    (None, 'keytype'),
    (None, 'label'),
    (None, 'leftspacing'),
    (None, 'lang'),
    (None, 'list'),
    (None, 'longdesc'),
    (None, 'loop'),
    (None, 'loopcount'),
    (None, 'loopend'),
    (None, 'loopstart'),
    (None, 'low'),
    (None, 'lowsrc'),
    (None, 'max'),
    (None, 'maxlength'),
    (None, 'media'),
    (None, 'method'),
    (None, 'min'),
    (None, 'multiple'),
    (None, 'name'),
    (None, 'nohref'),
    (None, 'noshade'),
    (None, 'nowrap'),
    (None, 'open'),
    (None, 'optimum'),
    (None, 'pattern'),
    (None, 'ping'),
    (None, 'point-size'),
    (None, 'poster'),
    (None, 'pqg'),
    (None, 'preload'),
    (None, 'prompt'),
    (None, 'radiogroup'),
    (None, 'readonly'),
    (None, 'rel'),
    (None, 'repeat-max'),
    (None, 'repeat-min'),
    (None, 'replace'),
    (None, 'required'),
    (None, 'rev'),
    (None, 'rightspacing'),
    (None, 'rows'),
    (None, 'rowspan'),
    (None, 'rules'),
    (None, 'scope'),
    (None, 'selected'),
    (None, 'shape'),
    (None, 'size'),
    (None, 'span'),
    (None, 'src'),
    (None, 'start'),
    (None, 'step'),
    (None, 'style'),
    (None, 'summary'),
    (None, 'suppress'),
    (None, 'tabindex'),
    (None, 'target'),
    (None, 'template'),
    (None, 'title'),
    (None, 'toppadding'),
    (None, 'type'),
    (None, 'unselectable'),
    (None, 'usemap'),
    (None, 'urn'),
    (None, 'valign'),
    (None, 'value'),
    (None, 'variable'),
    (None, 'volume'),
    (None, 'vspace'),
    (None, 'vrml'),
    (None, 'width'),
    (None, 'wrap'),
    (namespaces['xml'], 'lang'),
    # MathML attributes
    (None, 'actiontype'),
    (None, 'align'),
    (None, 'columnalign'),
    (None, 'columnlines'),
    (None, 'columnspacing'),
    (None, 'columnspan'),
    (None, 'depth'),
    (None, 'display'),
    (None, 'displaystyle'),
    (None, 'equalcolumns'),
    (None, 'equalrows'),
    (None, 'fence'),
    (None, 'fontstyle'),
    (None, 'fontweight'),
    (None, 'frame'),
    (None, 'height'),
    (None, 'linethickness'),
    (None, 'lspace'),
    (None, 'mathbackground'),
    (None, 'mathcolor'),
    (None, 'mathvariant'),
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'rowalign'),
    (None, 'rowlines'),
    (None, 'rowspacing'),
    (None, 'rowspan'),
    (None, 'rspace'),
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'stretchy'),
    (None, 'width'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'type'),
    # SVG attributes
    (None, 'accent-height'),
    (None, 'accumulate'),
    (None, 'additive'),
    (None, 'alphabetic'),
    (None, 'arabic-form'),
    (None, 'ascent'),
    (None, 'attributeName'),
    (None, 'attributeType'),
    (None, 'baseProfile'),
    (None, 'bbox'),
    (None, 'begin'),
    (None, 'by'),
    (None, 'calcMode'),
    (None, 'cap-height'),
    (None, 'class'),
    (None, 'clip-path'),
    (None, 'color'),
    (None, 'color-rendering'),
    (None, 'content'),
    (None, 'cx'),
    (None, 'cy'),
    (None, 'd'),
    (None, 'dx'),
    (None, 'dy'),
    (None, 'descent'),
    (None, 'display'),
    (None, 'dur'),
    (None, 'end'),
    (None, 'fill'),
    (None, 'fill-opacity'),
    (None, 'fill-rule'),
    (None, 'font-family'),
    (None, 'font-size'),
    (None, 'font-stretch'),
    (None, 'font-style'),
    (None, 'font-variant'),
    (None, 'font-weight'),
    (None, 'from'),
    (None, 'fx'),
    (None, 'fy'),
    (None, 'g1'),
    (None, 'g2'),
    (None, 'glyph-name'),
    (None, 'gradientUnits'),
    (None, 'hanging'),
    (None, 'height'),
    (None, 'horiz-adv-x'),
    (None, 'horiz-origin-x'),
    (None, 'id'),
    (None, 'ideographic'),
    (None, 'k'),
    (None, 'keyPoints'),
    (None, 'keySplines'),
    (None, 'keyTimes'),
    (None, 'lang'),
    (None, 'marker-end'),
    (None, 'marker-mid'),
    (None, 'marker-start'),
    (None, 'markerHeight'),
    (None, 'markerUnits'),
    (None, 'markerWidth'),
    (None, 'mathematical'),
    (None, 'max'),
    (None, 'min'),
    (None, 'name'),
    (None, 'offset'),
    (None, 'opacity'),
    (None, 'orient'),
    (None, 'origin'),
    (None, 'overline-position'),
    (None, 'overline-thickness'),
    (None, 'panose-1'),
    (None, 'path'),
    (None, 'pathLength'),
    (None, 'points'),
    (None, 'preserveAspectRatio'),
    (None, 'r'),
    (None, 'refX'),
    (None, 'refY'),
    (None, 'repeatCount'),
    (None, 'repeatDur'),
    (None, 'requiredExtensions'),
    (None, 'requiredFeatures'),
    (None, 'restart'),
    (None, 'rotate'),
    (None, 'rx'),
    (None, 'ry'),
    (None, 'slope'),
    (None, 'stemh'),
    (None, 'stemv'),
    (None, 'stop-color'),
    (None, 'stop-opacity'),
    (None, 'strikethrough-position'),
    (None, 'strikethrough-thickness'),
    (None, 'stroke'),
    (None, 'stroke-dasharray'),
    (None, 'stroke-dashoffset'),
    (None, 'stroke-linecap'),
    (None, 'stroke-linejoin'),
    (None, 'stroke-miterlimit'),
    (None, 'stroke-opacity'),
    (None, 'stroke-width'),
    (None, 'systemLanguage'),
    (None, 'target'),
    (None, 'text-anchor'),
    (None, 'to'),
    (None, 'transform'),
    (None, 'type'),
    (None, 'u1'),
    (None, 'u2'),
    (None, 'underline-position'),
    (None, 'underline-thickness'),
    (None, 'unicode'),
    (None, 'unicode-range'),
    (None, 'units-per-em'),
    (None, 'values'),
    (None, 'version'),
    (None, 'viewBox'),
    (None, 'visibility'),
    (None, 'width'),
    (None, 'widths'),
    (None, 'x'),
    (None, 'x-height'),
    (None, 'x1'),
    (None, 'x2'),
    (namespaces['xlink'], 'actuate'),
    (namespaces['xlink'], 'arcrole'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'role'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'title'),
    (namespaces['xlink'], 'type'),
    (namespaces['xml'], 'base'),
    (namespaces['xml'], 'lang'),
    (namespaces['xml'], 'space'),
    (None, 'y'),
    (None, 'y1'),
    (None, 'y2'),
    (None, 'zoomAndPan'),
))
# (namespace, local name) pairs for attributes whose value is a URI. Default
# for Filter's ``attr_val_is_uri`` argument; per that argument's
# documentation, values whose scheme is not in ``allowed_protocols`` are
# removed.
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
# (namespace, local name) pairs for SVG attributes that can have references.
# Default for Filter's ``svg_attr_val_allows_ref`` argument.
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
# (namespace, name) pairs for SVG elements that can have local hrefs.
# Default for Filter's ``svg_allow_local_href`` argument.
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
# CSS property names in the default allow-list. Default for Filter's
# ``allowed_css_properties`` argument; per that argument's documentation,
# everything else is stripped.
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
# CSS keywords in the default allow-list. Default for Filter's
# ``allowed_css_keywords`` argument; per that argument's documentation,
# everything else is stripped.
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
# SVG property names in the default allow-list. Default for Filter's
# ``allowed_svg_properties`` argument; per that argument's documentation,
# everything else is removed.
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
# URI schemes in the default allow-list. Default for Filter's
# ``allowed_protocols`` argument.
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
# Content types allowed for ``data`` URIs. Default for Filter's
# ``allowed_content_types`` argument.
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
# Matches a string of the form
#   <type>/<subtype>[;charset=NAME][;base64],<data>
# (charset and base64 may appear in either order, both optional) and captures
# the "<type>/<subtype>" part as the named group ``content_type``.
# NOTE(review): presumably applied to the part of a ``data:`` URI that follows
# the scheme -- confirm at the call site.
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class Filter(base.Filter):
"""Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
def __init__(self,
             source,
             allowed_elements=allowed_elements,
             allowed_attributes=allowed_attributes,
             allowed_css_properties=allowed_css_properties,
             allowed_css_keywords=allowed_css_keywords,
             allowed_svg_properties=allowed_svg_properties,
             allowed_protocols=allowed_protocols,
             allowed_content_types=allowed_content_types,
             attr_val_is_uri=attr_val_is_uri,
             svg_attr_val_allows_ref=svg_attr_val_allows_ref,
             svg_allow_local_href=svg_allow_local_href):
    """Build a sanitizing filter around ``source``.

    Every keyword defaults to the module-level collection of the same name
    and is stored unchanged as an instance attribute. Constructing the
    filter emits a ``DeprecationWarning``.

    :arg source: token stream handed to the base filter

    :arg allowed_elements: ``(namespace, name)`` pairs of elements to
        keep; any other element is turned into character data

    :arg allowed_attributes: attribute keys to keep on allowed elements;
        all other attributes are dropped

    :arg allowed_css_properties: CSS property names kept inside ``style``
        attributes

    :arg allowed_css_keywords: keyword values accepted for the
        background/border/margin/padding property families

    :arg allowed_svg_properties: SVG property names kept inside ``style``
        attributes

    :arg allowed_protocols: URI schemes accepted in URI-valued attributes

    :arg allowed_content_types: media types accepted for ``data`` URIs

    :arg attr_val_is_uri: attribute keys whose values are URIs; a value
        whose scheme is not in ``allowed_protocols`` removes the attribute

    :arg svg_attr_val_allows_ref: attribute keys in which ``url(...)``
        references that do not start with ``#`` are blanked out

    :arg svg_allow_local_href: element names whose ``xlink:href`` is
        removed unless it starts with ``#``

    """
    super(Filter, self).__init__(source)

    warnings.warn(_deprecation_msg, DeprecationWarning)

    configuration = (
        ("allowed_elements", allowed_elements),
        ("allowed_attributes", allowed_attributes),
        ("allowed_css_properties", allowed_css_properties),
        ("allowed_css_keywords", allowed_css_keywords),
        ("allowed_svg_properties", allowed_svg_properties),
        ("allowed_protocols", allowed_protocols),
        ("allowed_content_types", allowed_content_types),
        ("attr_val_is_uri", attr_val_is_uri),
        ("svg_attr_val_allows_ref", svg_attr_val_allows_ref),
        ("svg_allow_local_href", svg_allow_local_href),
    )
    for option, value in configuration:
        setattr(self, option, value)
def __iter__(self):
    """Yield the sanitized tokens of the wrapped source.

    Each token produced by ``base.Filter.__iter__`` is passed through
    :meth:`sanitize_token`; falsy results (such as the ``None`` returned
    for comments) are dropped from the stream.
    """
    # Iterate the base class exactly once. A second, outer loop over the
    # old ``_base`` module name would reference a name this module does
    # not import.
    for token in base.Filter.__iter__(self):
        token = self.sanitize_token(token)
        if token:
            yield token
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
# allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
    """Route one token through the sanitizer.

    Tag tokens go to :meth:`allowed_token` when their ``(namespace, name)``
    pair is in ``allowed_elements`` (a missing namespace is retried as the
    HTML namespace) and to :meth:`disallowed_token` otherwise. Comments
    produce ``None``; every other token is returned untouched.
    """
    # accommodate filters which use token_type differently
    kind = token["type"]
    if kind == "Comment":
        return None
    if kind not in ("StartTag", "EndTag", "EmptyTag"):
        return token

    tag = token["name"]
    ns = token["namespace"]
    permitted = (ns, tag) in self.allowed_elements
    if not permitted and ns is None:
        permitted = (namespaces["html"], tag) in self.allowed_elements

    handler = self.allowed_token if permitted else self.disallowed_token
    return handler(token)
def allowed_token(self, token):
    """Strip unsafe attributes from a token whose element is allowed.

    Tokens without a ``"data"`` entry are returned unchanged. Otherwise
    the attribute mapping is edited in place:

    * attributes not in ``allowed_attributes`` are removed;
    * URI-valued attributes are removed when the value cannot be parsed,
      uses a scheme outside ``allowed_protocols``, or is a ``data`` URI
      whose content type is malformed or not in ``allowed_content_types``;
    * in ``svg_attr_val_allows_ref`` attributes, ``url(...)`` references
      not starting with ``#`` are replaced by a space;
    * on ``svg_allow_local_href`` elements, an ``xlink:href`` that does
      not start with ``#`` is removed;
    * a ``style`` attribute is rewritten by :meth:`sanitize_css`.

    :arg token: the tag token to clean

    :returns: the same token object
    """
    if "data" in token:
        attrs = token["data"]
        attr_names = set(attrs.keys())

        # Remove forbidden attributes
        for to_remove in (attr_names - self.allowed_attributes):
            del token["data"][to_remove]
            attr_names.remove(to_remove)

        # Remove attributes with disallowed URL values
        for attr in (attr_names & self.attr_val_is_uri):
            assert attr in attrs
            # Drop backticks, control characters and whitespace before
            # parsing. The origin of this exact character class is not
            # documented; at worst it makes the check remove *more* than
            # it otherwise would.
            val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                   unescape(attrs[attr])).lower()
            # remove replacement characters from unescaped characters
            val_unescaped = val_unescaped.replace("\ufffd", "")
            try:
                uri = urlparse.urlparse(val_unescaped)
            except ValueError:
                uri = None
                del attrs[attr]
            if uri and uri.scheme:
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                # ``elif``: once the attribute has been removed for its
                # scheme it must not be deleted a second time, which
                # raised KeyError when 'data' was not an allowed protocol.
                elif uri.scheme == 'data':
                    m = data_content_type.match(uri.path)
                    if not m:
                        del attrs[attr]
                    elif m.group('content_type') not in self.allowed_content_types:
                        del attrs[attr]

        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))
        if (token["name"] in self.svg_allow_local_href and
                (namespaces['xlink'], 'href') in attrs and
                re.search(r'^\s*[^#\s].*',
                          attrs[(namespaces['xlink'], 'href')])):
            del attrs[(namespaces['xlink'], 'href')]
        if (None, 'style') in attrs:
            attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
        token["data"] = attrs
    return token
def disallowed_token(self, token):
    """Turn a disallowed tag token into a ``Characters`` token.

    The token's ``data`` becomes the textual markup of the tag (attribute
    values passed through ``escape``), its ``type`` becomes
    ``"Characters"`` and its ``name`` entry is removed. The token is
    modified in place and returned.
    """
    kind = token["type"]
    tag = token["name"]
    if kind == "EndTag":
        markup = "</%s>" % tag
    elif token["data"]:
        assert kind in ("StartTag", "EmptyTag")
        rendered = []
        for (ns, local), value in token["data"].items():
            # Namespaced attributes are written with their prefix.
            qualified = local if ns is None else "%s:%s" % (prefixes[ns], local)
            rendered.append(' %s="%s"' % (qualified, escape(value)))
        markup = "<%s%s>" % (tag, ''.join(rendered))
    else:
        markup = "<%s>" % tag
    if token.get("selfClosing"):
        # Swap the closing ">" for "/>".
        markup = markup[:-1] + "/>"

    token["data"] = markup
    token["type"] = "Characters"

    del token["name"]
    return token
def sanitize_css(self, style):
    """Return ``style`` reduced to the declarations this filter accepts.

    ``url(...)`` values are blanked first. If the remaining text contains
    characters or structure outside the two gauntlet patterns, the empty
    string is returned. Otherwise each ``property: value`` pair is kept
    when the property is in ``allowed_css_properties``, or belongs to the
    background/border/margin/padding families and every value word is an
    allowed keyword or a colour/length literal, or is in
    ``allowed_svg_properties``. Kept pairs are joined with single spaces.
    """
    # disallow urls
    style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

    # gauntlet
    shape_checks = (
        r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""",
        r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$",
    )
    for pattern in shape_checks:
        if not re.match(pattern, style):
            return ''

    box_families = ('background', 'border', 'margin', 'padding')
    value_word = r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$"  # noqa

    kept = []
    for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
        if not value:
            continue
        lowered = prop.lower()
        if lowered in self.allowed_css_properties:
            keep = True
        elif prop.split('-')[0].lower() in box_families:
            keep = all(word in self.allowed_css_keywords or
                       re.match(value_word, word)
                       for word in value.split())
        else:
            keep = lowered in self.allowed_svg_properties
        if keep:
            kept.append(prop + ': ' + value + ';')

    return ' '.join(kept)

View file

@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re
from . import _base
from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter):
class Filter(base.Filter):
"""Collapses whitespace except in pre, textarea, and script elements"""
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):

File diff suppressed because it is too large Load diff

View file

@ -1,271 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
# Element names accepted by default (combined with the MathML and SVG lists
# below into allowed_elements).
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
# MathML element names accepted by default.
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
# SVG element names accepted by default.
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
# Attribute names accepted by default (combined with the MathML and SVG
# lists below into allowed_attributes).
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
# MathML attribute names accepted by default. Several names appear more than
# once; the duplicates do not affect the membership tests done on this list.
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
# SVG attribute names accepted by default.
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
# Attributes whose values are checked for a permitted URI scheme.
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
'xlink:href', 'xml:base']
# Attributes in which url(...) references not starting with '#' are blanked.
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
# Elements whose xlink:href is removed unless it starts with '#'.
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
# CSS property names kept inside style attributes.
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
# Keyword values accepted for background/border/margin/padding properties.
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
# SVG property names kept inside style attributes.
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
# URI schemes accepted in attributes listed in attr_val_is_uri.
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs']
# subclasses may define their own versions of these constants
# The allowed_* names are the ones intended for overriding; each defaults to
# the matching acceptable_* list (or a concatenation of them).
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
    """Route one tokenizer token through the sanitizer.

    The token's ``type`` may be either a key of ``tokenTypes`` or one of
    its values; keys are translated to values first. Tag tokens are sent
    to :meth:`allowed_token` when their name is in ``allowed_elements``
    and to :meth:`disallowed_token` otherwise. Comment tokens produce
    ``None``; all other tokens are returned unchanged.
    """
    # accommodate filters which use token_type differently
    token_type = token["type"]
    # Direct mapping membership instead of building a list of the keys
    # for every token.
    if token_type in tokenTypes:
        token_type = tokenTypes[token_type]

    if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                      tokenTypes["EmptyTag"]):
        if token["name"] in self.allowed_elements:
            return self.allowed_token(token, token_type)
        return self.disallowed_token(token, token_type)
    if token_type == tokenTypes["Comment"]:
        return None
    return token
def allowed_token(self, token, token_type):
    """Strip unsafe attributes from a token whose element is allowed.

    Tokens without a ``"data"`` entry are returned unchanged. Otherwise
    ``token["data"]`` (a sequence of ``[name, value]`` pairs) is replaced
    by a new list holding only attributes in ``allowed_attributes``, with
    URI attributes using a scheme outside ``allowed_protocols`` removed,
    non-local ``url(...)`` references blanked, non-local ``xlink:href``
    removed on ``svg_allow_local_href`` elements, and ``style`` rewritten
    by :meth:`sanitize_css`.

    :arg token: the tag token to clean
    :arg token_type: normalised token type (not used here)

    :returns: the same token object
    """
    if "data" in token:
        # Iterating in reverse means the first occurrence of a repeated
        # attribute name is the one that ends up in the dict.
        attrs = dict([(name, val) for name, val in
                      token["data"][::-1]
                      if name in self.allowed_attributes])
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            # Same characters as the former octal-escaped literal, but
            # written without the invalid "\s" string escape.
            val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                   unescape(attrs[attr])).lower()
            # remove replacement characters from unescaped characters
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
                del attrs[attr]
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))
        if (token["name"] in self.svg_allow_local_href and
                'xlink:href' in attrs and
                re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
            del attrs['xlink:href']
        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])
        token["data"] = [[name, val] for name, val in list(attrs.items())]
    return token
def disallowed_token(self, token, token_type):
    """Turn a disallowed tag token into a character-data token.

    ``token["data"]`` becomes the textual markup of the tag (attribute
    values passed through ``escape``), the ``name`` entry is removed, and
    ``type`` is set to the Characters type in the same representation
    (``tokenTypes`` key or value) the token used on entry.
    """
    tag = token["name"]
    if token_type == tokenTypes["EndTag"]:
        markup = "</%s>" % tag
    elif token["data"]:
        pairs = [' %s="%s"' % (k, escape(v)) for k, v in token["data"]]
        markup = "<%s%s>" % (tag, ''.join(pairs))
    else:
        markup = "<%s>" % tag
    if token.get("selfClosing"):
        # Swap the closing ">" for "/>".
        markup = markup[:-1] + "/>"
    token["data"] = markup

    uses_key_names = token["type"] in list(tokenTypes.keys())
    token["type"] = "Characters" if uses_key_names else tokenTypes["Characters"]

    del token["name"]
    return token
def sanitize_css(self, style):
    """Return ``style`` reduced to the declarations this mixin accepts.

    ``url(...)`` values are blanked first. If the remaining text falls
    outside either gauntlet pattern, the empty string is returned.
    Otherwise a ``property: value`` pair is kept when the property is in
    ``allowed_css_properties``, or belongs to the
    background/border/margin/padding families and every value word is in
    ``allowed_css_keywords`` or is a colour/length literal, or is in
    ``allowed_svg_properties``.
    """
    # disallow urls
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

    # gauntlet
    if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
        return ''
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
        if not value:
            continue
        if prop.lower() in self.allowed_css_properties:
            clean.append(prop + ': ' + value + ';')
        elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                            'padding']:
            for keyword in value.split():
                # Consult the overridable ``allowed_css_keywords`` like the
                # other checks here, so subclass overrides are honoured.
                if keyword not in self.allowed_css_keywords and \
                        not re.match(r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                    break
            else:
                clean.append(prop + ': ' + value + ';')
        elif prop.lower() in self.allowed_svg_properties:
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose output is filtered through ``sanitize_token``."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(
            self,
            stream,
            encoding,
            parseMeta,
            useChardet,
            lowercaseElementName,
            lowercaseAttrName,
            parser=parser,
        )

    def __iter__(self):
        """Yield each tokenizer token after sanitizing; drop falsy results."""
        for raw_token in HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned

409
lib/html5lib/serializer.py Normal file
View file

@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
# Characters that force an attribute value to be quoted: the HTML space
# characters plus the quote, equals, angle-bracket and backtick characters.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
# Used when quote_attr_values == "spec".
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
# Used when quote_attr_values == "legacy": the spec characters plus control
# characters, "/", and a list of additional Unicode space/separator code
# points.
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
# Reverse entity table used by htmlentityreplace_errors: maps a code point
# to the name of a character reference that produces it.
_encode_entity_map = {}
# True when a non-BMP character is a single item in a text string.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in entities.items():
    # skip multi-character entities
    if len(v) > (1 if _is_ucs4 else 2):
        continue
    if v == "&":
        continue
    v = _utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
    # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
    if k.islower() or v not in _encode_entity_map:
        _encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML character references.

    For encode/translate errors, every unencodable character in
    ``exc.object[exc.start:exc.end]`` is replaced by a named reference when
    ``_encode_entity_map`` has one for its code point, and by a hexadecimal
    numeric reference otherwise; surrogate pairs are combined into a single
    code point first. Any other error is delegated to
    ``xmlcharrefreplace_errors``.

    :returns: ``(replacement_text, resume_position)``
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    codepoints = []
    skip_next = False
    for offset, char in enumerate(exc.object[exc.start:exc.end]):
        if skip_next:
            # Second half of a surrogate pair handled on the previous turn.
            skip_next = False
            continue
        pos = exc.start + offset
        if _utils.isSurrogatePair(exc.object[pos:min([exc.end, pos + 2])]):
            codepoints.append(
                _utils.surrogatePairToCodepoint(exc.object[pos:pos + 2]))
            skip_next = True
        else:
            codepoints.append(ord(char))

    pieces = []
    for cp in codepoints:
        entity = _encode_entity_map.get(cp)
        if entity:
            pieces.append("&")
            pieces.append(entity)
            if not entity.endswith(";"):
                pieces.append(";")
        else:
            pieces.append("&#x%s;" % (hex(cp)[2:]))
    return ("".join(pieces), exc.end)


register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize a tree to HTML using the named tree walker.

    :arg input: the tree to serialize; it is handed to the tree walker
        selected by ``tree``

    :arg tree: name of the treewalker to use

    :arg encoding: the encoding to use; passed on to
        :py:meth:`HTMLSerializer.render`

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: whatever :py:meth:`HTMLSerializer.render` returns for the
        walked tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker_class = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = walker_class(input)
    return serializer.render(token_stream, encoding)
class HTMLSerializer(object):
# --- attribute quoting ---
quote_attr_values = "legacy"  # be secure by default
quote_char = '"'
use_best_quote_char = True

# --- tag syntax ---
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True

# --- escaping ---
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True

# --- miscellaneous ---
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False

# Names accepted as keyword arguments by __init__, in the order they are
# applied to the instance.
options = (
    "quote_attr_values",
    "quote_char",
    "use_best_quote_char",
    "omit_optional_tags",
    "minimize_boolean_attributes",
    "use_trailing_solidus",
    "space_before_trailing_solidus",
    "escape_lt_in_attrs",
    "escape_rcdata",
    "resolve_entities",
    "alphabetical_attributes",
    "inject_meta_charset",
    "strip_whitespace",
    "sanitize",
)
def __init__(self, **kwargs):
"""Initialize HTMLSerializer
:arg inject_meta_charset: Whether or not to inject the meta charset.
Defaults to ``True``.
:arg quote_attr_values: Whether to quote attribute values that don't
require quoting per legacy browser behavior (``"legacy"``), when
required by the standard (``"spec"``), or always (``"always"``).
Defaults to ``"legacy"``.
:arg quote_char: Use given quote character for attribute quoting.
Defaults to ``"`` which will use double quotes unless attribute
value contains a double quote, in which case single quotes are
used.
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
values.
Defaults to ``False``.
:arg escape_rcdata: Whether to escape characters that need to be
escaped within normal elements within rcdata elements such as
style.
Defaults to ``False``.
:arg resolve_entities: Whether to resolve named character entities that
appear in the source tree. The XML predefined entities &lt; &gt;
&amp; &quot; &apos; are unaffected by this setting.
Defaults to ``True``.
:arg strip_whitespace: Whether to remove semantically meaningless
whitespace. (This compresses all whitespace to a single space
except within ``pre``.)
Defaults to ``False``.
:arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
    """Encode ``string`` with ``self.encoding``, if one is set.

    Characters the encoding cannot represent are handled by the
    ``"htmlentityreplace"`` error handler. Without an encoding the text is
    returned unchanged.
    """
    assert isinstance(string, text_type)
    if not self.encoding:
        return string
    return string.encode(self.encoding, "htmlentityreplace")
def encodeStrict(self, string):
    """Encode ``string`` with ``self.encoding`` using ``"strict"`` errors.

    Without an encoding the text is returned unchanged.
    """
    assert isinstance(string, text_type)
    if not self.encoding:
        return string
    return string.encode(self.encoding, "strict")
def serialize(self, treewalker, encoding=None):
"""Generate the serialized output piece by piece.

:arg treewalker: iterable of token dicts to serialize

:arg encoding: stored on ``self.encoding``; when set, each yielded piece
is encoded by ``encode``/``encodeStrict``, otherwise text is yielded

Resets ``self.errors``. Problems found while serializing are reported
through ``serializeError``.
"""
# pylint:disable=too-many-nested-blocks
self.encoding = encoding
# Set while inside an element from rcdataElements (unless escape_rcdata is
# on); character data is then emitted without escaping.
in_cdata = False
self.errors = []
# The optional filters below wrap the token stream in this fixed order.
if encoding and self.inject_meta_charset:
from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# Alphabetical attributes is here under the assumption that none of
# the later filters add or change order of attributes; it needs to be
# before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += " SYSTEM"
if token["systemId"]:
# Pick a quote character that does not occur in the system identifier.
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError("System identifier contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += ">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
# Attribute keys are (namespace, name) pairs; only the name is written.
for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
# Prefer the quote character that does not occur in the value.
if "'" in v and '"' not in v:
quote_char = '"'
elif '"' in v and "'" not in v:
quote_char = "'"
if quote_char == "'":
v = v.replace("'", "&#39;")
else:
v = v.replace('"', "&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(" /")
else:
yield self.encodeStrict("/")
yield self.encode(">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if key not in entities:
self.serializeError("Entity %s not recognized" % name)
# The XML predefined entities are always written as references.
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else:
# Any other token type is reported as an error carrying the token's data.
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
    """Serialize the walked tree and return it as one string.

    :arg treewalker: the treewalker to serialize

    :arg encoding: the string encoding to use; when given, the pieces
        produced by :meth:`serialize` are joined as bytes, otherwise as
        text

    :returns: the serialized tree

    Example:

    >>> from html5lib import parse, getTreeWalker
    >>> from html5lib.serializer import HTMLSerializer
    >>> token_stream = parse('<html><body>Hi!</body></html>')
    >>> walker = getTreeWalker('etree')
    >>> serializer = HTMLSerializer(omit_optional_tags=False)
    >>> serializer.render(walker(token_stream))
    '<html><head></head><body>Hi!</body></html>'

    """
    if not encoding:
        return "".join(self.serialize(treewalker))
    return b"".join(self.serialize(treewalker, encoding))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
    """Record ``data`` in ``self.errors``; raise when ``self.strict`` is set."""
    # XXX The idea is to make data mandatory.
    self.errors.append(data)
    if not self.strict:
        return
    raise SerializeError
class SerializeError(Exception):
    """Raised by ``HTMLSerializer.serializeError`` when ``strict`` is set."""

View file

@ -1,16 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from .. import treewalkers
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    """Serialize a tree using the named tree walker.

    :arg input: the tree to serialize; it is handed to the tree walker
        selected by ``tree``
    :arg tree: name of the treewalker to use
    :arg format: output format; only ``"html"`` is supported
    :arg encoding: passed on to ``HTMLSerializer.render``
    :arg serializer_opts: keyword options for ``HTMLSerializer``

    :returns: the result of ``HTMLSerializer.render``

    :raises ValueError: if ``format`` is not ``"html"``
    """
    # Reject an unsupported format before doing any other work, and name
    # the parameter the caller actually passed.
    if format != "html":
        raise ValueError("format must be html")
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

View file

@ -1,320 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import gettext
_ = gettext.gettext
try:
from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils
from xml.sax.saxutils import escape
# Collapse the imported collection of space characters into a single string.
spaceCharacters = "".join(spaceCharacters)
# Pick the codec error-handler name used when encoding output. If the codecs
# helpers cannot be imported, plain "strict" is used and no handler is
# registered.
# NOTE(review): indentation is not visible here; the entity map, the handler
# function and its registration presumably all sit inside the ``else:``
# branch (they use ``register_error``) -- confirm against the real file.
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
# Reverse entity table: code point -> entity name.
encode_entity_map = {}
is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if not v in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
# Codec error handler: replaces unencodable characters with a named entity
# from encode_entity_map when one exists, otherwise with a hexadecimal
# numeric reference; other error types go to xmlcharrefreplace_errors.
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
# second half of a surrogate pair already consumed
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error(unicode_encode_errors, htmlentityreplace_errors)
# The registration helper is not needed after this point.
del register_error
class HTMLSerializer(object):
    """Serialize a treewalker token stream to HTML text or encoded bytes."""

    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names of the keyword options accepted by __init__.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of
          the document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute name,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote_char disables automatic quote selection unless
        # use_best_quote_char is also passed explicitly (handled below).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` with ``self.encoding`` using the module's
        ``unicode_encode_errors`` handler; return it unchanged when no
        encoding is set."""
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string

    def encodeStrict(self, string):
        """Like :meth:`encode` but with the ``"strict"`` error handler, so
        unencodable characters raise instead of being replaced."""
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized pieces of the token stream one at a time.

        Sets ``self.encoding``, resets ``self.errors`` and wraps the
        treewalker in the filters selected by the serializer options.

        :arg treewalker: iterable of token dicts
        :arg encoding: when truthy, pieces are yielded as bytes in this
            encoding; otherwise as text
        :raises SerializeError: on a serialization problem in strict mode
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if '"' in token["systemId"]:
                        if "'" in token["systemId"]:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    if in_cdata and "</" in token["data"]:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # Emit "=value" unless this is a boolean attribute that
                    # is being minimized to its bare name.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple())
                         and k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quoting is required as soon as the value holds
                            # whitespace or one of > " ' =
                            quote_attr = any(c in v for c in
                                             spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Pick the quote that does not occur in the
                                # value when only one kind is present.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Translate the template first, then fill in the name.
                    self.serializeError(_("Entity %s not recognized") % name)
                # An unknown entity cannot be resolved; in non-strict mode it
                # is emitted verbatim instead of failing with KeyError.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize the whole stream and join it into one string.

        :returns: bytes when ``encoding`` is truthy, text otherwise
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Append ``data`` to ``self.errors``; raise ``SerializeError`` with
        that message when ``self.strict`` is truthy."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)
class SerializeError(Exception):
    """Error in serialized tree"""
    # Declared as a class (not a function) so that it can actually be raised
    # by the serializer and caught by callers.
    pass

View file

@ -0,0 +1 @@
from __future__ import absolute_import, division, unicode_literals

View file

@ -0,0 +1,108 @@
from __future__ import print_function
import os.path
import sys
import pkg_resources
import pytest
from .tree_construction import TreeConstructionFile
from .tokenizer import TokenizerFile
from .sanitizer import SanitizerFile
_dir = os.path.abspath(os.path.dirname(__file__))
_root = os.path.join(_dir, "..", "..")
_testdata = os.path.join(_dir, "testdata")
_tree_construction = os.path.join(_testdata, "tree-construction")
_tokenizer = os.path.join(_testdata, "tokenizer")
_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
def fail_if_missing_pytest_expect():
    """Halt pytest with an explanatory banner if pytest-expect is unusable.

    Tries to import ``pytest_expect``; on ImportError a message is printed
    to stderr and the ImportError is re-raised.
    """
    try:
        from pytest_expect import expect  # noqa
    except ImportError:
        rule = '*' * 78
        banner = "".join([
            '\n',
            rule, '\n',
            'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n',
            'installed. Please install them both before running pytest.\n',
            rule, '\n',
        ])
        print(banner, file=sys.stderr)
        raise


fail_if_missing_pytest_expect()
def pytest_configure(config):
    """pytest hook: verify the test environment before running.

    Collects human-readable problems (missing testdata and, when
    ``config.option.update_xfail`` is set, missing or outdated optional
    requirements and an unusable cElementTree) and calls ``pytest.exit``
    with all of them if any were found.
    """
    msgs = []

    if not os.path.exists(_testdata):
        msg = "testdata not available! "
        # A .git directory at the root suggests a checkout whose submodules
        # were not initialised.
        if os.path.exists(os.path.join(_root, ".git")):
            msg += ("Please run git submodule update --init --recursive " +
                    "and then run tests again.")
        else:
            msg += ("The testdata doesn't appear to be included with this package, " +
                    "so finding the right version will be hard. :(")
        msgs.append(msg)

    if config.option.update_xfail:
        # Check for optional requirements
        req_file = os.path.join(_root, "requirements-optional.txt")
        if os.path.exists(req_file):
            with open(req_file, "r") as fp:
                for line in fp:
                    # Skip blank lines, "-r" includes and comments.
                    if (line.strip() and
                            not (line.startswith("-r") or
                                 line.startswith("#"))):
                        # "spec; marker" lines carry an environment marker.
                        if ";" in line:
                            spec, marker = line.strip().split(";", 1)
                        else:
                            spec, marker = line.strip(), None
                        req = pkg_resources.Requirement.parse(spec)
                        if marker and not pkg_resources.evaluate_marker(marker):
                            msgs.append("%s not available in this environment" % spec)
                        else:
                            try:
                                installed = pkg_resources.working_set.find(req)
                            except pkg_resources.VersionConflict:
                                msgs.append("Outdated version of %s installed, need %s" % (req.name, spec))
                            else:
                                if not installed:
                                    msgs.append("Need %s" % spec)

        # Check cElementTree
        import xml.etree.ElementTree as ElementTree

        try:
            import xml.etree.cElementTree as cElementTree
        except ImportError:
            msgs.append("cElementTree unable to be imported")
        else:
            if cElementTree.Element is ElementTree.Element:
                msgs.append("cElementTree is just an alias for ElementTree")

    if msgs:
        pytest.exit("\n".join(msgs))
def pytest_collect_file(path, parent):
    """pytest hook: map data files to the collector for their directory.

    The first known testdata root found among the file's directory and its
    ancestors decides the collector; the file is collected only when its
    extension matches that root's expected extension.
    """
    ancestors = set()
    current = os.path.abspath(path.dirname)
    # Walk upwards until dirname() stops changing (filesystem root).
    while current not in ancestors:
        ancestors.add(current)
        current = os.path.dirname(current)

    collectors = (
        (_tree_construction, ".dat", TreeConstructionFile),
        (_tokenizer, ".test", TokenizerFile),
        (_sanitizer_testdata, ".dat", SanitizerFile),
    )
    for root, extension, collector in collectors:
        if root in ancestors:
            if path.ext == extension:
                return collector(path, parent)
            return None
    return None

View file

@ -0,0 +1,51 @@
from __future__ import absolute_import, division, unicode_literals
import codecs
import json
import pytest
from html5lib import parseFragment, serialize
class SanitizerFile(pytest.File):
    """Collector yielding one ``SanitizerTest`` per entry of a JSON file."""

    def collect(self):
        """Load the UTF-8 JSON file and yield an item for each test case,
        named after its index in the file."""
        source = str(self.fspath)
        with codecs.open(source, "r", encoding="utf-8") as handle:
            cases = json.load(handle)
        for index, case in enumerate(cases):
            yield SanitizerTest(str(index), self, test=case)
class SanitizerTest(pytest.Item):
    """Single sanitizer test case: parse, sanitize-serialize, compare."""

    def __init__(self, name, parent, test):
        super(SanitizerTest, self).__init__(name, parent)
        self.obj = lambda: 1  # this is to hack around skipif needing a function!
        self.test = test

    def runtest(self):
        """Serialize the parsed ``input`` with sanitizing enabled and check
        that it equals the case's ``output``."""
        case = self.test
        input = case["input"]
        expected = case["output"]
        options = dict(sanitize=True,
                       omit_optional_tags=False,
                       use_trailing_solidus=True,
                       space_before_trailing_solidus=False,
                       quote_attr_values="always",
                       quote_char="'",
                       alphabetical_attributes=True)
        fragment = parseFragment(input)
        with pytest.deprecated_call():
            serialized = serialize(fragment, **options)
        report = ["\n\nInput:", input,
                  "\nExpected:", expected,
                  "\nReceived:", serialized]
        errorMsg = "\n".join(report)
        assert expected == serialized, errorMsg

    def repr_failure(self, excinfo):
        """Render a short failure report with the traceback cut to this file."""
        trimmed = excinfo.traceback.cut(path=__file__)
        excinfo.traceback = trimmed.filter()
        return excinfo.getrepr(funcargs=True,
                               showlocals=False,
                               style="short", tbfilter=False)

View file

@ -0,0 +1,199 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=wrong-import-position
import os
import sys
import codecs
import glob
import xml.sax.handler
# Directory containing this file; used to locate the testdata directory and
# the package root.
base_path = os.path.split(__file__)[0]

test_dir = os.path.join(base_path, 'testdata')
# Put the directory two levels above this file first on sys.path so that the
# html5lib import below is resolved from there.
sys.path.insert(0, os.path.abspath(os.path.join(base_path,
                                                os.path.pardir,
                                                os.path.pardir)))

from html5lib import treebuilders, treewalkers, treeadapters  # noqa
del base_path

# Build a dict of available trees
# Each value is either None (implementation unavailable) or a dict with a
# "builder" and a "walker" entry, plus an optional "adapter".
treeTypes = {}

# DOM impls
treeTypes["DOM"] = {
    "builder": treebuilders.getTreeBuilder("dom"),
    "walker": treewalkers.getTreeWalker("dom")
}

# ElementTree impls
import xml.etree.ElementTree as ElementTree  # noqa
treeTypes['ElementTree'] = {
    "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
    "walker": treewalkers.getTreeWalker("etree", ElementTree)
}

try:
    import xml.etree.cElementTree as cElementTree  # noqa
except ImportError:
    treeTypes['cElementTree'] = None
else:
    # On Python 3.3 and above cElementTree is an alias, don't run them twice.
    if cElementTree.Element is ElementTree.Element:
        treeTypes['cElementTree'] = None
    else:
        treeTypes['cElementTree'] = {
            "builder": treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True),
            "walker": treewalkers.getTreeWalker("etree", cElementTree)
        }

# lxml is optional; register it only when it can be imported.
try:
    import lxml.etree as lxml  # noqa
except ImportError:
    treeTypes['lxml'] = None
else:
    treeTypes['lxml'] = {
        "builder": treebuilders.getTreeBuilder("lxml"),
        "walker": treewalkers.getTreeWalker("lxml")
    }

# Genshi impls
try:
    import genshi  # noqa
except ImportError:
    treeTypes["genshi"] = None
else:
    # The tree is built as DOM and converted to a genshi stream by the
    # adapter before the genshi walker is applied.
    treeTypes["genshi"] = {
        "builder": treebuilders.getTreeBuilder("dom"),
        "adapter": lambda tree: treeadapters.genshi.to_genshi(treewalkers.getTreeWalker("dom")(tree)),
        "walker": treewalkers.getTreeWalker("genshi")
    }

# pylint:enable=wrong-import-position
def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
    """Return the sorted list of paths in ``search_dir/subdirectory`` that
    match the glob pattern ``files``."""
    pattern = os.path.join(search_dir, subdirectory, files)
    matches = glob.glob(pattern)
    matches.sort()
    return matches
class DefaultDict(dict):
    """Dict whose item lookup returns a fixed default for missing keys.

    The default is only returned, never stored, so a lookup miss does not
    make the dict non-empty.
    """

    def __init__(self, default, *args, **kwargs):
        self.default = default
        dict.__init__(self, *args, **kwargs)

    def __getitem__(self, key):
        return dict.get(self, key, self.default)


class TestData(object):
    """Iterate over the tests stored in a ``#heading``-sectioned data file.

    Each test is yielded as a ``DefaultDict(None)`` mapping section heading
    to section text.  With ``encoding=None`` the file is read as bytes and
    both keys and values are bytes; otherwise they are text.
    """

    def __init__(self, filename, newTestHeading="data", encoding="utf8"):
        """
        :arg filename: path of the data file
        :arg newTestHeading: heading that starts a new test
        :arg encoding: text encoding of the file, or None for binary mode
        """
        if encoding is None:
            self.f = open(filename, mode="rb")
        else:
            self.f = codecs.open(filename, encoding=encoding)
        self.encoding = encoding
        self.newTestHeading = newTestHeading

    def __iter__(self):
        # The file is consumed once; a later iteration yields nothing, as it
        # did when the exhausted file object was simply left open.
        if self.f.closed:
            return
        try:
            data = DefaultDict(None)
            key = None
            for line in self.f:
                heading = self.isSectionHeading(line)
                if heading:
                    if data and heading == self.newTestHeading:
                        # Remove trailing newline
                        data[key] = data[key][:-1]
                        yield self.normaliseOutput(data)
                        data = DefaultDict(None)
                    key = heading
                    data[key] = "" if self.encoding else b""
                elif key is not None:
                    data[key] += line
            if data:
                yield self.normaliseOutput(data)
        finally:
            # Release the file handle once iteration ends (normally, on
            # error, or when the generator is discarded early).
            self.f.close()

    def isSectionHeading(self, line):
        """If the current line is a test section heading return the heading,
        otherwise return False"""
        # print(line)
        if line.startswith("#" if self.encoding else b"#"):
            return line[1:].strip()
        else:
            return False

    def normaliseOutput(self, data):
        """Strip one trailing newline from every section, in place, and
        return ``data``."""
        # Remove trailing newlines
        for key, value in data.items():
            if value.endswith("\n" if self.encoding else b"\n"):
                data[key] = value[:-1]
        return data
def convert(stripChars):
    """Return a converter that drops the first ``stripChars`` characters of
    every line starting with ``|``."""
    def convertData(data):
        """convert the output of str(document) to the format used in the testcases"""
        return "\n".join(
            line[stripChars:] if line.startswith("|") else line
            for line in data.split("\n")
        )
    return convertData


convertExpected = convert(2)
def errorMessage(input, expected, actual):
    """Build the failure message showing the ``repr`` of the input, the
    expected value and the received value.

    On Python 2 the message is returned as ASCII bytes with non-ASCII
    characters backslash-escaped.
    """
    pieces = ("Input:", repr(input),
              "Expected:", repr(expected),
              "Received", repr(actual),
              "")
    msg = "\n".join(pieces)
    if sys.version_info[0] != 2:
        return msg
    return msg.encode("ascii", "backslashreplace")
class TracingSaxHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that records every callback in ``self.visited``.

    Document start/end are recorded as bare strings, all other events as
    tuples of the callback name followed by its arguments.  Prefix-mapping
    events are not recorded.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.visited = []

    def _log(self, event):
        # Single place where events are appended to the trace.
        self.visited.append(event)

    def startDocument(self):
        self._log('startDocument')

    def endDocument(self):
        self._log('endDocument')

    def startPrefixMapping(self, prefix, uri):
        # These are ignored as their order is not guaranteed
        return None

    def endPrefixMapping(self, prefix):
        # These are ignored as their order is not guaranteed
        return None

    def startElement(self, name, attrs):
        self._log(('startElement', name, attrs))

    def endElement(self, name):
        self._log(('endElement', name))

    def startElementNS(self, name, qname, attrs):
        # attrs is copied into a plain dict before being recorded.
        self._log(('startElementNS', name, qname, dict(attrs)))

    def endElementNS(self, name, qname):
        self._log(('endElementNS', name, qname))

    def characters(self, content):
        self._log(('characters', content))

    def ignorableWhitespace(self, whitespace):
        self._log(('ignorableWhitespace', whitespace))

    def processingInstruction(self, target, data):
        self._log(('processingInstruction', target, data))

    def skippedEntity(self, name):
        self._log(('skippedEntity', name))

View file

@ -0,0 +1,78 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import pytest
import html5lib
from html5lib.filters.alphabeticalattributes import Filter
from html5lib.serializer import HTMLSerializer
@pytest.mark.parametrize('msg, attrs, expected_attrs', [
    (
        'no attrs',
        {},
        {}
    ),
    (
        'one attr',
        {(None, 'alt'): 'image'},
        OrderedDict([((None, 'alt'), 'image')])
    ),
    (
        'multiple attrs',
        {
            (None, 'src'): 'foo',
            (None, 'alt'): 'image',
            (None, 'style'): 'border: 1px solid black;'
        },
        OrderedDict([
            ((None, 'alt'), 'image'),
            ((None, 'src'), 'foo'),
            ((None, 'style'), 'border: 1px solid black;')
        ])
    ),
])
def test_alphabetizing(msg, attrs, expected_attrs):
    """The filter's output for an ``img`` start tag carries the expected
    attribute mapping."""
    start_tag = {'type': 'StartTag', 'name': 'img', 'data': attrs}
    filtered = list(Filter([start_tag]))
    assert filtered[0]['data'] == expected_attrs
def test_with_different_namespaces():
    """Attributes with and without a namespace are filtered together."""
    xlink = 'http://www.w3.org/1999/xlink'
    start_tag = {
        'type': 'StartTag',
        'name': 'pattern',
        'data': {
            (None, 'id'): 'patt1',
            (xlink, 'href'): '#patt2'
        }
    }
    filtered = list(Filter([start_tag]))
    expected = OrderedDict([
        ((None, 'id'), 'patt1'),
        ((xlink, 'href'), '#patt2')
    ])
    assert filtered[0]['data'] == expected
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    fragment = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    make_walker = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    rendered = serializer.render(make_walker(fragment))
    assert rendered == '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'

View file

@ -0,0 +1,117 @@
from __future__ import absolute_import, division, unicode_literals
import os
import pytest
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, _inputstream
def test_basic_prescan_length():
    """A meta charset inside a 1024-byte document is found by the prescan."""
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    # Grow the comment so the whole document is exactly 1024 bytes long.
    filler = b"a" * (1024 - len(markup) + 1)
    data = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(data) == 1024  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'utf-8' == stream.charEncoding[0].name
def test_parser_reparse():
    """With the meta charset at the end of a 10240-byte document the stream
    reports windows-1252, while the parser ends up with utf-8."""
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    # Grow the comment so the whole document is exactly 10240 bytes long.
    filler = b"a" * (10240 - len(markup) + 1)
    data = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(data) == 10240  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    parser = HTMLParser(namespaceHTMLElements=False)
    document = parser.parse(data, useChardet=False)
    assert 'utf-8' == parser.documentEncoding
    assert document.find(".//title").text == "Caf\u00E9"
@pytest.mark.parametrize("expected,data,kwargs", [
    ("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
    ("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
    ("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
    ("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
    ("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
    ("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
    ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
    ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
    ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
    ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
    ("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
    ("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
    ("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
    ("windows-1252", b"", {}),
])
def test_parser_args(expected, data, kwargs):
    """The input stream and the parser agree on the expected encoding for
    each combination of encoding keyword arguments."""
    options = dict(kwargs, useChardet=False)
    stream = _inputstream.HTMLBinaryInputStream(data, **options)
    assert expected == stream.charEncoding[0].name
    parser = HTMLParser()
    parser.parse(data, **options)
    assert expected == parser.documentEncoding
@pytest.mark.parametrize("kwargs", [
    {"override_encoding": "iso-8859-2"},
    {"override_encoding": None},
    {"transport_encoding": "iso-8859-2"},
    {"transport_encoding": None},
    {"same_origin_parent_encoding": "iso-8859-2"},
    {"same_origin_parent_encoding": None},
    {"likely_encoding": "iso-8859-2"},
    {"likely_encoding": None},
    {"default_encoding": "iso-8859-2"},
    {"default_encoding": None},
    {"foo_encoding": "iso-8859-2"},
    {"foo_encoding": None},
])
def test_parser_args_raises(kwargs):
    """Passing any encoding keyword together with text input raises
    TypeError with the expected message prefix."""
    with pytest.raises(TypeError) as exc_info:
        HTMLParser().parse("", useChardet=False, **kwargs)
    message = exc_info.value.args[0]
    assert message.startswith("Cannot set an encoding with a unicode input")
def param_encoding():
    """Yield ``(data, encoding)`` byte pairs from every file returned by
    ``get_data_files("encoding")``."""
    for path in get_data_files("encoding"):
        for case in _TestData(path, b"data", encoding=None):
            yield case[b'data'], case[b'encoding']
@pytest.mark.parametrize("data, encoding", param_encoding())
def test_parser_encoding(data, encoding):
    """Parsing ``data`` sets the parser's documentEncoding to the
    lower-cased expected encoding."""
    parser = HTMLParser()
    assert parser.documentEncoding is None
    parser.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    detected = parser.documentEncoding
    assert encoding == detected, errorMessage(data, encoding, parser.documentEncoding)
@pytest.mark.parametrize("data, encoding", param_encoding())
def test_prescan_encoding(data, encoding):
    """The input stream alone reports the expected encoding, for inputs no
    longer than ``stream.numBytesMeta``."""
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    # Very crude way to ignore irrelevant tests
    if len(data) <= stream.numBytesMeta:
        assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name)
# pylint:disable=wrong-import-position
# The chardet-based test is only defined when chardet can be imported.
try:
    import chardet  # noqa
except ImportError:
    print("chardet not found, skipping chardet tests")
else:
    def test_chardet():
        """The Big5 sample file is detected as big5."""
        sample = os.path.join(test_dir, "encoding", "chardet", "test_big5.txt")
        with open(sample, "rb") as fp:
            raw = fp.read()
        encoding = _inputstream.HTMLInputStream(raw).charEncoding
        assert encoding[0].name == "big5"
# pylint:enable=wrong-import-position

View file

@ -0,0 +1,41 @@
from __future__ import absolute_import, division, unicode_literals
import six
from mock import Mock
from . import support
def _createReprMock(r):
    """Creates a mock with a __repr__ returning r

    Also provides __str__ mock with default mock behaviour"""
    stub = Mock()
    stub.__repr__ = Mock(return_value=r)
    stub.__str__ = Mock(wraps=stub.__str__)
    return stub
def test_errorMessage():
    """errorMessage uses repr() exactly once per argument and never str()."""
    # Create mock objects to take repr of
    input = _createReprMock("1")
    expected = _createReprMock("2")
    actual = _createReprMock("3")

    # Run the actual test
    r = support.errorMessage(input, expected, actual)

    # Assertions!
    if six.PY2:
        assert b"Input:\n1\nExpected:\n2\nReceived\n3\n" == r
    else:
        assert six.PY3
        assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r

    for stub in (input, expected, actual):
        assert stub.__repr__.call_count == 1

    for stub in (input, expected, actual):
        assert not stub.__str__.called

View file

@ -0,0 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.optionaltags import Filter
def test_empty():
    """Filtering an empty token stream yields no tokens."""
    filtered = list(Filter([]))
    assert filtered == []

View file

@ -0,0 +1,94 @@
from __future__ import absolute_import, division, unicode_literals
from six import PY2, text_type
import io
from . import support # noqa
from html5lib.constants import namespaces
from html5lib import parse, parseFragment, HTMLParser
# tests that aren't autogenerated from text files
def test_assertDoctypeCloneable():
    """A DOM document parsed from a lone doctype can be deep-cloned."""
    clone = parse('<!DOCTYPE HTML>', treebuilder="dom").cloneNode(True)
    assert clone is not None
def test_line_counter():
    """Parsing multi-line ``pre`` content with an entity returns a tree."""
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    tree = parse("<pre>\nx\n&gt;\n</pre>")
    assert tree is not None
def test_namespace_html_elements_0_dom():
    """DOM builder: namespaceHTMLElements=True puts html in the HTML namespace."""
    options = {"treebuilder": "dom", "namespaceHTMLElements": True}
    root = parse("<html></html>", **options).childNodes[0]
    assert root.namespaceURI == namespaces["html"]
def test_namespace_html_elements_1_dom():
    """DOM builder: namespaceHTMLElements=False leaves html without a namespace."""
    options = {"treebuilder": "dom", "namespaceHTMLElements": False}
    root = parse("<html></html>", **options).childNodes[0]
    assert root.namespaceURI is None
def test_namespace_html_elements_0_etree():
    """etree builder: namespaceHTMLElements=True yields a namespaced html tag."""
    options = {"treebuilder": "etree", "namespaceHTMLElements": True}
    root = parse("<html></html>", **options)
    assert root.tag == "{%s}html" % (namespaces["html"],)
def test_namespace_html_elements_1_etree():
    """etree builder: namespaceHTMLElements=False yields a plain html tag."""
    options = {"treebuilder": "etree", "namespaceHTMLElements": False}
    root = parse("<html></html>", **options)
    assert root.tag == "html"
def test_unicode_file():
    """A text-mode file-like object is accepted as parser input."""
    source = io.StringIO("a")
    assert parse(source) is not None
def test_debug_log():
    """With debug=True the parser's log matches the expected entries."""
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

    if PY2:
        # Text fields of each expected entry are encoded to ASCII bytes
        # before comparing on Python 2.
        expected = [
            tuple(x.encode("ascii") if isinstance(x, text_type) else x
                  for x in entry)
            for entry in expected
        ]

    assert parser.log == expected
def test_no_duplicate_clone():
    """The mis-nested fragment parses into exactly two top-level nodes."""
    fragment = parseFragment("<b><em><foo><foob><fooc><aside></b></em>")
    top_level_count = len(fragment)
    assert top_level_count == 2
def test_self_closing_col():
    """A self-closing ``col`` inside a colgroup produces no parse errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    parser = HTMLParser()
    parser.parseFragment(markup)
    assert not parser.errors

View file

@ -0,0 +1,133 @@
from __future__ import absolute_import, division, unicode_literals
import pytest
from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer
def sanitize_html(stream):
    """Parse ``stream`` as a fragment and return it serialized with
    ``sanitize=True`` (expected to emit a deprecation warning)."""
    options = dict(sanitize=True,
                   omit_optional_tags=False,
                   use_trailing_solidus=True,
                   space_before_trailing_solidus=False,
                   quote_attr_values="always",
                   quote_char='"',
                   alphabetical_attributes=True)
    fragment = parseFragment(stream)
    with pytest.deprecated_call():
        result = serialize(fragment, **options)
    return result
def test_should_handle_astral_plane_characters():
    """Numeric references to astral-plane characters come out as the
    characters themselves."""
    result = sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
    assert '<p>\U0001d4b5 \U0001d538</p>' == result
def test_should_allow_relative_uris():
    """A relative href is left unchanged by the sanitizer."""
    markup = '<p><a href="/example.com"></a></p>'
    result = sanitize_html(markup)
    assert '<p><a href="/example.com"></a></p>' == result
def test_invalid_data_uri():
    """A malformed data: URI in src is removed; the element stays."""
    result = sanitize_html('<audio controls="" src="data:foobar"></audio>')
    assert '<audio controls></audio>' == result
def test_invalid_ipv6_url():
    """An href that is not a valid URL is removed instead of raising."""
    result = sanitize_html('<a href="h://]">')
    assert "<a></a>" == result
def test_data_uri_disallowed_type():
    """A data: URI with a text/html content type is removed from src."""
    result = sanitize_html('<audio controls="" src="data:text/html,<html>"></audio>')
    assert "<audio controls></audio>" == result
def param_sanitizer():
    """Yield ``(id, markup, markup)`` triples for the parametrized sanitizer
    test: one per allowed HTML element, per allowed un-namespaced lower-case
    attribute, and per allowed protocol in lower and upper case."""
    # Elements skipped for now (marked TODO in the original list).
    skipped_tags = ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
                    'tfoot', 'th', 'thead', 'tr', 'select']
    html_ns = constants.namespaces["html"]

    for ns, tag_name in sanitizer.allowed_elements:
        if ns != html_ns or tag_name in skipped_tags:
            continue
        if tag_name == 'image':
            first = "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        elif tag_name == 'br':
            first = "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>"
        elif tag_name in constants.voidElements:
            first = "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name
        else:
            first = "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name)
        second = "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)
        yield ("test_should_allow_%s_tag" % tag_name, first, second)

    for ns, attribute_name in sanitizer.allowed_attributes:
        if ns is not None:
            continue
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        if attribute_name in sanitizer.attr_val_is_uri:
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
        else:
            attribute_value = 'foo'
        yield ("test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))

    # NOTE(review): both passes use the "uppercase" id template, although the
    # first pass keeps the protocol as-is; ids still differ by protocol case.
    for uppercase in (False, True):
        for protocol in sanitizer.allowed_protocols:
            if protocol == 'data':
                rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
            else:
                rest_of_uri = '//sub.domain.tld/path/object.ext'
            if uppercase:
                protocol = protocol.upper()
            yield ("test_should_allow_uppercase_%s_uris" % protocol,
                   "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
                   """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
@pytest.mark.parametrize("expected, input",
                         (pytest.param(markup, raw, id=case_id)
                          for case_id, markup, raw in param_sanitizer()))
def test_sanitizer(expected, input):
    """Compare the sanitized ``input`` with the re-serialized ``expected`` markup."""
    # The expected markup is parsed and serialized again with these explicit
    # options before it is compared against the sanitizer's output.
    serializer_options = dict(
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    normalised = serialize(parseFragment(expected), **serializer_options)
    assert normalised == sanitize_html(input)
def test_lowercase_color_codes_in_style():
    """A lowercase hex colour in ``style`` is expected to pass through unchanged."""
    markup = '<p style="border: 1px solid #a2a2a2;"></p>'
    assert markup == sanitize_html(markup)
def test_uppercase_color_codes_in_style():
    """An uppercase hex colour in ``style`` is expected to pass through unchanged."""
    markup = '<p style="border: 1px solid #A2A2A2;"></p>'
    assert markup == sanitize_html(markup)

View file

@ -0,0 +1,226 @@
from __future__ import absolute_import, division, unicode_literals
import os
import json
import pytest
from .support import get_data_files
from html5lib import constants
from html5lib.filters.lint import Filter as Lint
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers.base import TreeWalker
# pylint:disable=wrong-import-position
optionals_loaded = []
try:
from lxml import etree
optionals_loaded.append("lxml")
except ImportError:
pass
# pylint:enable=wrong-import-position
# Namespace used by JsonWalker for tag tokens that do not spell one out.
default_namespace = constants.namespaces["html"]
class JsonWalker(TreeWalker):
    """Tree walker that replays the JSON token lists of the serializer tests.

    ``self.tree`` is iterated as a sequence of lists whose first item names
    the token type; the remaining items are that token's fields.
    """

    def __iter__(self):
        """Translate every JSON token into the matching tree-walker token(s)."""
        for entry in self.tree:
            kind = entry[0]
            if kind == "StartTag":
                # Four items means the namespace is given explicitly;
                # otherwise the module-level default namespace is used.
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:3]
                yield self.startTag(namespace, name, self._convertAttrib(attrib))
            elif kind == "EndTag":
                if len(entry) == 3:
                    namespace, name = entry[1:3]
                else:
                    namespace = default_namespace
                    name = entry[1]
                yield self.endTag(namespace, name)
            elif kind == "EmptyTag":
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:]
                # emptyTag() is iterated, so relay each item it produces.
                for emitted in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                    yield emitted
            elif kind == "Comment":
                yield self.comment(entry[1])
            elif kind in ("Characters", "SpaceCharacters"):
                for emitted in self.text(entry[1]):
                    yield emitted
            elif kind == "Doctype":
                # Pass on only as many identifiers as the JSON token carries.
                if len(entry) == 4:
                    yield self.doctype(entry[1], entry[2], entry[3])
                elif len(entry) == 3:
                    yield self.doctype(entry[1], entry[2])
                else:
                    yield self.doctype(entry[1])
            else:
                raise ValueError("Unknown token type: " + kind)

    def _convertAttrib(self, attribs):
        """Convert serializer-test attributes to the tree-walker format.

        The serializer tests describe attributes as a list of dicts with
        "namespace", "name" and "value" keys (JSON cannot express tuple keys);
        html5lib tree-walkers use a dict of (namespace, name): value.
        """
        converted = {}
        for item in attribs:
            key = (item["namespace"], item["name"])
            # The same (namespace, name) pair must not occur twice.
            assert key not in converted
            converted[key] = item["value"]
        return converted
def serialize_html(input, options):
    """Serialize the JSON token list ``input`` with ``HTMLSerializer``.

    ``options`` is copied with its keys converted via ``str``; an "encoding"
    entry is taken out and handed to ``render`` instead of the constructor.
    """
    kwargs = {str(key): value for key, value in options.items()}
    encoding = kwargs.pop("encoding", None)
    stream = Lint(JsonWalker(input), False)
    return HTMLSerializer(alphabetical_attributes=True, **kwargs).render(stream, encoding)
def throwsWithLatin1(input):
    """Assert that serializing ``input`` as iso-8859-1 raises UnicodeEncodeError."""
    latin1_options = {"encoding": "iso-8859-1"}
    with pytest.raises(UnicodeEncodeError):
        serialize_html(input, latin1_options)
def testDoctypeName():
    """A doctype name containing U+0101 must fail to encode as iso-8859-1."""
    doctype = ["Doctype", "\u0101"]
    throwsWithLatin1([doctype])
def testDoctypePublicId():
    """A doctype public id containing U+0101 must fail to encode as iso-8859-1."""
    doctype = ["Doctype", "potato", "\u0101"]
    throwsWithLatin1([doctype])
def testDoctypeSystemId():
    """A doctype system id containing U+0101 must fail to encode as iso-8859-1."""
    doctype = ["Doctype", "potato", "potato", "\u0101"]
    throwsWithLatin1([doctype])
def testCdataCharacters():
    """U+0101 inside <style> is expected to be written as ``&amacr;``."""
    tokens = [["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
              ["Characters", "\u0101"]]
    test_serializer(tokens, ["<style>&amacr;"], {"encoding": "iso-8859-1"})
def testCharacters():
    """U+0101 in character data is expected to be written as ``&amacr;``."""
    tokens = [["Characters", "\u0101"]]
    test_serializer(tokens, ["&amacr;"], {"encoding": "iso-8859-1"})
def testStartTagName():
    """A start-tag name containing U+0101 must fail to encode as iso-8859-1."""
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]
    throwsWithLatin1([start_tag])
def testAttributeName():
    """An attribute name containing U+0101 must fail to encode as iso-8859-1."""
    attribute = {"namespace": None, "name": "\u0101", "value": "potato"}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
    throwsWithLatin1([start_tag])
def testAttributeValue():
    """U+0101 in an attribute value is expected to be written as ``&amacr;``."""
    attribute = {"namespace": None, "name": "potato", "value": "\u0101"}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
    test_serializer([start_tag], ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
def testEndTagName():
    """An end-tag name containing U+0101 must fail to encode as iso-8859-1."""
    end_tag = ["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]
    throwsWithLatin1([end_tag])
def testComment():
    """A comment containing U+0101 must fail to encode as iso-8859-1."""
    comment = ["Comment", "\u0101"]
    throwsWithLatin1([comment])
def testThrowsUnknownOption():
    """Constructing the serializer with an unknown keyword must raise TypeError."""
    expectation = pytest.raises(TypeError)
    with expectation:
        HTMLSerializer(foobar=None)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
    """With ``quote_attr_values="spec"`` each of these characters forces quoting."""
    attribute = {"namespace": None, "name": "foo", "value": c}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
    # A value that is itself a double quote is expected in single quotes;
    # every other value is expected in double quotes.
    template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
    test_serializer([start_tag], [template % c], {"quote_attr_values": "spec"})
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000"))
def testLegacyQuoteAttribute(c):
    """With ``quote_attr_values="legacy"`` each of these characters forces quoting."""
    attribute = {"namespace": None, "name": "foo", "value": c}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span", [attribute]]
    # A value that is itself a double quote is expected in single quotes;
    # every other value is expected in double quotes.
    template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
    test_serializer([start_tag], [template % c], {"quote_attr_values": "legacy"})
@pytest.fixture
def lxml_parser():
    """Fixture: an lxml ``XMLParser`` created with ``resolve_entities=False``."""
    parser = etree.XMLParser(resolve_entities=False)
    return parser
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityReplacement(lxml_parser):
    """By default ``&beta;`` is expected to be serialized as the character U+03B2."""
    source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    root = etree.fromstring(source, parser=lxml_parser)
    serialized = serialize(root.getroottree(), tree="lxml", omit_optional_tags=False)
    assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityXML(lxml_parser):
    """``&gt;`` is expected to come back out of the serializer as ``&gt;``."""
    source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
    root = etree.fromstring(source, parser=lxml_parser)
    serialized = serialize(root.getroottree(), tree="lxml", omit_optional_tags=False)
    assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityNoResolve(lxml_parser):
    """With ``resolve_entities=False`` the ``&beta;`` reference is expected verbatim."""
    source = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    root = etree.fromstring(source, parser=lxml_parser)
    serialized = serialize(root.getroottree(), tree="lxml", omit_optional_tags=False,
                           resolve_entities=False)
    assert serialized == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
def param_serializer():
    """Yield ``(input, expected, options)`` for every serializer test case.

    The cases come from the ``*.test`` JSON files that ``get_data_files``
    finds under ``serializer-testdata`` next to this module. A case without
    an "options" entry yields an empty dict for it.
    """
    # Local import so this block does not depend on the module's import list.
    import io

    for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
        # Decode explicitly as UTF-8 instead of relying on the locale's
        # default encoding; io.open accepts ``encoding`` on Python 2 and 3.
        with io.open(filename, encoding="utf-8") as fp:
            tests = json.load(fp)
        for test in tests['tests']:
            yield test["input"], test["expected"], test.get("options", {})
@pytest.mark.parametrize("input, expected, options", param_serializer())
def test_serializer(input, expected, options):
    """Serialize ``input`` and require the result to be one of ``expected``.

    When ``options`` names an encoding, the expected strings are encoded with
    it first so they can be compared with the serializer's output.
    """
    encoding = options.get("encoding", None)
    if encoding:
        expected = [candidate.encode(encoding) for candidate in expected]
    result = serialize_html(input, options)
    if len(expected) == 1:
        # A single candidate gets the more detailed failure message.
        only = expected[0]
        assert only == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (only, result, str(options))
        return
    assert result in expected, "Expected: %s, Received: %s" % (expected, result)

View file

@ -0,0 +1,325 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import codecs
import sys
from io import BytesIO, StringIO
import pytest
import six
from six.moves import http_client, urllib
from html5lib._inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
from html5lib._utils import supports_lone_surrogates
def test_basic():
    """Reading more than is available returns the whole payload."""
    payload = b"abc"
    buffered = BufferedStream(BytesIO(payload))
    assert buffered.read(10) == payload
def test_read_length():
    """Successive reads return consecutive slices, then an empty result."""
    fp = BufferedStream(BytesIO(b"abcdef"))
    reads = ((1, b"a"), (2, b"bc"), (3, b"def"), (4, b""))
    for size, chunk in reads:
        assert fp.read(size) == chunk
def test_tell():
    """``tell()`` reports the offset reached after each read."""
    fp = BufferedStream(BytesIO(b"abcdef"))
    # (bytes requested, data expected, offset expected afterwards)
    steps = ((1, b"a", 1), (2, b"bc", 3), (3, b"def", 6), (4, b"", 6))
    for size, chunk, offset in steps:
        assert fp.read(size) == chunk
        assert fp.tell() == offset
def test_seek():
    """Reads after ``seek()`` return the data found at the new offset."""
    fp = BufferedStream(BytesIO(b"abcdef"))
    # (offset to seek to first, or None; bytes requested; data expected)
    steps = ((None, 1, b"a"),
             (0, 1, b"a"),
             (None, 2, b"bc"),
             (2, 2, b"cd"),
             (4, 2, b"ef"))
    for target, size, chunk in steps:
        if target is not None:
            fp.seek(target)
        assert fp.read(size) == chunk
def test_seek_tell():
    """``tell()`` stays consistent when reads are interleaved with seeks."""
    fp = BufferedStream(BytesIO(b"abcdef"))
    # (offset to seek to first, or None; bytes requested; data; offset after)
    steps = ((None, 1, b"a", 1),
             (0, 1, b"a", 1),
             (None, 2, b"bc", 3),
             (2, 2, b"cd", 4),
             (4, 2, b"ef", 6))
    for target, size, chunk, offset in steps:
        if target is not None:
            fp.seek(target)
        assert fp.read(size) == chunk
        assert fp.tell() == offset
class HTMLUnicodeInputStreamShortChunk(HTMLUnicodeInputStream):
    """``HTMLUnicodeInputStream`` with ``_defaultChunkSize`` lowered to 2 for tests."""

    _defaultChunkSize = 2
class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
    """``HTMLBinaryInputStream`` with ``_defaultChunkSize`` lowered to 2 for tests."""

    _defaultChunkSize = 2
def test_char_ascii():
    """An 'ascii' override is expected to be reported as windows-1252."""
    ascii_stream = HTMLInputStream(b"'", override_encoding='ascii')
    detected = ascii_stream.charEncoding[0]
    assert detected.name == 'windows-1252'
    assert ascii_stream.char() == "'"
def test_char_utf8():
    """A 'utf-8' override decodes a UTF-8 encoded U+2018 back to that character."""
    encoded = '\u2018'.encode('utf-8')
    utf8_stream = HTMLInputStream(encoded, override_encoding='utf-8')
    assert utf8_stream.charEncoding[0].name == 'utf-8'
    assert utf8_stream.char() == '\u2018'
def test_char_win1252():
    """Without an override, windows-1252 bytes decode to the original characters."""
    stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
    assert stream.charEncoding[0].name == 'windows-1252'
    for expected_char in ("\xa9", "\xf1", "\u2019"):
        assert stream.char() == expected_char
def test_bom():
    """A UTF-8 BOM selects utf-8 and is not returned as a character."""
    data = codecs.BOM_UTF8 + b"'"
    bom_stream = HTMLInputStream(data)
    assert bom_stream.charEncoding[0].name == 'utf-8'
    assert bom_stream.char() == "'"
def test_utf_16():
    """UTF-16 input is detected and all 1025 spaces are read back."""
    space_count = 1025
    stream = HTMLInputStream((' ' * space_count).encode('utf-16'))
    assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
    consumed = stream.charsUntil(' ', True)
    assert len(consumed) == 1025
def test_newlines():
    """``\\r\\n`` and ``\\r`` come back as ``\\n`` and positions follow the lines."""
    stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
    assert stream.position() == (1, 0)
    # (stop character, text expected before it, position expected afterwards)
    steps = (('c', "a\nbb\n", (3, 0)),
             ('x', "ccc\ndddd", (4, 4)),
             ('e', "x", (4, 5)))
    for stop, text, position in steps:
        assert stream.charsUntil(stop) == text
        assert stream.position() == position
def test_newlines2():
    """A chunk-sized run of ``\\r`` followed by ``\\n`` yields one ``\\n`` per ``\\r``."""
    chunk_size = HTMLUnicodeInputStream._defaultChunkSize
    data = "\r" * chunk_size + "\n"
    assert HTMLInputStream(data).charsUntil('x') == "\n" * chunk_size
def test_position():
    """``position()`` tracks line and column across reads and ``unget()``."""
    stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")

    def expect_position(line, column):
        assert stream.position() == (line, column)

    expect_position(1, 0)
    assert stream.charsUntil('c') == "a\nbb\n"
    expect_position(3, 0)
    # Push the newline back and read it again with charsUntil().
    stream.unget("\n")
    expect_position(2, 2)
    assert stream.charsUntil('c') == "\n"
    expect_position(3, 0)
    # Same again, this time reading it back with char().
    stream.unget("\n")
    expect_position(2, 2)
    assert stream.char() == "\n"
    expect_position(3, 0)
    assert stream.charsUntil('e') == "ccc\nddd"
    expect_position(4, 3)
    assert stream.charsUntil('h') == "e\nf\ng"
    expect_position(6, 1)
def test_position2():
    """``position()`` advances one column per ``char()`` and wraps on newline."""
    stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
    assert stream.position() == (1, 0)
    # (character expected from char(), position expected afterwards)
    steps = (("a", (1, 1)),
             ("b", (1, 2)),
             ("c", (1, 3)),
             ("\n", (2, 0)),
             ("d", (2, 1)))
    for expected_char, position in steps:
        assert stream.char() == expected_char
        assert stream.position() == position
def test_python_issue_20007():
    """
    Make sure we have a work-around for Python bug #20007
    http://bugs.python.org/issue20007
    """
    raw_response = b"HTTP/1.1 200 Ok\r\n\r\nText"

    class FakeSocket(object):
        """Stand-in socket whose ``makefile`` serves a canned HTTP response."""

        def makefile(self, _mode, _bufsize=None):
            # pylint:disable=unused-argument
            return BytesIO(raw_response)

    response = http_client.HTTPResponse(FakeSocket())
    response.begin()
    assert HTMLInputStream(response).charsUntil(" ") == "Text"
def test_python_issue_20007_b():
    """
    Make sure we have a work-around for Python bug #20007
    http://bugs.python.org/issue20007

    Same scenario as ``test_python_issue_20007``, but the response is
    wrapped in ``urllib.response.addinfourl`` before it is parsed.
    """
    if six.PY2:
        # Report a skip instead of returning silently: a bare return would
        # be counted as a pass although nothing was checked.
        pytest.skip("not run on Python 2")

    class FakeSocket(object):
        """Stand-in socket whose ``makefile`` serves a canned HTTP response."""

        def makefile(self, _mode, _bufsize=None):
            # pylint:disable=unused-argument
            return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")

    source = http_client.HTTPResponse(FakeSocket())
    source.begin()
    wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
    stream = HTMLInputStream(wrapped)
    assert stream.charsUntil(" ") == "Text"
@pytest.mark.parametrize("inp,num",
                         [("\u0000", 0),
                          ("\u0001", 1),
                          ("\u0008", 1),
                          ("\u0009", 0),
                          ("\u000A", 0),
                          ("\u000B", 1),
                          ("\u000C", 0),
                          ("\u000D", 0),
                          ("\u000E", 1),
                          ("\u001F", 1),
                          ("\u0020", 0),
                          ("\u007E", 0),
                          ("\u007F", 1),
                          ("\u009F", 1),
                          ("\u00A0", 0),
                          ("\uFDCF", 0),
                          ("\uFDD0", 1),
                          ("\uFDEF", 1),
                          ("\uFDF0", 0),
                          ("\uFFFD", 0),
                          ("\uFFFE", 1),
                          ("\uFFFF", 1),
                          ("\U0001FFFD", 0),
                          ("\U0001FFFE", 1),
                          ("\U0001FFFF", 1),
                          ("\U0002FFFD", 0),
                          ("\U0002FFFE", 1),
                          ("\U0002FFFF", 1),
                          ("\U0003FFFD", 0),
                          ("\U0003FFFE", 1),
                          ("\U0003FFFF", 1),
                          ("\U0004FFFD", 0),
                          ("\U0004FFFE", 1),
                          ("\U0004FFFF", 1),
                          ("\U0005FFFD", 0),
                          ("\U0005FFFE", 1),
                          ("\U0005FFFF", 1),
                          ("\U0006FFFD", 0),
                          ("\U0006FFFE", 1),
                          ("\U0006FFFF", 1),
                          ("\U0007FFFD", 0),
                          ("\U0007FFFE", 1),
                          ("\U0007FFFF", 1),
                          ("\U0008FFFD", 0),
                          ("\U0008FFFE", 1),
                          ("\U0008FFFF", 1),
                          ("\U0009FFFD", 0),
                          ("\U0009FFFE", 1),
                          ("\U0009FFFF", 1),
                          ("\U000AFFFD", 0),
                          ("\U000AFFFE", 1),
                          ("\U000AFFFF", 1),
                          ("\U000BFFFD", 0),
                          ("\U000BFFFE", 1),
                          ("\U000BFFFF", 1),
                          ("\U000CFFFD", 0),
                          ("\U000CFFFE", 1),
                          ("\U000CFFFF", 1),
                          ("\U000DFFFD", 0),
                          ("\U000DFFFE", 1),
                          ("\U000DFFFF", 1),
                          ("\U000EFFFD", 0),
                          ("\U000EFFFE", 1),
                          ("\U000EFFFF", 1),
                          ("\U000FFFFD", 0),
                          ("\U000FFFFE", 1),
                          ("\U000FFFFF", 1),
                          ("\U0010FFFD", 0),
                          ("\U0010FFFE", 1),
                          ("\U0010FFFF", 1),
                          ("\x01\x01\x01", 3),
                          ("a\x01a\x01a\x01a", 3)])
def test_invalid_codepoints(inp, num):
    """Reading all of ``inp`` must leave exactly ``num`` errors on the stream."""
    stream = HTMLUnicodeInputStream(StringIO(inp))
    # One char() call per item of the input string.
    for _ in inp:
        stream.char()
    error_count = len(stream.errors)
    assert error_count == num
@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
@pytest.mark.parametrize("inp,num",
                         [("'\\uD7FF'", 0),
                          ("'\\uD800'", 1),
                          ("'\\uDBFF'", 1),
                          ("'\\uDC00'", 1),
                          ("'\\uDFFF'", 1),
                          ("'\\uE000'", 0),
                          ("'\\uD800\\uD800\\uD800'", 3),
                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
                          ("'\\uDFFF\\uDBFF'", 2),
                          pytest.param(
                              "'\\uDBFF\\uDFFF'", 2,
                              marks=pytest.mark.skipif(
                                  sys.maxunicode == 0xFFFF,
                                  reason="narrow Python"))])
def test_invalid_codepoints_surrogates(inp, num):
    """Reading surrogate input must leave exactly ``num`` errors on the stream.

    Each parametrized ``inp`` is the source text of a string literal; it is
    evaluated here to obtain the actual test string.
    """
    inp = eval(inp)  # pylint:disable=eval-used
    fp = StringIO(inp)
    highest = max(fp.read())
    if ord(highest) > 0xFFFF:
        pytest.skip("StringIO altered string")
    fp.seek(0)
    stream = HTMLUnicodeInputStream(fp)
    for _ in range(len(inp)):
        stream.char()
    assert len(stream.errors) == num

View file

@ -0,0 +1,66 @@
from __future__ import absolute_import, division, unicode_literals
import io
from six import unichr, text_type
from html5lib._tokenizer import HTMLTokenizer
from html5lib.constants import tokenTypes
def ignore_parse_errors(toks):
    """Yield the tokens of ``toks`` in order, leaving out ParseError tokens."""
    for tok in toks:
        if tok['type'] == tokenTypes['ParseError']:
            continue
        yield tok
def test_maintain_attribute_order():
    """The tokenizer must report attributes in the order they were written."""
    # generate loads to maximize the chance a hash-based mutation will occur
    names = [unichr(code) for code in range(ord('a'), ord('z'))]
    attrs = [(name, text_type(index)) for index, name in enumerate(names)]
    markup = "<span " + " ".join("%s='%s'" % pair for pair in attrs) + ">"

    tokens = list(ignore_parse_errors(HTMLTokenizer(io.StringIO(markup))))

    assert len(tokens) == 1
    start_tag = tokens[0]
    assert start_tag['type'] == tokenTypes['StartTag']

    attrs_tok = start_tag['data']
    assert len(attrs_tok) == len(attrs)
    assert list(attrs_tok.items()) == attrs
def test_duplicate_attribute():
    """Of several attributes with one name only the first value is kept."""
    markup = io.StringIO("<span a=1 a=2 a=3>")
    tokens = list(ignore_parse_errors(HTMLTokenizer(markup)))

    assert len(tokens) == 1
    start_tag = tokens[0]
    assert start_tag['type'] == tokenTypes['StartTag']

    attrs_tok = start_tag['data']
    assert len(attrs_tok) == 1
    assert list(attrs_tok.items()) == [('a', '1')]
def test_maintain_duplicate_attribute_order():
    """A trailing duplicate of ``a`` must not change the attribute order or values."""
    # generate loads to maximize the chance a hash-based mutation will occur
    names = [unichr(code) for code in range(ord('a'), ord('z'))]
    attrs = [(name, text_type(index)) for index, name in enumerate(names)]
    markup = "<span " + " ".join("%s='%s'" % pair for pair in attrs) + " a=100>"

    tokens = list(ignore_parse_errors(HTMLTokenizer(io.StringIO(markup))))

    assert len(tokens) == 1
    start_tag = tokens[0]
    assert start_tag['type'] == tokenTypes['StartTag']

    attrs_tok = start_tag['data']
    assert len(attrs_tok) == len(attrs)
    assert list(attrs_tok.items()) == attrs

View file

@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
from . import support # noqa
import html5lib
from html5lib.treeadapters import sax
from html5lib.treewalkers import getTreeWalker
def test_to_sax():
    """Walking a parsed document through ``sax.to_sax`` emits the expected events."""
    xhtml = 'http://www.w3.org/1999/xhtml'

    def start(name, attrs=None):
        # Expected startElementNS record for an XHTML element.
        return ('startElementNS', (xhtml, name), name, attrs or {})

    def end(name):
        # Expected endElementNS record for an XHTML element.
        return ('endElementNS', (xhtml, name), name)

    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
<title>Directory Listing</title>
<a href="/"><b/></p>
""", treebuilder="etree")
    sax.to_sax(getTreeWalker("etree")(tree), handler)

    expected = [
        'startDocument',
        start('html', {(None, 'xml:lang'): 'en'}),
        start('head'),
        start('title'),
        ('characters', 'Directory Listing'),
        end('title'),
        ('characters', '\n '),
        end('head'),
        start('body'),
        start('a', {(None, 'href'): '/'}),
        start('b'),
        start('p'),
        end('p'),
        ('characters', '\n '),
        end('b'),
        end('a'),
        end('body'),
        end('html'),
        'endDocument',
    ]
    assert expected == handler.visited

View file

@ -0,0 +1,205 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import sys
from six import unichr, text_type
import pytest
try:
import lxml.etree
except ImportError:
pass
from .support import treeTypes
from html5lib import html5parser, treewalkers
from html5lib.filters.lint import Filter as Lint
import re
# Matches a run of two or more consecutive "name=..." lines that share the
# same leading whitespace, so that such a run can be handed to sortattrs().
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)
def sortattrs(x):
    """Return the text matched by ``x`` with its lines sorted.

    ``x`` is a regular-expression match object; the matched text is split
    on newlines, sorted and joined with newlines again.
    """
    return "\n".join(sorted(x.group(0).split("\n")))
def test_all_tokens():
    """Every loaded tree type must produce the expected token sequence."""
    xhtml = 'http://www.w3.org/1999/xhtml'

    def start(name):
        return {'data': {}, 'type': 'StartTag', 'namespace': xhtml, 'name': name}

    def end(name):
        return {'type': 'EndTag', 'namespace': xhtml, 'name': name}

    def text(data):
        return {'data': data, 'type': 'Characters'}

    expected = [
        start('html'),
        start('head'),
        end('head'),
        start('body'),
        text('a'),
        start('div'),
        text('b'),
        end('div'),
        text('c'),
        end('body'),
        end('html'),
    ]
    markup = "<html><head></head><body>a<div>b</div>c</body></html>"
    for _, treeCls in sorted(treeTypes.items()):
        # Tree types that could not be loaded are registered as None.
        if treeCls is None:
            continue
        parser = html5parser.HTMLParser(tree=treeCls["builder"])
        document = parser.parse(markup)
        adapt = treeCls.get("adapter", lambda x: x)
        output = Lint(treeCls["walker"](adapt(document)))
        # NOTE(review): zip() stops at the shorter sequence, so missing or
        # surplus trailing tokens are not detected here.
        for expectedToken, outputToken in zip(expected, output):
            assert expectedToken == outputToken
def set_attribute_on_first_child(docfrag, name, value, treeName):
    """naively sets an attribute on the first child of the document
    fragment passed in"""

    def via_elementtree(fragment):
        return fragment[0].set

    def via_dom(fragment):
        return fragment.firstChild.setAttribute

    # Only the two ElementTree flavours use the ElementTree accessor up
    # front; every other tree name starts with the DOM accessor.
    if treeName in ('ElementTree', 'cElementTree'):
        lookup = via_elementtree
    else:
        lookup = via_dom
    try:
        lookup(docfrag)(name, value)
    except AttributeError:
        # Fall back to the ElementTree accessor when the first attempt fails.
        via_elementtree(docfrag)(name, value)
def param_treewalker_six_mix():
    """Str/Unicode mix. If str attrs added to tree"""

    # On Python 2.x string literals are of type str. Unless, like this
    # file, the programmer imports unicode_literals from __future__.
    # In that case, string literals become objects of type unicode.

    # This test simulates a Py2 user, modifying attributes on a document
    # fragment but not using the u'' syntax nor importing unicode_literals
    anchor_case = (
        '<a href="http://example.com">Example</a>',
        [(str('class'), str('test123'))],
        '<a>\n class="test123"\n href="http://example.com"\n "Example"',
    )
    link_case = (
        '<link href="http://example.com/cow">',
        [(str('rel'), str('alternate'))],
        '<link>\n href="http://example.com/cow"\n rel="alternate"\n "Example"',
    )
    sm_tests = [anchor_case, link_case]

    # Every case is yielded once per registered tree type, in sorted order.
    for tree in sorted(treeTypes.items()):
        for case in sm_tests:
            intext, attrs, expected = case
            yield intext, expected, attrs, tree
@pytest.mark.parametrize("intext, expected, attrs_to_add, tree", param_treewalker_six_mix())
def test_treewalker_six_mix(intext, expected, attrs_to_add, tree):
    """tests what happens when we add attributes to the intext"""
    treeName, treeClass = tree
    if treeClass is None:
        pytest.skip("Treebuilder not loaded")

    document = html5parser.HTMLParser(tree=treeClass["builder"]).parseFragment(intext)
    for attr_name, attr_value in attrs_to_add:
        set_attribute_on_first_child(document, attr_name, attr_value, treeName)

    adapt = treeClass.get("adapter", lambda x: x)
    printed = treewalkers.pprint(treeClass["walker"](adapt(document)))
    # Sort each run of attribute lines before comparing.
    output = attrlist.sub(sortattrs, printed)
    # The check is a substring test: the output has to occur inside the
    # expected text.
    if output in expected:
        return
    raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
@pytest.mark.parametrize("tree,char", itertools.product(sorted(treeTypes.items()), ["x", "\u1234"]))
def test_fragment_single_char(tree, char):
    """A one-character fragment must walk as a single Characters token."""
    treeName, treeClass = tree
    if treeClass is None:
        pytest.skip("Treebuilder not loaded")

    fragment = html5parser.HTMLParser(tree=treeClass["builder"]).parseFragment(char)
    adapt = treeClass.get("adapter", lambda x: x)
    tokens = list(Lint(treeClass["walker"](adapt(fragment))))

    assert tokens == [{'data': char, 'type': 'Characters'}]
@pytest.mark.skipif(treeTypes["lxml"] is None, reason="lxml not importable")
def test_lxml_xml():
    """An lxml XML tree walks as nested start/end tags with namespace ``None``."""
    opening = {'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'}
    closing = {'name': 'div', 'namespace': None, 'type': 'EndTag'}
    # Two nested <div> elements: two start tags followed by two end tags.
    expected = [dict(opening), dict(opening), dict(closing), dict(closing)]

    lxmltree = lxml.etree.fromstring('<div><div></div></div>')
    walk = treewalkers.getTreeWalker('lxml')
    assert list(Lint(walk(lxmltree))) == expected
@pytest.mark.parametrize("treeName",
                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
                                                        pytest.mark.skipif(
                                                            treeName != "lxml" or
                                                            sys.version_info < (3, 7), reason="dict order undef")])
                          for treeName in sorted(treeTypes.keys())])
def test_maintain_attribute_order(treeName):
    """Attributes must come out of the tree walker in source order."""
    treeAPIs = treeTypes[treeName]
    if treeAPIs is None:
        pytest.skip("Treebuilder not loaded")

    # generate loads to maximize the chance a hash-based mutation will occur
    names = [unichr(code) for code in range(ord('a'), ord('z'))]
    attrs = [(name, text_type(index)) for index, name in enumerate(names)]
    data = "<span " + " ".join("%s='%s'" % pair for pair in attrs) + ">"

    fragment = html5parser.HTMLParser(tree=treeAPIs["builder"]).parseFragment(data)
    adapt = treeAPIs.get("adapter", lambda x: x)
    output = list(Lint(treeAPIs["walker"](adapt(fragment))))

    assert len(output) == 2
    start_tag, end_tag = output
    assert start_tag['type'] == 'StartTag'
    assert end_tag['type'] == "EndTag"

    attrs_out = start_tag['data']
    assert len(attrs) == len(attrs_out)
    # Walker attribute keys are (namespace, name) pairs; None is expected here.
    assert list(attrs_out.items()) == [((None, name), value) for name, value in attrs]
@pytest.mark.parametrize("treeName",
                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
                                                        pytest.mark.skipif(
                                                            treeName != "lxml" or
                                                            sys.version_info < (3, 7), reason="dict order undef")])
                          for treeName in sorted(treeTypes.keys())])
def test_maintain_attribute_order_adjusted(treeName):
    """Attribute order must survive when some names come out adjusted.

    The expected result spells ``refx`` as ``refX`` and places ``xml:lang``
    in the XML namespace, with the source order unchanged.
    """
    treeAPIs = treeTypes[treeName]
    if treeAPIs is None:
        pytest.skip("Treebuilder not loaded")

    data = "<svg a=1 refx=2 b=3 xml:lang=4 c=5>"
    fragment = html5parser.HTMLParser(tree=treeAPIs["builder"]).parseFragment(data)
    adapt = treeAPIs.get("adapter", lambda x: x)
    output = list(Lint(treeAPIs["walker"](adapt(fragment))))

    assert len(output) == 2
    start_tag, end_tag = output
    assert start_tag['type'] == 'StartTag'
    assert end_tag['type'] == "EndTag"

    expected_attrs = [
        ((None, 'a'), '1'),
        ((None, 'refX'), '2'),
        ((None, 'b'), '3'),
        (('http://www.w3.org/XML/1998/namespace', 'lang'), '4'),
        ((None, 'c'), '5'),
    ]
    assert list(start_tag['data'].items()) == expected_attrs

View file

@ -0,0 +1,125 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib.filters.whitespace import Filter
from html5lib.constants import spaceCharacters
# Join the imported spaceCharacters into a single string so the tests below
# can concatenate it with other text.
spaceCharacters = "".join(spaceCharacters)
def runTest(input, expected):
    """Run ``input`` through the whitespace ``Filter`` and compare with ``expected``."""
    output = list(Filter(input))
    report = ("\n\nInput:", str(input),
              "\nExpected:", str(expected),
              "\nReceived:", str(output))
    assert expected == output, "\n".join(report)
def runTestUnmodifiedOutput(input):
    """Require the filter to hand ``input`` back unchanged."""
    runTest(input, expected=input)
def testPhrasingElements():
    """A phrasing-content stream with single spaces must pass through unchanged."""
    tokens = [
        {"type": "Characters", "data": "This is a "},
        {"type": "StartTag", "name": "span", "data": []},
        {"type": "Characters", "data": "phrase"},
        {"type": "EndTag", "name": "span", "data": []},
        {"type": "SpaceCharacters", "data": " "},
        {"type": "Characters", "data": "with"},
        {"type": "SpaceCharacters", "data": " "},
        {"type": "StartTag", "name": "em", "data": []},
        {"type": "Characters", "data": "emphasised text"},
        {"type": "EndTag", "name": "em", "data": []},
        {"type": "Characters", "data": " and an "},
        {"type": "StartTag", "name": "img", "data": [["alt", "image"]]},
        {"type": "Characters", "data": "."},
    ]
    runTestUnmodifiedOutput(tokens)
def testLeadingWhitespace():
    """A leading SpaceCharacters token in <p> is expected to become one space."""
    unfiltered = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "SpaceCharacters", "data": spaceCharacters},
        {"type": "Characters", "data": "foo"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    collapsed = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "SpaceCharacters", "data": " "},
        {"type": "Characters", "data": "foo"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    runTest(unfiltered, collapsed)
def testLeadingWhitespaceAsCharacters():
    """Leading whitespace inside a Characters token is expected to become one space."""
    unfiltered = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": spaceCharacters + "foo"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    collapsed = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": " foo"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    runTest(unfiltered, collapsed)
def testTrailingWhitespace():
    """A trailing SpaceCharacters token in <p> is expected to become one space."""
    unfiltered = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo"},
        {"type": "SpaceCharacters", "data": spaceCharacters},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    collapsed = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo"},
        {"type": "SpaceCharacters", "data": " "},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    runTest(unfiltered, collapsed)
def testTrailingWhitespaceAsCharacters():
    """Trailing whitespace inside a Characters token is expected to become one space."""
    unfiltered = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo" + spaceCharacters},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    collapsed = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo "},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    runTest(unfiltered, collapsed)
def testWhitespace():
    """Whitespace between words in a Characters token is expected to become one space."""
    unfiltered = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    collapsed = [
        {"type": "StartTag", "name": "p", "data": []},
        {"type": "Characters", "data": "foo bar"},
        {"type": "EndTag", "name": "p", "data": []},
    ]
    runTest(unfiltered, collapsed)
def testLeadingWhitespaceInPre():
    """A leading SpaceCharacters token inside <pre> must be left unchanged."""
    tokens = [
        {"type": "StartTag", "name": "pre", "data": []},
        {"type": "SpaceCharacters", "data": spaceCharacters},
        {"type": "Characters", "data": "foo"},
        {"type": "EndTag", "name": "pre", "data": []},
    ]
    runTestUnmodifiedOutput(tokens)
def testLeadingWhitespaceAsCharactersInPre():
    """Leading whitespace in a Characters token inside <pre> must be left unchanged."""
    tokens = [
        {"type": "StartTag", "name": "pre", "data": []},
        {"type": "Characters", "data": spaceCharacters + "foo"},
        {"type": "EndTag", "name": "pre", "data": []},
    ]
    runTestUnmodifiedOutput(tokens)
def testTrailingWhitespaceInPre():
    """A trailing SpaceCharacters token inside <pre> must be left unchanged."""
    tokens = [
        {"type": "StartTag", "name": "pre", "data": []},
        {"type": "Characters", "data": "foo"},
        {"type": "SpaceCharacters", "data": spaceCharacters},
        {"type": "EndTag", "name": "pre", "data": []},
    ]
    runTestUnmodifiedOutput(tokens)
def testTrailingWhitespaceAsCharactersInPre():
    """Trailing whitespace in a Characters token inside <pre> must be left unchanged."""
    tokens = [
        {"type": "StartTag", "name": "pre", "data": []},
        {"type": "Characters", "data": "foo" + spaceCharacters},
        {"type": "EndTag", "name": "pre", "data": []},
    ]
    runTestUnmodifiedOutput(tokens)
def testWhitespaceInPre():
    """Whitespace between words inside <pre> must be left unchanged."""
    tokens = [
        {"type": "StartTag", "name": "pre", "data": []},
        {"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
        {"type": "EndTag", "name": "pre", "data": []},
    ]
    runTestUnmodifiedOutput(tokens)

View file

@ -0,0 +1,2 @@
*.dat -text diff
*.test -text diff

34
lib/html5lib/tests/testdata/AUTHORS.rst vendored Normal file
View file

@ -0,0 +1,34 @@
Credits
=======
The ``html5lib`` test data is maintained by:
- James Graham
- Geoffrey Sneddon
Contributors
------------
- Adam Barth
- Andi Sidwell
- Anne van Kesteren
- David Flanagan
- Edward Z. Yang
- Geoffrey Sneddon
- Henri Sivonen
- Ian Hickson
- Jacques Distler
- James Graham
- Lachlan Hunt
- lantis63
- Mark Pilgrim
- Mats Palmgren
- Ms2ger
- Nolan Waite
- Philip Taylor
- Rafael Weinstein
- Ryan King
- Sam Ruby
- Simon Pieters
- Thomas Broyer

21
lib/html5lib/tests/testdata/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and
other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -0,0 +1,51 @@
老子《道德經》 第一~四十章
老子道經
第一章
道可道,非常道。名可名,非常名。無,名天地之始﹔有,名萬物之母。
故常無,欲以觀其妙;常有,欲以觀其徼。此兩者,同出而異名,同謂之
玄。玄之又玄,眾妙之門。
第二章
天下皆知美之為美,斯惡矣﹔皆知善之為善,斯不善矣。故有無相生,難
易相成,長短相形,高下相傾,音聲相和,前後相隨。是以聖人處「無為
」之事,行「不言」之教。萬物作焉而不辭,生而不有,為而不恃,功成
而弗居。夫唯弗居,是以不去。
第三章
不尚賢,使民不爭﹔不貴難得之貨,使民不為盜﹔不見可欲,使民心不亂
。是以「聖人」之治,虛其心,實其腹,弱其志,強其骨。常使民無知無
欲。使夫智者不敢為也。為「無為」,則無不治。
第四章
「道」沖,而用之或不盈。淵兮,似萬物之宗﹔挫其銳,解其紛,和其光
,同其塵﹔湛兮似或存。吾不知誰之子?象帝之先。
第五章
天地不仁,以萬物為芻狗﹔聖人不仁,以百姓為芻狗。天地之間,其猶橐
蘥乎?虛而不屈,動而愈出。多言數窮,不如守中。
第六章
谷神不死,是謂玄牝。玄牝之門,是謂天地根。綿綿若存,用之不勤。
第七章
天長地久。天地所以能長且久者,以其不自生,故能長久。是以聖人後其
身而身先,外其身而身存。非以其無私邪?故能成其私。
第八章
上善若水。水善利萬物而不爭。處眾人之所惡,故幾於道。居善地,心善
淵,與善仁,言善信,政善治,事善能,動善時。夫唯不爭,故無尤。
第九章
持而盈之,不如其已﹔揣而銳之,不可長保。金玉滿堂,莫之能守﹔富貴
而驕,自遺其咎。功遂身退,天之道。

View file

@ -0,0 +1,10 @@
#data
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
<!--京-->
<title>Yahoo! JAPAN</title>
<meta name="description" content="日本最大級のポータルサイト。検索、オークション、ニュース、メール、コミュニティ、ショッピング、など80以上のサービスを展開。あなたの生活をより豊かにする「ライフ・エンジン」を目指していきます。">
<style type="text/css" media="all">
#encoding
euc-jp

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,115 @@
#data
<meta
#encoding
windows-1252
#data
<
#encoding
windows-1252
#data
<!
#encoding
windows-1252
#data
<meta charset = "
#encoding
windows-1252
#data
<meta charset=euc-jp
#encoding
windows-1252
#data
<meta <meta charset='euc-jp'>
#encoding
euc-jp
#data
<meta charset = 'euc-jp'>
#encoding
euc-jp
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
#encoding
utf-8
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf
#encoding
windows-1252
#data
<meta http-equiv="Content-Type<meta charset="utf-8">
#encoding
windows-1252
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
#encoding
utf-8
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8">
#encoding
windows-1252
#data
<meta
#encoding
windows-1252
#data
<meta charset =
#encoding
windows-1252
#data
<meta charset= utf-8
>
#encoding
utf-8
#data
<meta content = "text/html;
#encoding
windows-1252
#data
<meta charset="UTF-16">
#encoding
utf-8
#data
<meta charset="UTF-16LE">
#encoding
utf-8
#data
<meta charset="UTF-16BE">
#encoding
utf-8
#data
<html a=ñ>
<meta charset="utf-8">
#encoding
utf-8
#data
<html ñ>
<meta charset="utf-8">
#encoding
utf-8
#data
<html>ñ
<meta charset="utf-8">
#encoding
utf-8

View file

@ -0,0 +1,125 @@
{"tests": [
{"description": "proper attribute value escaping",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test \"with\" &quot;"}]]],
"expected": ["<span title='test \"with\" &amp;quot;'>"]
},
{"description": "proper attribute value non-quoting",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo"}]]],
"expected": ["<span title=foo>"],
"xhtml": ["<span title=\"foo\">"]
},
{"description": "proper attribute value non-quoting (with <)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo<bar"}]]],
"expected": ["<span title=foo<bar>"],
"xhtml": ["<span title=\"foo&lt;bar\">"]
},
{"description": "proper attribute value quoting (with =)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo=bar"}]]],
"expected": ["<span title=\"foo=bar\">"]
},
{"description": "proper attribute value quoting (with >)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo>bar"}]]],
"expected": ["<span title=\"foo>bar\">"]
},
{"description": "proper attribute value quoting (with \")",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\"bar"}]]],
"expected": ["<span title='foo\"bar'>"]
},
{"description": "proper attribute value quoting (with ')",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar"}]]],
"expected": ["<span title=\"foo'bar\">"]
},
{"description": "proper attribute value quoting (with both \" and ')",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar\"baz"}]]],
"expected": ["<span title=\"foo'bar&quot;baz\">"]
},
{"description": "proper attribute value quoting (with space)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo bar"}]]],
"expected": ["<span title=\"foo bar\">"]
},
{"description": "proper attribute value quoting (with tab)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\tbar"}]]],
"expected": ["<span title=\"foo\tbar\">"]
},
{"description": "proper attribute value quoting (with LF)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\nbar"}]]],
"expected": ["<span title=\"foo\nbar\">"]
},
{"description": "proper attribute value quoting (with CR)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\rbar"}]]],
"expected": ["<span title=\"foo\rbar\">"]
},
{"description": "proper attribute value non-quoting (with linetab)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Bbar"}]]],
"expected": ["<span title=foo\u000Bbar>"],
"xhtml": ["<span title=\"foo\u000Bbar\">"]
},
{"description": "proper attribute value quoting (with form feed)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Cbar"}]]],
"expected": ["<span title=\"foo\u000Cbar\">"]
},
{"description": "void element (as EmptyTag token)",
"input": [["EmptyTag", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "void element (as StartTag token)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "doctype in error",
"input": [["Doctype", "foo"]],
"expected": ["<!DOCTYPE foo>"]
},
{"description": "character data",
"options": {"encoding":"utf-8"},
"input": [["Characters", "a<b>c&d"]],
"expected": ["a&lt;b&gt;c&amp;d"]
},
{"description": "rcdata",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
},
{"description": "doctype",
"input": [["Doctype", "HTML"]],
"expected": ["<!DOCTYPE HTML>"]
},
{"description": "HTML 4.01 DOCTYPE",
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"]],
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"]
},
{"description": "HTML 4.01 DOCTYPE without system identifier",
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN"]],
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"]
},
{"description": "IBM DOCTYPE without public identifier",
"input": [["Doctype", "html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]],
"expected": ["<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"]
}
]}

View file

@ -0,0 +1,66 @@
{"tests": [
{"description": "no encoding",
"options": {"inject_meta_charset": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""],
"xhtml": ["<head></head>"]
},
{"description": "emptytag head",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/title",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml","title",{}], ["Characters", "foo"],["EndTag", "http://www.w3.org/1999/xhtml", "title"], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><title>foo</title>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
},
{"description": "head w/meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ two meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
},
{"description": "head w/robots",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><meta content=noindex name=robots>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
},
{"description": "head w/robots & charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=noindex name=robots><meta charset=utf-8>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
},
{"description": "head w/robots & charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
}
]}

View file

@ -0,0 +1,965 @@
{"tests": [
{"description": "html start-tag followed by text, with attributes",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", [{"namespace": null, "name": "lang", "value": "en"}]], ["Characters", "foo"]],
"expected": ["<html lang=en>foo"]
},
{"description": "html start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Comment", "foo"]],
"expected": ["<html><!--foo-->"]
},
{"description": "html start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", " foo"]],
"expected": ["<html> foo"]
},
{"description": "html start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "html start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}]],
"expected": [""]
},
{"description": "html end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Comment", "foo"]],
"expected": ["</html><!--foo-->"]
},
{"description": "html end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", " foo"]],
"expected": ["</html> foo"]
},
{"description": "html end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "html end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"]],
"expected": [""]
},
{"description": "head start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Comment", "foo"]],
"expected": ["<head><!--foo-->"]
},
{"description": "head start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", " foo"]],
"expected": ["<head> foo"]
},
{"description": "head start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", "foo"]],
"expected": ["<head>foo"]
},
{"description": "head start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head start-tag followed by end-tag (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<head></foo>", "</foo>"]
},
{"description": "empty head element",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""]
},
{"description": "head start-tag followed by empty-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}]],
"expected": ["<head>", ""]
},
{"description": "head end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Comment", "foo"]],
"expected": ["</head><!--foo-->"]
},
{"description": "head end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", " foo"]],
"expected": ["</head> foo"]
},
{"description": "head end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "head end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "head end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""]
},
{"description": "body start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Comment", "foo"]],
"expected": ["<body><!--foo-->"]
},
{"description": "body start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", " foo"]],
"expected": ["<body> foo"]
},
{"description": "body start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "body start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}]],
"expected": [""]
},
{"description": "body end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Comment", "foo"]],
"expected": ["</body><!--foo-->"]
},
{"description": "body end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", " foo"]],
"expected": ["</body> foo"]
},
{"description": "body end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "body end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"]],
"expected": [""]
},
{"description": "li end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Comment", "foo"]],
"expected": ["</li><!--foo-->"]
},
{"description": "li end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", " foo"]],
"expected": ["</li> foo"]
},
{"description": "li end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", "foo"]],
"expected": ["</li>foo"]
},
{"description": "li end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</li><foo>"]
},
{"description": "li end-tag followed by li start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "li", {}]],
"expected": ["<li>"]
},
{"description": "li end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "li end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"]],
"expected": [""]
},
{"description": "dt end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Comment", "foo"]],
"expected": ["</dt><!--foo-->"]
},
{"description": "dt end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", " foo"]],
"expected": ["</dt> foo"]
},
{"description": "dt end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", "foo"]],
"expected": ["</dt>foo"]
},
{"description": "dt end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</dt><foo>"]
},
{"description": "dt end-tag followed by dt start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dt end-tag followed by dd start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dt end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</dt></foo>"]
},
{"description": "dt end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"]],
"expected": ["</dt>"]
},
{"description": "dd end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Comment", "foo"]],
"expected": ["</dd><!--foo-->"]
},
{"description": "dd end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", " foo"]],
"expected": ["</dd> foo"]
},
{"description": "dd end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", "foo"]],
"expected": ["</dd>foo"]
},
{"description": "dd end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</dd><foo>"]
},
{"description": "dd end-tag followed by dd start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dd end-tag followed by dt start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dd end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "dd end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"]],
"expected": [""]
},
{"description": "p end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Comment", "foo"]],
"expected": ["</p><!--foo-->"]
},
{"description": "p end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", " foo"]],
"expected": ["</p> foo"]
},
{"description": "p end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", "foo"]],
"expected": ["</p>foo"]
},
{"description": "p end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</p><foo>"]
},
{"description": "p end-tag followed by address start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "address", {}]],
"expected": ["<address>"]
},
{"description": "p end-tag followed by article start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "article", {}]],
"expected": ["<article>"]
},
{"description": "p end-tag followed by aside start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "aside", {}]],
"expected": ["<aside>"]
},
{"description": "p end-tag followed by blockquote start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "blockquote", {}]],
"expected": ["<blockquote>"]
},
{"description": "p end-tag followed by datagrid start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "datagrid", {}]],
"expected": ["<datagrid>"]
},
{"description": "p end-tag followed by dialog start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dialog", {}]],
"expected": ["<dialog>"]
},
{"description": "p end-tag followed by dir start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dir", {}]],
"expected": ["<dir>"]
},
{"description": "p end-tag followed by div start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
"expected": ["<div>"]
},
{"description": "p end-tag followed by dl start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dl", {}]],
"expected": ["<dl>"]
},
{"description": "p end-tag followed by fieldset start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "fieldset", {}]],
"expected": ["<fieldset>"]
},
{"description": "p end-tag followed by footer start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "footer", {}]],
"expected": ["<footer>"]
},
{"description": "p end-tag followed by form start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "form", {}]],
"expected": ["<form>"]
},
{"description": "p end-tag followed by h1 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h1", {}]],
"expected": ["<h1>"]
},
{"description": "p end-tag followed by h2 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h2", {}]],
"expected": ["<h2>"]
},
{"description": "p end-tag followed by h3 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h3", {}]],
"expected": ["<h3>"]
},
{"description": "p end-tag followed by h4 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h4", {}]],
"expected": ["<h4>"]
},
{"description": "p end-tag followed by h5 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h5", {}]],
"expected": ["<h5>"]
},
{"description": "p end-tag followed by h6 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h6", {}]],
"expected": ["<h6>"]
},
{"description": "p end-tag followed by header start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "header", {}]],
"expected": ["<header>"]
},
{"description": "p end-tag followed by hr empty-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EmptyTag", "hr", {}]],
"expected": ["<hr>"]
},
{"description": "p end-tag followed by menu start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "menu", {}]],
"expected": ["<menu>"]
},
{"description": "p end-tag followed by nav start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "nav", {}]],
"expected": ["<nav>"]
},
{"description": "p end-tag followed by ol start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ol", {}]],
"expected": ["<ol>"]
},
{"description": "p end-tag followed by p start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "p", {}]],
"expected": ["<p>"]
},
{"description": "p end-tag followed by pre start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}]],
"expected": ["<pre>"]
},
{"description": "p end-tag followed by section start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "section", {}]],
"expected": ["<section>"]
},
{"description": "p end-tag followed by table start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "table", {}]],
"expected": ["<table>"]
},
{"description": "p end-tag followed by ul start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ul", {}]],
"expected": ["<ul>"]
},
{"description": "p end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "p end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"]],
"expected": [""]
},
{"description": "optgroup end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Comment", "foo"]],
"expected": ["</optgroup><!--foo-->"]
},
{"description": "optgroup end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", " foo"]],
"expected": ["</optgroup> foo"]
},
{"description": "optgroup end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", "foo"]],
"expected": ["</optgroup>foo"]
},
{"description": "optgroup end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</optgroup><foo>"]
},
{"description": "optgroup end-tag followed by optgroup start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
"expected": ["<optgroup>"]
},
{"description": "optgroup end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "optgroup end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"]],
"expected": [""]
},
{"description": "option end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Comment", "foo"]],
"expected": ["</option><!--foo-->"]
},
{"description": "option end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", " foo"]],
"expected": ["</option> foo"]
},
{"description": "option end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", "foo"]],
"expected": ["</option>foo"]
},
{"description": "option end-tag followed by optgroup start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
"expected": ["<optgroup>"]
},
{"description": "option end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</option><foo>"]
},
{"description": "option end-tag followed by option start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "option", {}]],
"expected": ["<option>"]
},
{"description": "option end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "option end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"]],
"expected": [""]
},
{"description": "colgroup start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Comment", "foo"]],
"expected": ["<colgroup><!--foo-->"]
},
{"description": "colgroup start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", " foo"]],
"expected": ["<colgroup> foo"]
},
{"description": "colgroup start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", "foo"]],
"expected": ["<colgroup>foo"]
},
{"description": "colgroup start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<colgroup><foo>"]
},
{"description": "first colgroup in a table with a col child",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EmptyTag", "col", {}]],
"expected": ["<table><col>"]
},
{"description": "colgroup with a col child, following another colgroup",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "col", {}]],
"expected": ["</colgroup><col>", "<colgroup><col>"]
},
{"description": "colgroup start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<colgroup></foo>"]
},
{"description": "colgroup start-tag at EOF",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}]],
"expected": ["<colgroup>"]
},
{"description": "colgroup end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Comment", "foo"]],
"expected": ["</colgroup><!--foo-->"]
},
{"description": "colgroup end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", " foo"]],
"expected": ["</colgroup> foo"]
},
{"description": "colgroup end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "colgroup end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "colgroup end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "colgroup end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"]],
"expected": [""]
},
{"description": "thead end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Comment", "foo"]],
"expected": ["</thead><!--foo-->"]
},
{"description": "thead end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", " foo"]],
"expected": ["</thead> foo"]
},
{"description": "thead end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", "foo"]],
"expected": ["</thead>foo"]
},
{"description": "thead end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</thead><foo>"]
},
{"description": "thead end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "thead end-tag followed by tfoot start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "thead end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</thead></foo>"]
},
{"description": "thead end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"]],
"expected": ["</thead>"]
},
{"description": "tbody start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Comment", "foo"]],
"expected": ["<tbody><!--foo-->"]
},
{"description": "tbody start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", " foo"]],
"expected": ["<tbody> foo"]
},
{"description": "tbody start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", "foo"]],
"expected": ["<tbody>foo"]
},
{"description": "tbody start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<tbody><foo>"]
},
{"description": "first tbody in a table with a tr child",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<table><tr>"]
},
{"description": "tbody with a tr child, following another tbody",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</tbody><tr>"]
},
{"description": "tbody with a tr child, following a thead",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</thead><tr>"]
},
{"description": "tbody with a tr child, following a tfoot",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</tfoot><tr>"]
},
{"description": "tbody start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<tbody></foo>"]
},
{"description": "tbody start-tag at EOF",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "tbody end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Comment", "foo"]],
"expected": ["</tbody><!--foo-->"]
},
{"description": "tbody end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", " foo"]],
"expected": ["</tbody> foo"]
},
{"description": "tbody end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", "foo"]],
"expected": ["</tbody>foo"]
},
{"description": "tbody end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tbody><foo>"]
},
{"description": "tbody end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>", "</tbody>"]
},
{"description": "tbody end-tag followed by tfoot start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "tbody end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tbody end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"]],
"expected": [""]
},
{"description": "tfoot end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Comment", "foo"]],
"expected": ["</tfoot><!--foo-->"]
},
{"description": "tfoot end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", " foo"]],
"expected": ["</tfoot> foo"]
},
{"description": "tfoot end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", "foo"]],
"expected": ["</tfoot>foo"]
},
{"description": "tfoot end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tfoot><foo>"]
},
{"description": "tfoot end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>", "</tfoot>"]
},
{"description": "tfoot end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tfoot end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"]],
"expected": [""]
},
{"description": "tr end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Comment", "foo"]],
"expected": ["</tr><!--foo-->"]
},
{"description": "tr end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", " foo"]],
"expected": ["</tr> foo"]
},
{"description": "tr end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", "foo"]],
"expected": ["</tr>foo"]
},
{"description": "tr end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tr><foo>"]
},
{"description": "tr end-tag followed by tr start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tr>", "</tr>"]
},
{"description": "tr end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tr end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"]],
"expected": [""]
},
{"description": "td end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Comment", "foo"]],
"expected": ["</td><!--foo-->"]
},
{"description": "td end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", " foo"]],
"expected": ["</td> foo"]
},
{"description": "td end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", "foo"]],
"expected": ["</td>foo"]
},
{"description": "td end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</td><foo>"]
},
{"description": "td end-tag followed by td start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
"expected": ["<td>", "</td>"]
},
{"description": "td end-tag followed by th start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
"expected": ["<th>", "</td>"]
},
{"description": "td end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "td end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"]],
"expected": [""]
},
{"description": "th end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Comment", "foo"]],
"expected": ["</th><!--foo-->"]
},
{"description": "th end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", " foo"]],
"expected": ["</th> foo"]
},
{"description": "th end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", "foo"]],
"expected": ["</th>foo"]
},
{"description": "th end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</th><foo>"]
},
{"description": "th end-tag followed by th start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
"expected": ["<th>", "</th>"]
},
{"description": "th end-tag followed by td start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
"expected": ["<td>", "</th>"]
},
{"description": "th end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "th end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml" , "th"]],
"expected": [""]
}
]}

View file

@ -0,0 +1,60 @@
{"tests":[
{"description": "quote_char=\"'\"",
"options": {"quote_char": "'"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test 'with' quote_char"}]]],
"expected": ["<span title='test &#39;with&#39; quote_char'>"]
},
{"description": "quote_attr_values=true",
"options": {"quote_attr_values": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "button", [{"namespace": null, "name": "disabled", "value" :"disabled"}]]],
"expected": ["<button disabled>"],
"xhtml": ["<button disabled=\"disabled\">"]
},
{"description": "quote_attr_values=true with irrelevant",
"options": {"quote_attr_values": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
"expected": ["<div irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "use_trailing_solidus=true with void element",
"options": {"use_trailing_solidus": true},
"input": [["EmptyTag", "img", {}]],
"expected": ["<img />"]
},
{"description": "use_trailing_solidus=true with non-void element",
"options": {"use_trailing_solidus": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
"expected": ["<div>"]
},
{"description": "minimize_boolean_attributes=false",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
"expected": ["<div irrelevant=irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "minimize_boolean_attributes=false with empty value",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :""}]]],
"expected": ["<div irrelevant=\"\">"]
},
{"description": "escape less than signs in attribute values",
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "a", [{"namespace": null, "name": "title", "value": "a<b>c&d"}]]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
}
]}

View file

@ -0,0 +1,51 @@
{"tests": [
{"description": "bare text with leading spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "\t\r\n\u000C foo"]],
"expected": [" foo"]
},
{"description": "bare text with trailing spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000C"]],
"expected": ["foo "]
},
{"description": "bare text with inner spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000C bar"]],
"expected": ["foo bar"]
},
{"description": "text within <pre>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
"expected": ["<pre>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</pre>"]
},
{"description": "text within <pre>, with inner markup",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C fo"], ["StartTag", "http://www.w3.org/1999/xhtml", "span", {}], ["Characters", "o \t\r\n\u000C b"], ["EndTag", "http://www.w3.org/1999/xhtml", "span"], ["Characters", "ar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
"expected": ["<pre>\t\r\n\u000C fo<span>o \t\r\n\u000C b</span>ar \t\r\n\u000C</pre>"]
},
{"description": "text within <textarea>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "textarea", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "textarea"]],
"expected": ["<textarea>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</textarea>"]
},
{"description": "text within <script>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "script"]],
"expected": ["<script>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</script>"]
},
{"description": "text within <style>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "style"]],
"expected": ["<style>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</style>"]
}
]}

View file

@ -0,0 +1,107 @@
Tokenizer tests
===============
The test format is [JSON](http://www.json.org/). This has the advantage
that the syntax allows backward-compatible extensions to the tests and
the disadvantage that it is relatively verbose.
Basic Structure
---------------
{"tests": [
    {"description": "Test description",
    "input": "input_string",
    "output": [expected_output_tokens],
    "initialStates": [initial_states],
    "lastStartTag": last_start_tag,
"errors": [parse_errors]
    }
]}
Multiple tests per file are allowed simply by adding more objects to the
"tests" list.
Each parse error is an object that contains error `code` and one-based
error location indices: `line` and `col`.
`description`, `input` and `output` are always present. The other values
are optional.
### Test set-up
`test.input` is a string containing the characters to pass to the
tokenizer. Specifically, it represents the characters of the **input
stream**, and so implementations are expected to perform the processing
described in the spec's **Preprocessing the input stream** section
before feeding the result to the tokenizer.
If `test.doubleEscaped` is present and `true`, then `test.input` is not
quite as described above. Instead, it must first be subjected to another
round of unescaping (i.e., in addition to any unescaping involved in the
JSON import), and the result of *that* represents the characters of the
input stream. Currently, the only unescaping required by this option is
to convert each sequence of the form \\uHHHH (where H is a hex digit)
into the corresponding Unicode code point. (Note that this option also
affects the interpretation of `test.output`.)
`test.initialStates` is a list of strings, each being the name of a
tokenizer state which can be one of the following:
- `Data state`
- `PLAINTEXT state`
- `RCDATA state`
- `RAWTEXT state`
- `Script data state`
- `CDATA section state`
The test should be run once for each string, using it
to set the tokenizer's initial state for that run. If
`test.initialStates` is omitted, it defaults to `["Data state"]`.
`test.lastStartTag` is a lowercase string that should be used as "the
tag name of the last start tag to have been emitted from this
tokenizer", referenced in the spec's definition of **appropriate end tag
token**. If it is omitted, it is treated as if "no start tag has been
emitted from this tokenizer".
### Test results
`test.output` is a list of tokens, ordered with the first produced by
the tokenizer the first (leftmost) in the list. The list must match the
**complete** list of tokens that the tokenizer should produce. Valid
tokens are:
["DOCTYPE", name, public_id, system_id, correctness]
["StartTag", name, {attributes}*, true*]
["StartTag", name, {attributes}]
["EndTag", name]
["Comment", data]
["Character", data]
`public_id` and `system_id` are either strings or `null`. `correctness`
is either `true` or `false`; `true` corresponds to the force-quirks flag
being false, and vice-versa.
When the self-closing flag is set, the `StartTag` array has `true` as
its fourth entry. When the flag is not set, the array has only three
entries for backwards compatibility.
All adjacent character tokens are coalesced into a single
`["Character", data]` token.
If `test.doubleEscaped` is present and `true`, then every string within
`test.output` must be further unescaped (as described above) before
comparing with the tokenizer's output.
xmlViolation tests
------------------
`tokenizer/xmlViolation.test` differs from the above in a couple of
ways:
- The name of the single member of the top-level JSON object is
"xmlViolationTests" instead of "tests".
- Each test's expected output assumes that the implementation is applying
the tweaks given in the spec's "Coercing an HTML DOM into an
infoset" section.

View file

@ -0,0 +1,93 @@
{"tests": [
{"description":"PLAINTEXT content model flag",
"initialStates":["PLAINTEXT state"],
"lastStartTag":"plaintext",
"input":"<head>&body;",
"output":[["Character", "<head>&body;"]]},
{"description":"PLAINTEXT with seeming close tag",
"initialStates":["PLAINTEXT state"],
"lastStartTag":"plaintext",
"input":"</plaintext>&body;",
"output":[["Character", "</plaintext>&body;"]]},
{"description":"End tag closing RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp>",
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xMp>",
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp ",
"output":[["Character", "foo"]],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 10 }
]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp",
"output":[["Character", "foo</xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp/",
"output":[["Character", "foo"]],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 10 }
]},
{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp<",
"output":[["Character", "foo</xmp<"]]},
{"description":"End tag with incorrect name in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</foo>bar</xmp>",
"output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
{"description":"Partial end tags leading straight into partial end tags",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</xmp</xmp</xmp>",
"output":[["Character", "</xmp</xmp"], ["EndTag", "xmp"]]},
{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</foo>bar</xmpaar>",
"output":[["Character", "</foo>bar</xmpaar>"]]},
{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp></baz>",
"output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
{"description":"RAWTEXT w/ something looking like an entity",
"initialStates":["RAWTEXT state"],
"lastStartTag":"xmp",
"input":"&foo;",
"output":[["Character", "&foo;"]]},
{"description":"RCDATA w/ an entity",
"initialStates":["RCDATA state"],
"lastStartTag":"textarea",
"input":"&lt;",
"output":[["Character", "<"]]}
]}

View file

@ -0,0 +1,330 @@
{
"tests": [
{
"description":"CR in bogus comment state",
"input":"<?\u000d",
"output":[["Comment", "?\u000a"]],
"errors":[
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
]
},
{
"description":"CRLF in bogus comment state",
"input":"<?\u000d\u000a",
"output":[["Comment", "?\u000a"]],
"errors":[
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
]
},
{
"description":"CRLFLF in bogus comment state",
"input":"<?\u000d\u000a\u000a",
"output":[["Comment", "?\u000a\u000a"]],
"errors":[
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
]
},
{
"description":"Raw NUL replacement",
"doubleEscaped":true,
"initialStates":["RCDATA state", "RAWTEXT state", "PLAINTEXT state", "Script data state"],
"input":"\\u0000",
"output":[["Character", "\\uFFFD"]],
"errors":[
{ "code": "unexpected-null-character", "line": 1, "col": 1 }
]
},
{
"description":"NUL in CDATA section",
"doubleEscaped":true,
"initialStates":["CDATA section state"],
"input":"\\u0000]]>",
"output":[["Character", "\\u0000"]]
},
{
"description":"NUL in script HTML comment",
"doubleEscaped":true,
"initialStates":["Script data state"],
"input":"<!--test\\u0000--><!--test-\\u0000--><!--test--\\u0000-->",
"output":[["Character", "<!--test\\uFFFD--><!--test-\\uFFFD--><!--test--\\uFFFD-->"]],
"errors":[
{ "code": "unexpected-null-character", "line": 1, "col": 9 },
{ "code": "unexpected-null-character", "line": 1, "col": 22 },
{ "code": "unexpected-null-character", "line": 1, "col": 36 }
]
},
{
"description":"NUL in script HTML comment - double escaped",
"doubleEscaped":true,
"initialStates":["Script data state"],
"input":"<!--<script>\\u0000--><!--<script>-\\u0000--><!--<script>--\\u0000-->",
"output":[["Character", "<!--<script>\\uFFFD--><!--<script>-\\uFFFD--><!--<script>--\\uFFFD-->"]],
"errors":[
{ "code": "unexpected-null-character", "line": 1, "col": 13 },
{ "code": "unexpected-null-character", "line": 1, "col": 30 },
{ "code": "unexpected-null-character", "line": 1, "col": 48 }
]
},
{
"description":"EOF in script HTML comment",
"initialStates":["Script data state"],
"input":"<!--test",
"output":[["Character", "<!--test"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 9 }
]
},
{
"description":"EOF in script HTML comment after dash",
"initialStates":["Script data state"],
"input":"<!--test-",
"output":[["Character", "<!--test-"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 10 }
]
},
{
"description":"EOF in script HTML comment after dash dash",
"initialStates":["Script data state"],
"input":"<!--test--",
"output":[["Character", "<!--test--"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 11 }
]
},
{
"description":"EOF in script HTML comment double escaped after dash",
"initialStates":["Script data state"],
"input":"<!--<script>-",
"output":[["Character", "<!--<script>-"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 14 }
]
},
{
"description":"EOF in script HTML comment double escaped after dash dash",
"initialStates":["Script data state"],
"input":"<!--<script>--",
"output":[["Character", "<!--<script>--"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 15 }
]
},
{
"description":"EOF in script HTML comment - double escaped",
"initialStates":["Script data state"],
"input":"<!--<script>",
"output":[["Character", "<!--<script>"]],
"errors":[
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 13 }
]
},
{
"description":"Dash in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- - -->",
"output":[["Character", "<!-- - -->"]]
},
{
"description":"Dash less-than in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- -< -->",
"output":[["Character", "<!-- -< -->"]]
},
{
"description":"Dash at end of script HTML comment",
"initialStates":["Script data state"],
"input":"<!--test--->",
"output":[["Character", "<!--test--->"]]
},
{
"description":"</script> in script HTML comment",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!-- </script> --></script>",
"output":[["Character", "<!-- "], ["EndTag", "script"], ["Character", " -->"], ["EndTag", "script"]]
},
{
"description":"</script> in script HTML comment - double escaped",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!-- <script></script> --></script>",
"output":[["Character", "<!-- <script></script> -->"], ["EndTag", "script"]]
},
{
"description":"</script> in script HTML comment - double escaped with nested <script>",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!-- <script><script></script></script> --></script>",
"output":[["Character", "<!-- <script><script></script>"], ["EndTag", "script"], ["Character", " -->"], ["EndTag", "script"]]
},
{
"description":"</script> in script HTML comment - double escaped with abrupt end",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!-- <script>--></script> --></script>",
"output":[["Character", "<!-- <script>-->"], ["EndTag", "script"], ["Character", " -->"], ["EndTag", "script"]]
},
{
"description":"Incomplete start tag in script HTML comment double escaped",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!--<scrip></script>-->",
"output":[["Character", "<!--<scrip>"], ["EndTag", "script"], ["Character", "-->"]]
},
{
"description":"Unclosed start tag in script HTML comment double escaped",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!--<script</script>-->",
"output":[["Character", "<!--<script"], ["EndTag", "script"], ["Character", "-->"]]
},
{
"description":"Incomplete end tag in script HTML comment double escaped",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!--<script></scrip>-->",
"output":[["Character", "<!--<script></scrip>-->"]]
},
{
"description":"Unclosed end tag in script HTML comment double escaped",
"initialStates":["Script data state"],
"lastStartTag":"script",
"input":"<!--<script></script-->",
"output":[["Character", "<!--<script></script-->"]]
},
{
"description":"leading U+FEFF must pass through",
"initialStates":["Data state", "RCDATA state", "RAWTEXT state", "Script data state"],
"doubleEscaped":true,
"input":"\\uFEFFfoo\\uFEFFbar",
"output":[["Character", "\\uFEFFfoo\\uFEFFbar"]]
},
{
"description":"Non BMP-charref in RCDATA",
"initialStates":["RCDATA state"],
"input":"&NotEqualTilde;",
"output":[["Character", "\u2242\u0338"]]
},
{
"description":"Bad charref in RCDATA",
"initialStates":["RCDATA state"],
"input":"&NotEqualTild;",
"output":[["Character", "&NotEqualTild;"]],
"errors":[
{ "code": "unknown-named-character-reference", "line": 1, "col": 14 }
]
},
{
"description":"lowercase endtags",
"initialStates":["RCDATA state", "RAWTEXT state", "Script data state"],
"lastStartTag":"xmp",
"input":"</XMP>",
"output":[["EndTag","xmp"]]
},
{
"description":"bad endtag (space before name)",
"initialStates":["RCDATA state", "RAWTEXT state", "Script data state"],
"lastStartTag":"xmp",
"input":"</ XMP>",
"output":[["Character","</ XMP>"]]
},
{
"description":"bad endtag (not matching last start tag)",
"initialStates":["RCDATA state", "RAWTEXT state", "Script data state"],
"lastStartTag":"xmp",
"input":"</xm>",
"output":[["Character","</xm>"]]
},
{
"description":"bad endtag (without close bracket)",
"initialStates":["RCDATA state", "RAWTEXT state", "Script data state"],
"lastStartTag":"xmp",
"input":"</xm ",
"output":[["Character","</xm "]]
},
{
"description":"bad endtag (trailing solidus)",
"initialStates":["RCDATA state", "RAWTEXT state", "Script data state"],
"lastStartTag":"xmp",
"input":"</xm/",
"output":[["Character","</xm/"]]
},
{
"description":"Non BMP-charref in attribute",
"input":"<p id=\"&NotEqualTilde;\">",
"output":[["StartTag", "p", {"id":"\u2242\u0338"}]]
},
{
"description":"--!NUL in comment ",
"doubleEscaped":true,
"input":"<!----!\\u0000-->",
"output":[["Comment", "--!\\uFFFD"]],
"errors":[
{ "code": "unexpected-null-character", "line": 1, "col": 8 }
]
},
{
"description":"space EOF after doctype ",
"input":"<!DOCTYPE html ",
"output":[["DOCTYPE", "html", null, null , false]],
"errors":[
{ "code": "eof-in-doctype", "line": 1, "col": 16 }
]
},
{
"description":"CDATA in HTML content",
"input":"<![CDATA[foo]]>",
"output":[["Comment", "[CDATA[foo]]"]],
"errors":[
{ "code": "cdata-in-html-content", "line": 1, "col": 9 }
]
},
{
"description":"CDATA content",
"input":"foo&#32;]]>",
"initialStates":["CDATA section state"],
"output":[["Character", "foo&#32;"]]
},
{
"description":"CDATA followed by HTML content",
"input":"foo&#32;]]>&#32;",
"initialStates":["CDATA section state"],
"output":[["Character", "foo&#32; "]]
},
{
"description":"CDATA with extra bracket",
"input":"foo]]]>",
"initialStates":["CDATA section state"],
"output":[["Character", "foo]"]]
},
{
"description":"CDATA without end marker",
"input":"foo",
"initialStates":["CDATA section state"],
"output":[["Character", "foo"]],
"errors":[
{ "code": "eof-in-cdata", "line": 1, "col": 4 }
]
},
{
"description":"CDATA with single bracket ending",
"input":"foo]",
"initialStates":["CDATA section state"],
"output":[["Character", "foo]"]],
"errors":[
{ "code": "eof-in-cdata", "line": 1, "col": 5 }
]
},
{
"description":"CDATA with two brackets ending",
"input":"foo]]",
"initialStates":["CDATA section state"],
"output":[["Character", "foo]]"]],
"errors":[
{ "code": "eof-in-cdata", "line": 1, "col": 6 }
]
}
]
}

View file

@ -0,0 +1,542 @@
{"tests": [
{"description": "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.",
"input":"<h a=\"&noti;\">",
"output": [["StartTag", "h", {"a": "&noti;"}]]},
{"description": "Entity name requiring semicolon instead followed by the equals sign in a double-quoted attribute value.",
"input":"<h a=\"&lang=\">",
"output": [["StartTag", "h", {"a": "&lang="}]]},
{"description": "Valid entity name followed by the equals sign in a double-quoted attribute value.",
"input":"<h a=\"&not=\">",
"output": [["StartTag", "h", {"a": "&not="}]]},
{"description": "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.",
"input":"<h a='&noti;'>",
"output": [["StartTag", "h", {"a": "&noti;"}]]},
{"description": "Entity name requiring semicolon instead followed by the equals sign in a single-quoted attribute value.",
"input":"<h a='&lang='>",
"output": [["StartTag", "h", {"a": "&lang="}]]},
{"description": "Valid entity name followed by the equals sign in a single-quoted attribute value.",
"input":"<h a='&not='>",
"output": [["StartTag", "h", {"a": "&not="}]]},
{"description": "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.",
"input":"<h a=&noti;>",
"output": [["StartTag", "h", {"a": "&noti;"}]]},
{"description": "Entity name requiring semicolon instead followed by the equals sign in an unquoted attribute value.",
"input":"<h a=&lang=>",
"output": [["StartTag", "h", {"a": "&lang="}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 11 }
]},
{"description": "Valid entity name followed by the equals sign in an unquoted attribute value.",
"input":"<h a=&not=>",
"output": [["StartTag", "h", {"a": "&not="}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 10 }
]},
{"description": "Ambiguous ampersand.",
"input":"&rrrraannddom;",
"output": [["Character", "&rrrraannddom;"]],
"errors":[
{ "code": "unknown-named-character-reference", "line": 1, "col": 14 }
]},
{"description": "Semicolonless named entity 'not' followed by 'i;' in body",
"input":"&noti;",
"output": [["Character", "\u00ACi;"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 }
]},
{"description": "Very long undefined named entity in body",
"input":"&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;",
"output": [["Character", "&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;"]],
"errors":[
{ "code": "unknown-named-character-reference", "line": 1, "col": 950 }
]},
{"description": "CR as numeric entity",
"input":"&#013;",
"output": [["Character", "\r"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 7 }
]},
{"description": "CR as hexadecimal numeric entity",
"input":"&#x00D;",
"output": [["Character", "\r"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EURO SIGN numeric entity.",
"input":"&#0128;",
"output": [["Character", "\u20AC"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0129;",
"output": [["Character", "\u0081"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK numeric entity.",
"input":"&#0130;",
"output": [["Character", "\u201A"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK numeric entity.",
"input":"&#0131;",
"output": [["Character", "\u0192"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK numeric entity.",
"input":"&#0132;",
"output": [["Character", "\u201E"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 HORIZONTAL ELLIPSIS numeric entity.",
"input":"&#0133;",
"output": [["Character", "\u2026"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DAGGER numeric entity.",
"input":"&#0134;",
"output": [["Character", "\u2020"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DOUBLE DAGGER numeric entity.",
"input":"&#0135;",
"output": [["Character", "\u2021"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT numeric entity.",
"input":"&#0136;",
"output": [["Character", "\u02C6"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 PER MILLE SIGN numeric entity.",
"input":"&#0137;",
"output": [["Character", "\u2030"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON numeric entity.",
"input":"&#0138;",
"output": [["Character", "\u0160"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK numeric entity.",
"input":"&#0139;",
"output": [["Character", "\u2039"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE numeric entity.",
"input":"&#0140;",
"output": [["Character", "\u0152"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0141;",
"output": [["Character", "\u008D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON numeric entity.",
"input":"&#0142;",
"output": [["Character", "\u017D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0143;",
"output": [["Character", "\u008F"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0144;",
"output": [["Character", "\u0090"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK numeric entity.",
"input":"&#0145;",
"output": [["Character", "\u2018"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK numeric entity.",
"input":"&#0146;",
"output": [["Character", "\u2019"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK numeric entity.",
"input":"&#0147;",
"output": [["Character", "\u201C"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK numeric entity.",
"input":"&#0148;",
"output": [["Character", "\u201D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 BULLET numeric entity.",
"input":"&#0149;",
"output": [["Character", "\u2022"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EN DASH numeric entity.",
"input":"&#0150;",
"output": [["Character", "\u2013"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EM DASH numeric entity.",
"input":"&#0151;",
"output": [["Character", "\u2014"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SMALL TILDE numeric entity.",
"input":"&#0152;",
"output": [["Character", "\u02DC"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 TRADE MARK SIGN numeric entity.",
"input":"&#0153;",
"output": [["Character", "\u2122"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON numeric entity.",
"input":"&#0154;",
"output": [["Character", "\u0161"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK numeric entity.",
"input":"&#0155;",
"output": [["Character", "\u203A"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LIGATURE OE numeric entity.",
"input":"&#0156;",
"output": [["Character", "\u0153"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0157;",
"output": [["Character", "\u009D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EURO SIGN hexadecimal numeric entity.",
"input":"&#x080;",
"output": [["Character", "\u20AC"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x081;",
"output": [["Character", "\u0081"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x082;",
"output": [["Character", "\u201A"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK hexadecimal numeric entity.",
"input":"&#x083;",
"output": [["Character", "\u0192"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x084;",
"output": [["Character", "\u201E"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 HORIZONTAL ELLIPSIS hexadecimal numeric entity.",
"input":"&#x085;",
"output": [["Character", "\u2026"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DAGGER hexadecimal numeric entity.",
"input":"&#x086;",
"output": [["Character", "\u2020"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 DOUBLE DAGGER hexadecimal numeric entity.",
"input":"&#x087;",
"output": [["Character", "\u2021"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT hexadecimal numeric entity.",
"input":"&#x088;",
"output": [["Character", "\u02C6"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 PER MILLE SIGN hexadecimal numeric entity.",
"input":"&#x089;",
"output": [["Character", "\u2030"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON hexadecimal numeric entity.",
"input":"&#x08A;",
"output": [["Character", "\u0160"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x08B;",
"output": [["Character", "\u2039"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE hexadecimal numeric entity.",
"input":"&#x08C;",
"output": [["Character", "\u0152"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x08D;",
"output": [["Character", "\u008D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON hexadecimal numeric entity.",
"input":"&#x08E;",
"output": [["Character", "\u017D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x08F;",
"output": [["Character", "\u008F"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x090;",
"output": [["Character", "\u0090"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x091;",
"output": [["Character", "\u2018"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x092;",
"output": [["Character", "\u2019"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x093;",
"output": [["Character", "\u201C"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x094;",
"output": [["Character", "\u201D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 BULLET hexadecimal numeric entity.",
"input":"&#x095;",
"output": [["Character", "\u2022"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EN DASH hexadecimal numeric entity.",
"input":"&#x096;",
"output": [["Character", "\u2013"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 EM DASH hexadecimal numeric entity.",
"input":"&#x097;",
"output": [["Character", "\u2014"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SMALL TILDE hexadecimal numeric entity.",
"input":"&#x098;",
"output": [["Character", "\u02DC"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 TRADE MARK SIGN hexadecimal numeric entity.",
"input":"&#x099;",
"output": [["Character", "\u2122"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON hexadecimal numeric entity.",
"input":"&#x09A;",
"output": [["Character", "\u0161"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x09B;",
"output": [["Character", "\u203A"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LIGATURE OE hexadecimal numeric entity.",
"input":"&#x09C;",
"output": [["Character", "\u0153"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x09D;",
"output": [["Character", "\u009D"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN SMALL LETTER Z WITH CARON hexadecimal numeric entity.",
"input":"&#x09E;",
"output": [["Character", "\u017E"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Y WITH DIAERESIS hexadecimal numeric entity.",
"input":"&#x09F;",
"output": [["Character", "\u0178"]],
"errors":[
{ "code": "control-character-reference", "line": 1, "col": 8 }
]},
{"description": "Decimal numeric entity followed by hex character a.",
"input":"&#97a",
"output": [["Character", "aa"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 }
]},
{"description": "Decimal numeric entity followed by hex character A.",
"input":"&#97A",
"output": [["Character", "aA"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 }
]},
{"description": "Decimal numeric entity followed by hex character f.",
"input":"&#97f",
"output": [["Character", "af"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 }
]},
{"description": "Decimal numeric entity followed by hex character F.",
"input":"&#97F",
"output": [["Character", "aF"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 }
]}
]}

View file

@ -0,0 +1,36 @@
{"tests": [
{"description":"Commented close tag in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--</xmp>--></xmp>",
"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
{"description":"Bogus comment in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-->baz</xmp>",
"output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--></xmp><!-->baz</xmp>",
"output":[["Character", "foo<!-->"], ["EndTag", "xmp"], ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]],
"errors":[
{ "code": "abrupt-closing-of-empty-comment", "line": 1, "col": 19 }
]},
{"description":"Commented entities in RCDATA",
"initialStates":["RCDATA state"],
"lastStartTag":"xmp",
"input":" &amp; <!-- &amp; --> &amp; </xmp>",
"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
]}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,9 @@
{"tests": [
{"description":"<!---- >",
"input":"<!---- >",
"output":[["Comment","-- >"]],
"errors":[
{ "code": "eof-in-comment", "line": 1, "col": 9 }
]}
]}

View file

@ -0,0 +1,349 @@
{"tests": [
{"description":"Correct Doctype lowercase",
"input":"<!DOCTYPE html>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype uppercase",
"input":"<!DOCTYPE HTML>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype mixed case",
"input":"<!DOCTYPE HtMl>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype case with EOF",
"input":"<!DOCTYPE HtMl",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "eof-in-doctype", "line": 1, "col": 15 }
]},
{"description":"Truncated doctype start",
"input":"<!DOC>",
"output":[["Comment", "DOC"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 }
]},
{"description":"Doctype in error",
"input":"<!DOCTYPE foo>",
"output":[["DOCTYPE", "foo", null, null, true]]},
{"description":"Single Start Tag",
"input":"<h>",
"output":[["StartTag", "h", {}]]},
{"description":"Empty end tag",
"input":"</>",
"output":[],
"errors":[
{ "code": "missing-end-tag-name", "line": 1, "col": 3 }
]},
{"description":"Empty start tag",
"input":"<>",
"output":[["Character", "<>"]],
"errors":[
{ "code": "invalid-first-character-of-tag-name", "line": 1, "col": 2 }
]},
{"description":"Start Tag w/attribute",
"input":"<h a='b'>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start Tag w/attribute no quotes",
"input":"<h a=b>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start/End Tag",
"input":"<h></h>",
"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
{"description":"Two unclosed start tags",
"input":"<p>One<p>Two",
"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
{"description":"End Tag w/attribute",
"input":"<h></h a='b'>",
"output":[["StartTag", "h", {}], ["EndTag", "h"]],
"errors":[
{ "code": "end-tag-with-attributes", "line": 1, "col": 13 }
]},
{"description":"Multiple atts",
"input":"<h a='b' c='d'>",
"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
{"description":"Multiple atts no space",
"input":"<h a='b'c='d'>",
"output":[["StartTag", "h", {"a":"b", "c":"d"}]],
"errors":[
{ "code": "missing-whitespace-between-attributes", "line": 1, "col": 9 }
]},
{"description":"Repeated attr",
"input":"<h a='b' a='d'>",
"output":[["StartTag", "h", {"a":"b"}]],
"errors":[
{ "code": "duplicate-attribute", "line": 1, "col": 11 }
]},
{"description":"Simple comment",
"input":"<!--comment-->",
"output":[["Comment", "comment"]]},
{"description":"Comment, Central dash no space",
"input":"<!----->",
"output":[["Comment", "-"]]},
{"description":"Comment, two central dashes",
"input":"<!-- --comment -->",
"output":[["Comment", " --comment "]]},
{"description":"Comment, central less-than bang",
"input":"<!--<!-->",
"output":[["Comment", "<!"]]},
{"description":"Unfinished comment",
"input":"<!--comment",
"output":[["Comment", "comment"]],
"errors":[
{ "code": "eof-in-comment", "line": 1, "col": 12 }
]},
{"description":"Unfinished comment after start of nested comment",
"input":"<!-- <!--",
"output":[["Comment", " <!"]],
"errors":[
{ "code": "eof-in-comment", "line": 1, "col": 10 }
]},
{"description":"Start of a comment",
"input":"<!-",
"output":[["Comment", "-"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 }
]},
{"description":"Short comment",
"input":"<!-->",
"output":[["Comment", ""]],
"errors":[
{ "code": "abrupt-closing-of-empty-comment", "line": 1, "col": 5 }
]},
{"description":"Short comment two",
"input":"<!--->",
"output":[["Comment", ""]],
"errors":[
{ "code": "abrupt-closing-of-empty-comment", "line": 1, "col": 6 }
]},
{"description":"Short comment three",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"< in comment",
"input":"<!-- <test-->",
"output":[["Comment", " <test"]]},
{"description":"<! in comment",
"input":"<!-- <!test-->",
"output":[["Comment", " <!test"]]},
{"description":"<!- in comment",
"input":"<!-- <!-test-->",
"output":[["Comment", " <!-test"]]},
{"description":"Nested comment",
"input":"<!-- <!--test-->",
"output":[["Comment", " <!--test"]],
"errors":[
{ "code": "nested-comment", "line": 1, "col": 10 }
]},
{"description":"Nested comment with extra <",
"input":"<!-- <<!--test-->",
"output":[["Comment", " <<!--test"]],
"errors":[
{ "code": "nested-comment", "line": 1, "col": 11 }
]},
{"description":"< in script data",
"initialStates":["Script data state"],
"input":"<test-->",
"output":[["Character", "<test-->"]]},
{"description":"<! in script data",
"initialStates":["Script data state"],
"input":"<!test-->",
"output":[["Character", "<!test-->"]]},
{"description":"<!- in script data",
"initialStates":["Script data state"],
"input":"<!-test-->",
"output":[["Character", "<!-test-->"]]},
{"description":"Escaped script data",
"initialStates":["Script data state"],
"input":"<!--test-->",
"output":[["Character", "<!--test-->"]]},
{"description":"< in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- < test -->",
"output":[["Character", "<!-- < test -->"]]},
{"description":"</ in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- </ test -->",
"output":[["Character", "<!-- </ test -->"]]},
{"description":"Start tag in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- <test> -->",
"output":[["Character", "<!-- <test> -->"]]},
{"description":"End tag in script HTML comment",
"initialStates":["Script data state"],
"input":"<!-- </test> -->",
"output":[["Character", "<!-- </test> -->"]]},
{"description":"- in script HTML comment double escaped",
"initialStates":["Script data state"],
"input":"<!--<script>-</script>-->",
"output":[["Character", "<!--<script>-</script>-->"]]},
{"description":"-- in script HTML comment double escaped",
"initialStates":["Script data state"],
"input":"<!--<script>--</script>-->",
"output":[["Character", "<!--<script>--</script>-->"]]},
{"description":"--- in script HTML comment double escaped",
"initialStates":["Script data state"],
"input":"<!--<script>---</script>-->",
"output":[["Character", "<!--<script>---</script>-->"]]},
{"description":"- spaced in script HTML comment double escaped",
"initialStates":["Script data state"],
"input":"<!--<script> - </script>-->",
"output":[["Character", "<!--<script> - </script>-->"]]},
{"description":"-- spaced in script HTML comment double escaped",
"initialStates":["Script data state"],
"input":"<!--<script> -- </script>-->",
"output":[["Character", "<!--<script> -- </script>-->"]]},
{"description":"Ampersand EOF",
"input":"&",
"output":[["Character", "&"]]},
{"description":"Ampersand ampersand EOF",
"input":"&&",
"output":[["Character", "&&"]]},
{"description":"Ampersand space EOF",
"input":"& ",
"output":[["Character", "& "]]},
{"description":"Unfinished entity",
"input":"&f",
"output":[["Character", "&f"]]},
{"description":"Ampersand, number sign",
"input":"&#",
"output":[["Character", "&#"]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 3 }
]},
{"description":"Unfinished numeric entity",
"input":"&#x",
"output":[["Character", "&#x"]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 4 }
]},
{"description":"Entity with trailing semicolon (1)",
"input":"I'm &not;it",
"output":[["Character","I'm \u00ACit"]]},
{"description":"Entity with trailing semicolon (2)",
"input":"I'm &notin;",
"output":[["Character","I'm \u2209"]]},
{"description":"Entity without trailing semicolon (1)",
"input":"I'm &notit",
"output":[["Character","I'm \u00ACit"]],
"errors": [
{"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 }
]},
{"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin",
"output":[["Character","I'm \u00ACin"]],
"errors": [
{"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 }
]},
{"description":"Partial entity match at end of file",
"input":"I'm &no",
"output":[["Character","I'm &no"]]},
{"description":"Non-ASCII character reference name",
"input":"&\u00AC;",
"output":[["Character", "&\u00AC;"]]},
{"description":"ASCII decimal entity",
"input":"&#0036;",
"output":[["Character","$"]]},
{"description":"ASCII hexadecimal entity",
"input":"&#x3f;",
"output":[["Character","?"]]},
{"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":[["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":[["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon ending in i",
"input":"<h a='&noti'>",
"output":[["StartTag", "h", {"a":"&noti"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":[["StartTag", "h", {"a":"\u00A9"}]],
"errors": [
{"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 12 }
]},
{"description":"Unquoted attribute ending in ampersand",
"input":"<s o=& t>",
"output":[["StartTag","s",{"o":"&","t":""}]]},
{"description":"Unquoted attribute at end of tag with final character of &, with tag followed by characters",
"input":"<a a=a&>foo",
"output":[["StartTag", "a", {"a":"a&"}], ["Character", "foo"]]},
{"description":"plaintext element",
"input":"<plaintext>foobar",
"output":[["StartTag","plaintext",{}], ["Character","foobar"]]},
{"description":"Open angled bracket in unquoted attribute value state",
"input":"<a a=f<>",
"output":[["StartTag", "a", {"a":"f<"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 7 }
]}
]}

View file

@ -0,0 +1,275 @@
{"tests": [
{"description":"DOCTYPE without name",
"input":"<!DOCTYPE>",
"output":[["DOCTYPE", null, null, null, false]],
"errors":[
{ "code": "missing-doctype-name", "line": 1, "col": 10 }
]},
{"description":"DOCTYPE without space before name",
"input":"<!DOCTYPEhtml>",
"output":[["DOCTYPE", "html", null, null, true]],
"errors":[
{ "code": "missing-whitespace-before-doctype-name", "line": 1, "col": 10 }
]},
{"description":"Incorrect DOCTYPE without a space before name",
"input":"<!DOCTYPEfoo>",
"output":[["DOCTYPE", "foo", null, null, true]],
"errors":[
{ "code": "missing-whitespace-before-doctype-name", "line": 1, "col": 10 }
]},
{"description":"DOCTYPE with publicId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
{"description":"DOCTYPE with EOF after PUBLIC",
"input":"<!DOCTYPE html PUBLIC",
"output":[["DOCTYPE", "html", null, null, false]],
"errors": [
{ "code": "eof-in-doctype", "col": 22, "line": 1 }
]},
{"description":"DOCTYPE with EOF after PUBLIC '",
"input":"<!DOCTYPE html PUBLIC '",
"output":[["DOCTYPE", "html", "", null, false]],
"errors": [
{ "code": "eof-in-doctype", "col": 24, "line": 1 }
]},
{"description":"DOCTYPE with EOF after PUBLIC 'x",
"input":"<!DOCTYPE html PUBLIC 'x",
"output":[["DOCTYPE", "html", "x", null, false]],
"errors": [
{ "code": "eof-in-doctype", "col": 25, "line": 1 }
]},
{"description":"DOCTYPE with systemId",
"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with single-quoted systemId",
"input":"<!DOCTYPE html SYSTEM '-//W3C//DTD HTML Transitional 4.01//EN'>",
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with publicId and systemId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with > in double-quoted publicId",
"input":"<!DOCTYPE html PUBLIC \">x",
"output":[["DOCTYPE", "html", "", null, false], ["Character", "x"]],
"errors": [
{ "code": "abrupt-doctype-public-identifier", "col": 24, "line": 1 }
]},
{"description":"DOCTYPE with > in single-quoted publicId",
"input":"<!DOCTYPE html PUBLIC '>x",
"output":[["DOCTYPE", "html", "", null, false], ["Character", "x"]],
"errors": [
{ "code": "abrupt-doctype-public-identifier", "col": 24, "line": 1 }
]},
{"description":"DOCTYPE with > in double-quoted systemId",
"input":"<!DOCTYPE html PUBLIC \"foo\" \">x",
"output":[["DOCTYPE", "html", "foo", "", false], ["Character", "x"]],
"errors": [
{ "code": "abrupt-doctype-system-identifier", "col": 30, "line": 1 }
]},
{"description":"DOCTYPE with > in single-quoted systemId",
"input":"<!DOCTYPE html PUBLIC 'foo' '>x",
"output":[["DOCTYPE", "html", "foo", "", false], ["Character", "x"]],
"errors": [
{ "code": "abrupt-doctype-system-identifier", "col": 30, "line": 1 }
]},
{"description":"Incomplete doctype",
"input":"<!DOCTYPE html ",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "eof-in-doctype", "line": 1, "col": 16 }
]},
{"description":"Numeric entity representing the NUL character",
"input":"&#0000;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "null-character-reference", "line": 1, "col": 8 }
]},
{"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "null-character-reference", "line": 1, "col": 9 }
]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 11 }
]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 13 }
]},
{"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#xD869;&#xDED6;",
"output":[["Character", "\uFFFD\uFFFD"]],
"errors":[
{ "code": "surrogate-character-reference", "line": 1, "col": 9 },
{ "code": "surrogate-character-reference", "line": 1, "col": 17 }
]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;",
"output":[["Character", "\uABCD"]]},
{"description":"Entity without a name",
"input":"&;",
"output":[["Character", "&;"]]},
{"description":"Unescaped ampersand in attribute value",
"input":"<h a='&'>",
"output":[["StartTag", "h", { "a":"&" }]]},
{"description":"StartTag containing <",
"input":"<a<b>",
"output":[["StartTag", "a<b", { }]]},
{"description":"Non-void element containing trailing /",
"input":"<h/>",
"output":[["StartTag","h",{},true]]},
{"description":"Void element with permitted slash",
"input":"<br/>",
"output":[["StartTag","br",{},true]]},
{"description":"Void element with permitted slash (with attribute)",
"input":"<br foo='bar'/>",
"output":[["StartTag","br",{"foo":"bar"},true]]},
{"description":"StartTag containing /",
"input":"<h/a='b'>",
"output":[["StartTag", "h", { "a":"b" }]],
"errors":[
{ "code": "unexpected-solidus-in-tag", "line": 1, "col": 4 }
]},
{"description":"Double-quoted attribute value",
"input":"<h a=\"b\">",
"output":[["StartTag", "h", { "a":"b" }]]},
{"description":"Unescaped </",
"input":"</",
"output":[["Character", "</"]],
"errors":[
{ "code": "eof-before-tag-name", "line": 1, "col": 3 }
]},
{"description":"Illegal end tag name",
"input":"</1>",
"output":[["Comment", "1"]],
"errors":[
{ "code": "invalid-first-character-of-tag-name", "line": 1, "col": 3 }
]},
{"description":"Simili processing instruction",
"input":"<?namespace>",
"output":[["Comment", "?namespace"]],
"errors":[
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
]},
{"description":"A bogus comment stops at >, even if preceeded by two dashes",
"input":"<?foo-->",
"output":[["Comment", "?foo--"]],
"errors":[
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
]},
{"description":"Unescaped <",
"input":"foo < bar",
"output":[["Character", "foo < bar"]],
"errors":[
{ "code": "invalid-first-character-of-tag-name", "line": 1, "col": 6 }
]},
{"description":"Null Byte Replacement",
"input":"\u0000",
"output":[["Character", "\u0000"]],
"errors":[
{ "code": "unexpected-null-character", "line": 1, "col": 1 }
]},
{"description":"Comment with dash",
"input":"<!---x",
"output":[["Comment", "-x"]],
"errors":[
{ "code": "eof-in-comment", "line": 1, "col": 7 }
]},
{"description":"Entity + newline",
"input":"\nx\n&gt;\n",
"output":[["Character","\nx\n>\n"]]},
{"description":"Start tag with no attributes but space before the greater-than sign",
"input":"<h >",
"output":[["StartTag", "h", {}]]},
{"description":"Empty attribute followed by uppercase attribute",
"input":"<h a B=''>",
"output":[["StartTag", "h", {"a":"", "b":""}]]},
{"description":"Double-quote after attribute name",
"input":"<h a \">",
"output":[["StartTag", "h", {"a":"", "\"":""}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 6 }
]},
{"description":"Single-quote after attribute name",
"input":"<h a '>",
"output":[["StartTag", "h", {"a":"", "'":""}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 6 }
]},
{"description":"Empty end tag with following characters",
"input":"a</>bc",
"output":[["Character", "abc"]],
"errors":[
{ "code": "missing-end-tag-name", "line": 1, "col": 4 }
]},
{"description":"Empty end tag with following tag",
"input":"a</><b>c",
"output":[["Character", "a"], ["StartTag", "b", {}], ["Character", "c"]],
"errors":[
{ "code": "missing-end-tag-name", "line": 1, "col": 4 }
]},
{"description":"Empty end tag with following comment",
"input":"a</><!--b-->c",
"output":[["Character", "a"], ["Comment", "b"], ["Character", "c"]],
"errors":[
{ "code": "missing-end-tag-name", "line": 1, "col": 4 }
]},
{"description":"Empty end tag with following end tag",
"input":"a</></b>c",
"output":[["Character", "a"], ["EndTag", "b"], ["Character", "c"]],
"errors":[
{ "code": "missing-end-tag-name", "line": 1, "col": 4 }
]}
]}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,532 @@
{"tests": [
{"description":"< in attribute name",
"input":"<z/0 <>",
"output":[["StartTag", "z", {"0": "", "<": ""}]],
"errors":[
{ "code": "unexpected-solidus-in-tag", "line": 1, "col": 4 },
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 7 }
]},
{"description":"< in unquoted attribute value",
"input":"<z x=<>",
"output":[["StartTag", "z", {"x": "<"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 6 }
]},
{"description":"= in unquoted attribute value",
"input":"<z z=z=z>",
"output":[["StartTag", "z", {"z": "z=z"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 7 }
]},
{"description":"= attribute",
"input":"<z =>",
"output":[["StartTag", "z", {"=": ""}]],
"errors":[
{ "code": "unexpected-equals-sign-before-attribute-name", "line": 1, "col": 4 }
]},
{"description":"== attribute",
"input":"<z ==>",
"output":[["StartTag", "z", {"=": ""}]],
"errors":[
{ "code": "unexpected-equals-sign-before-attribute-name", "line": 1, "col": 4 },
{ "code": "missing-attribute-value", "line": 1, "col": 6 }
]},
{"description":"=== attribute",
"input":"<z ===>",
"output":[["StartTag", "z", {"=": "="}]],
"errors":[
{ "code": "unexpected-equals-sign-before-attribute-name", "line": 1, "col": 4 },
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 6 }
]},
{"description":"==== attribute",
"input":"<z ====>",
"output":[["StartTag", "z", {"=": "=="}]],
"errors":[
{ "code": "unexpected-equals-sign-before-attribute-name", "line": 1, "col": 4 },
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 6 },
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 7 }
]},
{"description":"\" after ampersand in double-quoted attribute value",
"input":"<z z=\"&\">",
"output":[["StartTag", "z", {"z": "&"}]]},
{"description":"' after ampersand in double-quoted attribute value",
"input":"<z z=\"&'\">",
"output":[["StartTag", "z", {"z": "&'"}]]},
{"description":"' after ampersand in single-quoted attribute value",
"input":"<z z='&'>",
"output":[["StartTag", "z", {"z": "&"}]]},
{"description":"\" after ampersand in single-quoted attribute value",
"input":"<z z='&\"'>",
"output":[["StartTag", "z", {"z": "&\""}]]},
{"description":"Text after bogus character reference",
"input":"<z z='&xlink_xmlns;'>bar<z>",
"output":[["StartTag","z",{"z":"&xlink_xmlns;"}],["Character","bar"],["StartTag","z",{}]]},
{"description":"Text after hex character reference",
"input":"<z z='&#x0020; foo'>bar<z>",
"output":[["StartTag","z",{"z":" foo"}],["Character","bar"],["StartTag","z",{}]]},
{"description":"Attribute name starting with \"",
"input":"<foo \"='bar'>",
"output":[["StartTag", "foo", {"\"": "bar"}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 6 }
]},
{"description":"Attribute name starting with '",
"input":"<foo '='bar'>",
"output":[["StartTag", "foo", {"'": "bar"}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 6 }
]},
{"description":"Attribute name containing \"",
"input":"<foo a\"b='bar'>",
"output":[["StartTag", "foo", {"a\"b": "bar"}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 7 }
]},
{"description":"Attribute name containing '",
"input":"<foo a'b='bar'>",
"output":[["StartTag", "foo", {"a'b": "bar"}]],
"errors":[
{ "code": "unexpected-character-in-attribute-name", "line": 1, "col": 7 }
]},
{"description":"Unquoted attribute value containing '",
"input":"<foo a=b'c>",
"output":[["StartTag", "foo", {"a": "b'c"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 9 }
]},
{"description":"Unquoted attribute value containing \"",
"input":"<foo a=b\"c>",
"output":[["StartTag", "foo", {"a": "b\"c"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 9 }
]},
{"description":"Double-quoted attribute value not followed by whitespace",
"input":"<foo a=\"b\"c>",
"output":[["StartTag", "foo", {"a": "b", "c": ""}]],
"errors":[
{ "code": "missing-whitespace-between-attributes", "line": 1, "col": 11 }
]},
{"description":"Single-quoted attribute value not followed by whitespace",
"input":"<foo a='b'c>",
"output":[["StartTag", "foo", {"a": "b", "c": ""}]],
"errors":[
{ "code": "missing-whitespace-between-attributes", "line": 1, "col": 11 }
]},
{"description":"Quoted attribute followed by permitted /",
"input":"<br a='b'/>",
"output":[["StartTag","br",{"a":"b"},true]]},
{"description":"Quoted attribute followed by non-permitted /",
"input":"<bar a='b'/>",
"output":[["StartTag","bar",{"a":"b"},true]]},
{"description":"CR EOF after doctype name",
"input":"<!doctype html \r",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "eof-in-doctype", "line": 2, "col": 1 }
]},
{"description":"CR EOF in tag name",
"input":"<z\r",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 2, "col": 1 }
]},
{"description":"Slash EOF in tag name",
"input":"<z/",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 4 }
]},
{"description":"Zero hex numeric entity",
"input":"&#x0",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 5 },
{ "code": "null-character-reference", "line": 1, "col": 5 }
]},
{"description":"Zero decimal numeric entity",
"input":"&#0",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "missing-semicolon-after-character-reference", "line": 1, "col": 4 },
{ "code": "null-character-reference", "line": 1, "col": 4 }
]},
{"description":"Zero-prefixed hex numeric entity",
"input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
"output":[["Character", "A"]]},
{"description":"Zero-prefixed decimal numeric entity",
"input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
"output":[["Character", "A"]]},
{"description":"Empty hex numeric entities",
"input":"&#x &#X ",
"output":[["Character", "&#x &#X "]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 4 },
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 8 }
]},
{"description":"Invalid digit in hex numeric entity",
"input":"&#xZ",
"output":[["Character", "&#xZ"]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 4 }
]},
{"description":"Empty decimal numeric entities",
"input":"&# &#; ",
"output":[["Character", "&# &#; "]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 3 },
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 6 }
]},
{"description":"Invalid digit in decimal numeric entity",
"input":"&#A",
"output":[["Character", "&#A"]],
"errors":[
{ "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 3 }
]},
{"description":"Non-BMP numeric entity",
"input":"&#x10000;",
"output":[["Character", "\uD800\uDC00"]]},
{"description":"Maximum non-BMP numeric entity",
"input":"&#X10FFFF;",
"output":[["Character", "\uDBFF\uDFFF"]],
"errors":[
{ "code": "noncharacter-character-reference", "line": 1, "col": 11 }
]},
{"description":"Above maximum numeric entity",
"input":"&#x110000;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 11 }
]},
{"description":"32-bit hex numeric entity",
"input":"&#x80000041;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 13 }
]},
{"description":"33-bit hex numeric entity",
"input":"&#x100000041;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 14 }
]},
{"description":"33-bit decimal numeric entity",
"input":"&#4294967361;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 14 }
]},
{"description":"65-bit hex numeric entity",
"input":"&#x10000000000000041;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 22 }
]},
{"description":"65-bit decimal numeric entity",
"input":"&#18446744073709551681;",
"output":[["Character", "\uFFFD"]],
"errors":[
{ "code": "character-reference-outside-unicode-range", "line": 1, "col": 24 }
]},
{"description":"Surrogate code point edge cases",
"input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
"output":[["Character", "\uD7FF\uFFFD\uFFFD\uFFFD\uFFFD\uE000"]],
"errors":[
{ "code": "surrogate-character-reference", "line": 1, "col": 17 },
{ "code": "surrogate-character-reference", "line": 1, "col": 25 },
{ "code": "surrogate-character-reference", "line": 1, "col": 33 },
{ "code": "surrogate-character-reference", "line": 1, "col": 41 }
]},
{"description":"Uppercase start tag name",
"input":"<X>",
"output":[["StartTag", "x", {}]]},
{"description":"Uppercase end tag name",
"input":"</X>",
"output":[["EndTag", "x"]]},
{"description":"Uppercase attribute name",
"input":"<x X>",
"output":[["StartTag", "x", { "x":"" }]]},
{"description":"Tag/attribute name case edge values",
"input":"<x@AZ[`az{ @AZ[`az{>",
"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
{"description":"Duplicate different-case attributes",
"input":"<x x=1 x=2 X=3>",
"output":[["StartTag", "x", { "x":"1" }]],
"errors":[
{ "code": "duplicate-attribute", "line": 1, "col": 9 },
{ "code": "duplicate-attribute", "line": 1, "col": 13 }
]},
{"description":"Uppercase close tag attributes",
"input":"</x X>",
"output":[["EndTag", "x"]],
"errors":[
{ "code": "end-tag-with-attributes", "line": 1, "col": 6 }
]},
{"description":"Duplicate close tag attributes",
"input":"</x x x>",
"output":[["EndTag", "x"]],
"errors":[
{ "code": "duplicate-attribute", "line": 1, "col": 8 },
{ "code": "end-tag-with-attributes", "line": 1, "col": 8 }
]},
{"description":"Permitted slash",
"input":"<br/>",
"output":[["StartTag","br",{},true]]},
{"description":"Non-permitted slash",
"input":"<xr/>",
"output":[["StartTag","xr",{},true]]},
{"description":"Permitted slash but in close tag",
"input":"</br/>",
"output":[["EndTag", "br"]],
"errors":[
{ "code": "end-tag-with-trailing-solidus", "line": 1, "col": 6 }
]},
{"description":"Doctype public case-sensitivity (1)",
"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
"output":[["DOCTYPE", "html", "AbC", "XyZ", true]]},
{"description":"Doctype public case-sensitivity (2)",
"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
"output":[["DOCTYPE", "html", "aBc", "xYz", true]]},
{"description":"Doctype system case-sensitivity (1)",
"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
"output":[["DOCTYPE", "html", null, "XyZ", true]]},
{"description":"Doctype system case-sensitivity (2)",
"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
"output":[["DOCTYPE", "html", null, "xYz", true]]},
{"description":"U+0000 in lookahead region after non-matching character",
"input":"<!doc>\u0000",
"output":[["Comment", "doc"], ["Character", "\u0000"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 },
{ "code": "unexpected-null-character", "line": 1, "col": 7 }
]},
{"description":"U+0000 in lookahead region",
"input":"<!doc\u0000",
"output":[["Comment", "doc\uFFFD"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 },
{ "code": "unexpected-null-character", "line": 1, "col": 6 }
]},
{"description":"U+0080 in lookahead region",
"input":"<!doc\u0080",
"output":[["Comment", "doc\u0080"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 },
{ "code": "control-character-in-input-stream", "line": 1, "col": 6 }
]},
{"description":"U+FDD1 in lookahead region",
"input":"<!doc\uFDD1",
"output":[["Comment", "doc\uFDD1"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 },
{ "code": "noncharacter-in-input-stream", "line": 1, "col": 6 }
]},
{"description":"U+1FFFF in lookahead region",
"input":"<!doc\uD83F\uDFFF",
"output":[["Comment", "doc\uD83F\uDFFF"]],
"errors":[
{ "code": "incorrectly-opened-comment", "line": 1, "col": 3 },
{ "code": "noncharacter-in-input-stream", "line": 1, "col": 6 }
]},
{"description":"CR followed by non-LF",
"input":"\r?",
"output":[["Character", "\n?"]]},
{"description":"CR at EOF",
"input":"\r",
"output":[["Character", "\n"]]},
{"description":"LF at EOF",
"input":"\n",
"output":[["Character", "\n"]]},
{"description":"CR LF",
"input":"\r\n",
"output":[["Character", "\n"]]},
{"description":"CR CR",
"input":"\r\r",
"output":[["Character", "\n\n"]]},
{"description":"LF LF",
"input":"\n\n",
"output":[["Character", "\n\n"]]},
{"description":"LF CR",
"input":"\n\r",
"output":[["Character", "\n\n"]]},
{"description":"text CR CR CR text",
"input":"text\r\r\rtext",
"output":[["Character", "text\n\n\ntext"]]},
{"description":"Doctype publik",
"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "invalid-character-sequence-after-doctype-name", "line": 1, "col": 16 }
]},
{"description":"Doctype publi",
"input":"<!DOCTYPE html PUBLI",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "invalid-character-sequence-after-doctype-name", "line": 1, "col": 16 }
]},
{"description":"Doctype sistem",
"input":"<!DOCTYPE html SISTEM \"AbC\">",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "invalid-character-sequence-after-doctype-name", "line": 1, "col": 16 }
]},
{"description":"Doctype sys",
"input":"<!DOCTYPE html SYS",
"output":[["DOCTYPE", "html", null, null, false]],
"errors":[
{ "code": "invalid-character-sequence-after-doctype-name", "line": 1, "col": 16 }
]},
{"description":"Doctype html x>text",
"input":"<!DOCTYPE html x>text",
"output":[["DOCTYPE", "html", null, null, false], ["Character", "text"]],
"errors":[
{ "code": "invalid-character-sequence-after-doctype-name", "line": 1, "col": 16 }
]},
{"description":"Grave accent in unquoted attribute",
"input":"<a a=aa`>",
"output":[["StartTag", "a", {"a":"aa`"}]],
"errors":[
{ "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 8 }
]},
{"description":"EOF in tag name state ",
"input":"<a",
"output":[],
"errors": [
{ "code": "eof-in-tag", "line": 1, "col": 3 }
]},
{"description":"EOF in before attribute name state",
"input":"<a ",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 4 }
]},
{"description":"EOF in attribute name state",
"input":"<a a",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 5 }
]},
{"description":"EOF in after attribute name state",
"input":"<a a ",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 6 }
]},
{"description":"EOF in before attribute value state",
"input":"<a a =",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 7 }
]},
{"description":"EOF in attribute value (double quoted) state",
"input":"<a a =\"a",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 9 }
]},
{"description":"EOF in attribute value (single quoted) state",
"input":"<a a ='a",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 9 }
]},
{"description":"EOF in attribute value (unquoted) state",
"input":"<a a =a",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 8 }
]},
{"description":"EOF in after attribute value state",
"input":"<a a ='a'",
"output":[],
"errors":[
{ "code": "eof-in-tag", "line": 1, "col": 10 }
]}
]}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,41 @@
{"tests" : [
{"description": "Invalid Unicode character U+DFFF",
"doubleEscaped":true,
"input": "\\uDFFF",
"output":[["Character", "\\uDFFF"]],
"errors":[
{ "code": "surrogate-in-input-stream", "line": 1, "col": 1 }
]},
{"description": "Invalid Unicode character U+D800",
"doubleEscaped":true,
"input": "\\uD800",
"output":[["Character", "\\uD800"]],
"errors":[
{ "code": "surrogate-in-input-stream", "line": 1, "col": 1 }
]},
{"description": "Invalid Unicode character U+DFFF with valid preceding character",
"doubleEscaped":true,
"input": "a\\uDFFF",
"output":[["Character", "a\\uDFFF"]],
"errors":[
{ "code": "surrogate-in-input-stream", "line": 1, "col": 2 }
]},
{"description": "Invalid Unicode character U+D800 with valid following character",
"doubleEscaped":true,
"input": "\\uD800a",
"output":[["Character", "\\uD800a"]],
"errors":[
{ "code": "surrogate-in-input-stream", "line": 1, "col": 1 }
]},
{"description":"CR followed by U+0000",
"input":"\r\u0000",
"output":[["Character", "\n\u0000"]],
"errors":[
{ "code": "unexpected-null-character", "line": 2, "col": 1 }
]}
]
}

View file

@ -0,0 +1,20 @@
{"xmlViolationTests": [
{"description":"Non-XML character",
"input":"a\uFFFFb",
"output":[["Character","a\uFFFDb"]]},
{"description":"Non-XML space",
"input":"a\u000Cb",
"output":[["Character","a b"]]},
{"description":"Double hyphen in comment",
"input":"<!-- foo -- bar -->",
"output":[["Comment"," foo - - bar "]]},
{"description":"FF between attributes",
"input":"<a b=''\u000Cc=''>",
"output":[["StartTag","a",{"b":"","c":""}]]}
]}

View file

@ -0,0 +1,108 @@
Tree Construction Tests
=======================
Each file containing tree construction tests consists of any number of
tests separated by two newlines (LF) and a single newline before the end
of the file. For instance:
```
[TEST]LF
LF
[TEST]LF
LF
[TEST]LF
```
Where [TEST] is the following format:
Each test must begin with a string "\#data" followed by a newline (LF).
All subsequent lines until a line that says "\#errors" are the test data
and must be passed to the system being tested unchanged, except with the
final newline (on the last line) removed.
Then there must be a line that says "\#errors". It must be followed by
one line per parse error that a conformant checker would return. It
doesn't matter what those lines are, although they can't be
"\#new-errors", "\#document-fragment", "\#document", "\#script-off",
"\#script-on", or empty; the only thing that matters is that there be
the right number of parse errors.
Then there \*may\* be a line that says "\#new-errors", which works like
the "\#errors" section adding more errors to the expected number of
errors.
Then there \*may\* be a line that says "\#document-fragment", which must
be followed by a newline (LF), followed by a string of characters that
indicates the context element, followed by a newline (LF). If the string
of characters starts with "svg ", the context element is in the SVG
namespace and the substring after "svg " is the local name. If the
string of characters starts with "math ", the context element is in the
MathML namespace and the substring after "math " is the local name.
Otherwise, the context element is in the HTML namespace and the string
is the local name. If this line is present the "\#data" must be parsed
using the HTML fragment parsing algorithm with the context element as
context.
Then there \*may\* be a line that says "\#script-off" or
"\#script-on". If a line that says "\#script-off" is present, the
parser must set the scripting flag to disabled. If a line that says
"\#script-on" is present, it must set it to enabled. Otherwise, the
test should be run in both modes.
Then there must be a line that says "\#document", which must be followed
by a dump of the tree of the parsed DOM. Each node must be represented
by a single line. Each line must start with "| ", followed by two spaces
per parent node that the node has before the root document node.
- Element nodes must be represented by a "`<`" then the *tag name
string* "`>`", and all the attributes must be given, sorted
lexicographically by UTF-16 code unit according to their *attribute
name string*, on subsequent lines, as if they were children of the
element node.
- Attribute nodes must have the *attribute name string*, then an "="
sign, then the attribute value in double quotes (").
- Text nodes must be the string, in double quotes. Newlines aren't
escaped.
- Comments must be "`<`" then "`!-- `" then the data then "` -->`".
- DOCTYPEs must be "`<!DOCTYPE `" then the name then if either of the
system id or public id is non-empty a space, public id in
double-quotes, another space and the system id in double-quotes, and
then in any case "`>`".
- Processing instructions must be "`<?`", then the target, then a
space, then the data and then "`>`". (The HTML parser cannot emit
processing instructions, but scripts can, and the WebVTT to DOM
rules can emit them.)
- Template contents are represented by the string "content" with the
children below it.
The *tag name string* is the local name prefixed by a namespace
designator. For the HTML namespace, the namespace designator is the
empty string, i.e. there's no prefix. For the SVG namespace, the
namespace designator is "svg ". For the MathML namespace, the namespace
designator is "math ".
The *attribute name string* is the local name prefixed by a namespace
designator. For no namespace, the namespace designator is the empty
string, i.e. there's no prefix. For the XLink namespace, the namespace
designator is "xlink ". For the XML namespace, the namespace designator
is "xml ". For the XMLNS namespace, the namespace designator is "xmlns
". Note the difference between "xlink:href" which is an attribute in no
namespace with the local name "xlink:href" and "xlink href" which is an
attribute in the xlink namespace with the local name "href".
If there is also a "\#document-fragment" the bit following "\#document"
must be a representation of the HTML fragment serialization for the
context element given by "\#document-fragment".
For example:
```
#data
<p>One<p>Two
#errors
3: Missing document type declaration
#document
| <html>
|   <head>
|   <body>
|     <p>
|       "One"
|     <p>
|       "Two"
```

View file

@ -0,0 +1,354 @@
#data
<a><p></a></p>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,10): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| <p>
| <a>
#data
<a>1<p>2</a>3</p>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,12): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <p>
| <a>
| "2"
| "3"
#data
<a>1<button>2</a>3</button>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,17): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <button>
| <a>
| "2"
| "3"
#data
<a>1<b>2</a>3</b>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,12): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <b>
| "2"
| <b>
| "3"
#data
<a>1<div>2<div>3</a>4</div>5</div>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,20): adoption-agency-1.3
(1,20): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <div>
| <a>
| "2"
| <div>
| <a>
| "3"
| "4"
| "5"
#data
<table><a>1<p>2</a>3</p>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,10): unexpected-start-tag-implies-table-voodoo
(1,11): unexpected-character-implies-table-voodoo
(1,14): unexpected-start-tag-implies-table-voodoo
(1,15): unexpected-character-implies-table-voodoo
(1,19): unexpected-end-tag-implies-table-voodoo
(1,19): adoption-agency-1.3
(1,20): unexpected-character-implies-table-voodoo
(1,24): unexpected-end-tag-implies-table-voodoo
(1,24): eof-in-table
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <p>
| <a>
| "2"
| "3"
| <table>
#data
<b><b><a><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| <b>
| <a>
| <p>
| <a>
#data
<b><a><b><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| <a>
| <b>
| <b>
| <p>
| <a>
#data
<a><b><b><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <a>
| <b>
| <b>
| <b>
| <b>
| <p>
| <a>
#data
<p>1<s id="A">2<b id="B">3</p>4</s>5</b>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,30): unexpected-end-tag
(1,35): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <p>
| "1"
| <s>
| id="A"
| "2"
| <b>
| id="B"
| "3"
| <s>
| id="A"
| <b>
| id="B"
| "4"
| <b>
| id="B"
| "5"
#data
<table><a>1<td>2</td>3</table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,10): unexpected-start-tag-implies-table-voodoo
(1,11): unexpected-character-implies-table-voodoo
(1,15): unexpected-cell-in-table-body
(1,30): unexpected-implied-end-tag-in-table-view
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <a>
| "3"
| <table>
| <tbody>
| <tr>
| <td>
| "2"
#data
<table>A<td>B</td>C</table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,8): unexpected-character-implies-table-voodoo
(1,12): unexpected-cell-in-table-body
(1,22): unexpected-character-implies-table-voodoo
#document
| <html>
| <head>
| <body>
| "AC"
| <table>
| <tbody>
| <tr>
| <td>
| "B"
#data
<a><svg><tr><input></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,23): unexpected-end-tag
(1,23): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| <svg svg>
| <svg tr>
| <svg input>
#data
<div><a><b><div><div><div><div><div><div><div><div><div><div></a>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <a>
| <b>
| <b>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <div>
#data
<div><a><b><u><i><code><div></a>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,32): adoption-agency-1.3
(1,32): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <a>
| <b>
| <u>
| <i>
| <code>
| <u>
| <i>
| <code>
| <div>
| <a>
#data
<b><b><b><b>x</b></b></b></b>y
#errors
(1,3): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <b>
| <b>
| <b>
| <b>
| "x"
| "y"
#data
<p><b><b><b><b><p>x
#errors
(1,3): expected-doctype-but-got-start-tag
(1,18): unexpected-end-tag
(1,19): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| <b>
| <b>
| <b>
| <p>
| <b>
| <b>
| <b>
| "x"
#data
<b><em><foo><foob><fooc><aside></b></em>
#errors
(1,35): adoption-agency-1.3
(1,40): adoption-agency-1.3
(1,40): expected-closing-tag-but-got-eof
#document-fragment
div
#document
| <b>
| <em>
| <foo>
| <foob>
| <fooc>
| <aside>
| <b>

View file

@ -0,0 +1,39 @@
#data
<b>1<i>2<p>3</b>4
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,17): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| "1"
| <i>
| "2"
| <i>
| <p>
| <b>
| "3"
| "4"
#data
<a><div><style></style><address><a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,35): unexpected-start-tag-implies-end-tag
(1,35): adoption-agency-1.3
(1,35): adoption-agency-1.3
(1,35): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <a>
| <div>
| <a>
| <style>
| <address>
| <a>
| <a>

View file

@ -0,0 +1,719 @@
#data
<!doctype html><p>foo<address>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “address”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <address>
| "bar"
| <p>
| "baz"
#data
<!doctype html><address><p>foo</address>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <address>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<article>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “article”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <article>
| "bar"
| <p>
| "baz"
#data
<!doctype html><article><p>foo</article>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <article>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<aside>bar<p>baz
#errors
(1,37): expected-closing-tag-but-got-eof
28: Unclosed element “aside”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <aside>
| "bar"
| <p>
| "baz"
#data
<!doctype html><aside><p>foo</aside>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <aside>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<blockquote>bar<p>baz
#errors
(1,42): expected-closing-tag-but-got-eof
33: Unclosed element “blockquote”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <blockquote>
| "bar"
| <p>
| "baz"
#data
<!doctype html><blockquote><p>foo</blockquote>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <blockquote>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<center>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “center”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <center>
| "bar"
| <p>
| "baz"
#data
<!doctype html><center><p>foo</center>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <center>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<details>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “details”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <details>
| "bar"
| <p>
| "baz"
#data
<!doctype html><details><p>foo</details>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <details>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<dialog>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “dialog”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <dialog>
| "bar"
| <p>
| "baz"
#data
<!doctype html><dialog><p>foo</dialog>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <dialog>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<dir>bar<p>baz
#errors
(1,35): expected-closing-tag-but-got-eof
26: Unclosed element “dir”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <dir>
| "bar"
| <p>
| "baz"
#data
<!doctype html><dir><p>foo</dir>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <dir>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<div>bar<p>baz
#errors
(1,35): expected-closing-tag-but-got-eof
26: Unclosed element “div”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <div>
| "bar"
| <p>
| "baz"
#data
<!doctype html><div><p>foo</div>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <div>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<dl>bar<p>baz
#errors
(1,34): expected-closing-tag-but-got-eof
25: Unclosed element “dl”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <dl>
| "bar"
| <p>
| "baz"
#data
<!doctype html><dl><p>foo</dl>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <dl>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<fieldset>bar<p>baz
#errors
(1,40): expected-closing-tag-but-got-eof
31: Unclosed element “fieldset”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <fieldset>
| "bar"
| <p>
| "baz"
#data
<!doctype html><fieldset><p>foo</fieldset>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <fieldset>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<figcaption>bar<p>baz
#errors
(1,42): expected-closing-tag-but-got-eof
33: Unclosed element “figcaption”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <figcaption>
| "bar"
| <p>
| "baz"
#data
<!doctype html><figcaption><p>foo</figcaption>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <figcaption>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<figure>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “figure”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <figure>
| "bar"
| <p>
| "baz"
#data
<!doctype html><figure><p>foo</figure>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <figure>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<footer>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “footer”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <footer>
| "bar"
| <p>
| "baz"
#data
<!doctype html><footer><p>foo</footer>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <footer>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<header>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “header”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <header>
| "bar"
| <p>
| "baz"
#data
<!doctype html><header><p>foo</header>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <header>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<hgroup>bar<p>baz
#errors
(1,38): expected-closing-tag-but-got-eof
29: Unclosed element “hgroup”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <hgroup>
| "bar"
| <p>
| "baz"
#data
<!doctype html><hgroup><p>foo</hgroup>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <hgroup>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<listing>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “listing”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <listing>
| "bar"
| <p>
| "baz"
#data
<!doctype html><listing><p>foo</listing>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <listing>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<menu>bar<p>baz
#errors
(1,36): expected-closing-tag-but-got-eof
27: Unclosed element “menu”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <menu>
| "bar"
| <p>
| "baz"
#data
<!doctype html><menu><p>foo</menu>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menu>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<nav>bar<p>baz
#errors
(1,35): expected-closing-tag-but-got-eof
26: Unclosed element “nav”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <nav>
| "bar"
| <p>
| "baz"
#data
<!doctype html><nav><p>foo</nav>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <nav>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<ol>bar<p>baz
#errors
(1,34): expected-closing-tag-but-got-eof
25: Unclosed element “ol”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <ol>
| "bar"
| <p>
| "baz"
#data
<!doctype html><ol><p>foo</ol>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <ol>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<pre>bar<p>baz
#errors
(1,35): expected-closing-tag-but-got-eof
26: Unclosed element “pre”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <pre>
| "bar"
| <p>
| "baz"
#data
<!doctype html><pre><p>foo</pre>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <pre>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<section>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “section”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <section>
| "bar"
| <p>
| "baz"
#data
<!doctype html><section><p>foo</section>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <section>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<summary>bar<p>baz
#errors
(1,39): expected-closing-tag-but-got-eof
30: Unclosed element “summary”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <summary>
| "bar"
| <p>
| "baz"
#data
<!doctype html><summary><p>foo</summary>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <summary>
| <p>
| "foo"
| "bar"
#data
<!doctype html><p>foo<ul>bar<p>baz
#errors
(1,34): expected-closing-tag-but-got-eof
25: Unclosed element “ul”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <ul>
| "bar"
| <p>
| "baz"
#data
<!doctype html><ul><p>foo</ul>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <ul>
| <p>
| "foo"
| "bar"

View file

@ -0,0 +1,224 @@
#data
FOO<!-- BAR -->BAZ
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -->
| "BAZ"
#data
FOO<!-- BAR --!>BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-bang-after-double-dash-in-comment
#new-errors
(1:16) incorrectly-closed-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -->
| "BAZ"
#data
FOO<!-- BAR --! >BAZ
#errors
(1,3): expected-doctype-but-got-chars
#new-errors
(1:20) eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR --! >BAZ -->
#data
FOO<!-- BAR --!
>BAZ
#errors
(1,3): expected-doctype-but-got-chars
#new-errors
(1:20) eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR --!
>BAZ -->
#data
FOO<!-- BAR -- >BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,21): eof-in-comment
#new-errors
(1:22) eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- >BAZ -->
#data
FOO<!-- BAR -- <QUX> -- MUX -->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -->
| "BAZ"
#data
FOO<!-- BAR -- <QUX> -- MUX --!>BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
(1,31): unexpected-bang-after-double-dash-in-comment
#new-errors
(1:32) incorrectly-closed-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -->
| "BAZ"
#data
FOO<!-- BAR -- <QUX> -- MUX -- >BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
(1,31): unexpected-char-in-comment
(1,35): eof-in-comment
#new-errors
(1:36) eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -- >BAZ -->
#data
FOO<!---->BAZ
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
FOO<!--->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,9): incorrect-comment
#new-errors
(1:9) abrupt-closing-of-empty-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
FOO<!-->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,8): incorrect-comment
#new-errors
(1:8) abrupt-closing-of-empty-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
<?xml version="1.0">Hi
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,22): expected-doctype-but-got-chars
#new-errors
(1:2) unexpected-question-mark-instead-of-tag-name
#document
| <!-- ?xml version="1.0" -->
| <html>
| <head>
| <body>
| "Hi"
#data
<?xml version="1.0">
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,20): expected-doctype-but-got-eof
#new-errors
(1:2) unexpected-question-mark-instead-of-tag-name
#document
| <!-- ?xml version="1.0" -->
| <html>
| <head>
| <body>
#data
<?xml version
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,13): expected-doctype-but-got-eof
#new-errors
(1:2) unexpected-question-mark-instead-of-tag-name
#document
| <!-- ?xml version -->
| <html>
| <head>
| <body>
#data
FOO<!----->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,10): unexpected-dash-after-double-dash-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- - -->
| "BAZ"
#data
<html><!-- comment --><title>Comment before head</title>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <!-- comment -->
| <head>
| <title>
| "Comment before head"
| <body>

View file

@ -0,0 +1,470 @@
#data
<!DOCTYPE html>Hello
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!dOctYpE HtMl>Hello
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPEhtml>Hello
#errors
(1,9): need-space-after-doctype
#new-errors
(1:10) missing-whitespace-before-doctype-name
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE>Hello
#errors
(1,9): need-space-after-doctype
(1,10): expected-doctype-name-but-got-right-bracket
(1,10): unknown-doctype
#new-errors
(1:10) missing-doctype-name
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE >Hello
#errors
(1,11): expected-doctype-name-but-got-right-bracket
(1,11): unknown-doctype
#new-errors
(1:11) missing-doctype-name
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato>Hello
#errors
(1,17): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato >Hello
#errors
(1,18): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato taco>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,22): unknown-doctype
#new-errors
(1:18) invalid-character-sequence-after-doctype-name
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato taco "ddd>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,27): unknown-doctype
#new-errors
(1:18) invalid-character-sequence-after-doctype-name
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM>Hello
#errors
(1,24): unexpected-char-in-doctype
(1,24): unknown-doctype
#new-errors
(1:24) missing-doctype-system-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM >Hello
#errors
(1,28): unexpected-char-in-doctype
(1,28): unknown-doctype
#new-errors
(1:28) missing-doctype-system-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM ggg>Hello
#errors
(1,34): unexpected-char-in-doctype
(1,37): unknown-doctype
#new-errors
(1:34) missing-quote-before-doctype-system-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM taco >Hello
#errors
(1,25): unexpected-char-in-doctype
(1,31): unknown-doctype
#new-errors
(1:25) missing-quote-before-doctype-system-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM 'taco"'>Hello
#errors
(1,32): unknown-doctype
#document
| <!DOCTYPE potato "" "taco"">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM "taco">Hello
#errors
(1,31): unknown-doctype
#document
| <!DOCTYPE potato "" "taco">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM "tai'co">Hello
#errors
(1,33): unknown-doctype
#document
| <!DOCTYPE potato "" "tai'co">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEMtaco "ddd">Hello
#errors
(1,24): unexpected-char-in-doctype
(1,34): unknown-doctype
#new-errors
(1:24) missing-quote-before-doctype-system-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato grass SYSTEM taco>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,35): unknown-doctype
#new-errors
(1:18) invalid-character-sequence-after-doctype-name
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIc>Hello
#errors
(1,24): unexpected-end-of-doctype
(1,24): unknown-doctype
#new-errors
(1:24) missing-doctype-public-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIc >Hello
#errors
(1,25): unexpected-end-of-doctype
(1,25): unknown-doctype
#new-errors
(1:25) missing-doctype-public-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIcgoof>Hello
#errors
(1,24): unexpected-char-in-doctype
(1,28): unknown-doctype
#new-errors
(1:24) missing-quote-before-doctype-public-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC goof>Hello
#errors
(1,25): unexpected-char-in-doctype
(1,29): unknown-doctype
#new-errors
(1:25) missing-quote-before-doctype-public-identifier
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC "go'of">Hello
#errors
(1,32): unknown-doctype
#document
| <!DOCTYPE potato "go'of" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC 'go'of'>Hello
#errors
(1,29): unexpected-char-in-doctype
(1,32): unknown-doctype
#new-errors
(1:29) missing-quote-before-doctype-system-identifier
#document
| <!DOCTYPE potato "go" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC 'go:hh of' >Hello
#errors
(1,38): unknown-doctype
#document
| <!DOCTYPE potato "go:hh of" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC "W3C-//dfdf" SYSTEM ggg>Hello
#errors
(1,38): unexpected-char-in-doctype
(1,48): unknown-doctype
#new-errors
(1:38) missing-quote-before-doctype-system-identifier
#document
| <!DOCTYPE potato "W3C-//dfdf" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">Hello
#errors
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE ...>Hello
#errors
(1,14): unknown-doctype
#document
| <!DOCTYPE ...>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
#errors
(2,58): unknown-doctype
#document
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
#errors
(2,54): unknown-doctype
#document
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE root-element [SYSTEM OR PUBLIC FPI] "uri" [
<!-- internal declarations -->
]>
#errors
(1,23): expected-space-or-right-bracket-in-doctype
(2,30): unknown-doctype
#new-errors
(1:24) invalid-character-sequence-after-doctype-name
#document
| <!DOCTYPE root-element>
| <html>
| <head>
| <body>
| "]>"
#data
<!DOCTYPE html PUBLIC
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
"http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
#errors
(3,53): unknown-doctype
#document
| <!DOCTYPE html "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"><body><b>Mine!</b></body>
#errors
(1,63): unknown-doctype
#document
| <!DOCTYPE html "" "http://www.w3.org/DTD/HTML4-strict.dtd">
| <html>
| <head>
| <body>
| <b>
| "Mine!"
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">
#errors
(1,50): unexpected-char-in-doctype
#new-errors
(1:50) missing-whitespace-between-doctype-public-and-system-identifiers
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,50): unexpected-char-in-doctype
#new-errors
(1:50) missing-whitespace-between-doctype-public-and-system-identifiers
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,21): unexpected-char-in-doctype
(1,49): unexpected-char-in-doctype
#new-errors
(1:22) missing-whitespace-after-doctype-public-keyword
(1:49) missing-whitespace-between-doctype-public-and-system-identifiers
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,21): unexpected-char-in-doctype
(1,49): unexpected-char-in-doctype
#new-errors
(1:22) missing-whitespace-after-doctype-public-keyword
(1:49) missing-whitespace-between-doctype-public-and-system-identifiers
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>

Binary file not shown.

View file

@ -0,0 +1,943 @@
#data
FOO&gt;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO>BAR"
#data
FOO&gtBAR
#errors
(1,3): expected-doctype-but-got-chars
(1,6): named-entity-without-semicolon
#new-errors
(1:7) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOO>BAR"
#data
FOO&gt BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,6): named-entity-without-semicolon
#new-errors
(1:7) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOO> BAR"
#data
FOO&gt;;;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO>;;BAR"
#data
I'm &notit; I tell you
#errors
(1,4): expected-doctype-but-got-chars
(1,9): named-entity-without-semicolon
#new-errors
(1:9) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "I'm ¬it; I tell you"
#data
I'm &notin; I tell you
#errors
(1,4): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "I'm ∉ I tell you"
#data
&ammmp;
#errors
(1,1): expected-doctype-but-got-chars
(1,7): unknown-named-character-reference
#new-errors
(1:7) unknown-named-character-reference
#document
| <html>
| <head>
| <body>
| "&ammmp;"
#data
&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;
#errors
(1,1): expected-doctype-but-got-chars
(1,950): unknown-named-character-reference
#new-errors
(1:950) unknown-named-character-reference
#document
| <html>
| <head>
| <body>
| "&ammmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmp;"
#data
FOO& BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO& BAR"
#data
FOO&<BAR>
#errors
(1,3): expected-doctype-but-got-chars
(1,9): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| "FOO&"
| <bar>
#data
FOO&&&&gt;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO&&&>BAR"
#data
FOO&#41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO)BAR"
#data
FOO&#x41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOABAR"
#data
FOO&#X41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOABAR"
#data
FOO&#BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,5): expected-numeric-entity
#new-errors
(1:6) absence-of-digits-in-numeric-character-reference
#document
| <html>
| <head>
| <body>
| "FOO&#BAR"
#data
FOO&#ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,5): expected-numeric-entity
#new-errors
(1:6) absence-of-digits-in-numeric-character-reference
#document
| <html>
| <head>
| <body>
| "FOO&#ZOO"
#data
FOO&#xBAR
#errors
(1,3): expected-doctype-but-got-chars
(1,7): expected-numeric-entity
#new-errors
(1:9) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOOºR"
#data
FOO&#xZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,6): expected-numeric-entity
#new-errors
(1:7) absence-of-digits-in-numeric-character-reference
#document
| <html>
| <head>
| <body>
| "FOO&#xZOO"
#data
FOO&#XZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,6): expected-numeric-entity
#new-errors
(1:7) absence-of-digits-in-numeric-character-reference
#document
| <html>
| <head>
| <body>
| "FOO&#XZOO"
#data
FOO&#41BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,7): numeric-entity-without-semicolon
#new-errors
(1:8) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOO)BAR"
#data
FOO&#x41BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,10): numeric-entity-without-semicolon
#new-errors
(1:11) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOO䆺R"
#data
FOO&#x41ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,8): numeric-entity-without-semicolon
#new-errors
(1:9) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| "FOOAZOO"
#data
FOO&#x0000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) null-character-reference
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#x0078;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOxZOO"
#data
FOO&#x0079;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOyZOO"
#data
FOO&#x0080;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO€ZOO"
#data
FOO&#x0081;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0082;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0083;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOƒZOO"
#data
FOO&#x0084;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO„ZOO"
#data
FOO&#x0085;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO…ZOO"
#data
FOO&#x0086;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO†ZOO"
#data
FOO&#x0087;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO‡ZOO"
#data
FOO&#x0088;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOˆZOO"
#data
FOO&#x0089;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO‰ZOO"
#data
FOO&#x008A;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOŠZOO"
#data
FOO&#x008B;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x008C;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOŒZOO"
#data
FOO&#x008D;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x008E;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOŽZOO"
#data
FOO&#x008F;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0090;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0091;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0092;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0093;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO“ZOO"
#data
FOO&#x0094;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO”ZOO"
#data
FOO&#x0095;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO•ZOO"
#data
FOO&#x0096;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0097;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO—ZOO"
#data
FOO&#x0098;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO˜ZOO"
#data
FOO&#x0099;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOO™ZOO"
#data
FOO&#x009A;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOšZOO"
#data
FOO&#x009B;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x009C;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOœZOO"
#data
FOO&#x009D;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x009E;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOžZOO"
#data
FOO&#x009F;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) control-character-reference
#document
| <html>
| <head>
| <body>
| "FOOŸZOO"
#data
FOO&#x00A0;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO ZOO"
#data
FOO&#xD7FF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO퟿ZOO"
#data
FOO&#xD800;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) surrogate-character-reference
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xD801;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) surrogate-character-reference
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xDFFE;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) surrogate-character-reference
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xDFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#new-errors
(1:12) surrogate-character-reference
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xE000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x10FFFE;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#new-errors
(1:14) noncharacter-character-reference
#document
| <html>
| <head>
| <body>
| "FOO􏿾ZOO"
#data
FOO&#x1087D4;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO􈟔ZOO"
#data
FOO&#x10FFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#new-errors
(1:14) noncharacter-character-reference
#document
| <html>
| <head>
| <body>
| "FOO􏿿ZOO"
#data
FOO&#x110000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#new-errors
(1:14) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xFFFFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#new-errors
(1:14) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#11111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#new-errors
(1:17) missing-semicolon-after-character-reference
(1:17) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#1111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#new-errors
(1:16) missing-semicolon-after-character-reference
(1:16) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#111111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#new-errors
(1:18) missing-semicolon-after-character-reference
(1:18) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#11111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,16): numeric-entity-without-semicolon
(1,16): illegal-codepoint-for-numeric-entity
#new-errors
(1:17) missing-semicolon-after-character-reference
(1:17) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#1111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,15): numeric-entity-without-semicolon
(1,15): illegal-codepoint-for-numeric-entity
#new-errors
(1:16) missing-semicolon-after-character-reference
(1:16) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#111111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,17): numeric-entity-without-semicolon
(1,17): illegal-codepoint-for-numeric-entity
#new-errors
(1:18) missing-semicolon-after-character-reference
(1:18) character-reference-outside-unicode-range
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"

View file

@ -0,0 +1,309 @@
#data
<div bar="ZZ&gt;YY"></div>
#errors
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ>YY"
#data
<div bar="ZZ&"></div>
#errors
(1,15): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&"
#data
<div bar='ZZ&'></div>
#errors
(1,15): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&"
#data
<div bar=ZZ&></div>
#errors
(1,13): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&"
#data
<div bar="ZZ&gt=YY"></div>
#errors
(1,15): named-entity-without-semicolon
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&gt=YY"
#data
<div bar="ZZ&gt0YY"></div>
#errors
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&gt0YY"
#data
<div bar="ZZ&gt9YY"></div>
#errors
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&gt9YY"
#data
<div bar="ZZ&gtaYY"></div>
#errors
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&gtaYY"
#data
<div bar="ZZ&gtZYY"></div>
#errors
(1,20): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&gtZYY"
#data
<div bar="ZZ&gt YY"></div>
#errors
(1,15): named-entity-without-semicolon
(1,20): expected-doctype-but-got-start-tag
#new-errors
(1:16) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ> YY"
#data
<div bar="ZZ&gt"></div>
#errors
(1,15): named-entity-without-semicolon
(1,17): expected-doctype-but-got-start-tag
#new-errors
(1:16) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ>"
#data
<div bar='ZZ&gt'></div>
#errors
(1,15): named-entity-without-semicolon
(1,17): expected-doctype-but-got-start-tag
#new-errors
(1:16) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ>"
#data
<div bar=ZZ&gt></div>
#errors
(1,14): named-entity-without-semicolon
(1,15): expected-doctype-but-got-start-tag
#new-errors
(1:15) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ>"
#data
<div bar="ZZ&pound_id=23"></div>
#errors
(1,18): named-entity-without-semicolon
(1,26): expected-doctype-but-got-start-tag
#new-errors
(1:19) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ£_id=23"
#data
<div bar="ZZ&prod_id=23"></div>
#errors
(1,25): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&prod_id=23"
#data
<div bar="ZZ&pound;_id=23"></div>
#errors
(1,27): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ£_id=23"
#data
<div bar="ZZ&prod;_id=23"></div>
#errors
(1,26): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ∏_id=23"
#data
<div bar="ZZ&pound=23"></div>
#errors
(1,18): named-entity-without-semicolon
(1,23): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&pound=23"
#data
<div bar="ZZ&prod=23"></div>
#errors
(1,22): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| bar="ZZ&prod=23"
#data
<div>ZZ&pound_id=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,13): named-entity-without-semicolon
#new-errors
(1:14) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ£_id=23"
#data
<div>ZZ&prod_id=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ&prod_id=23"
#data
<div>ZZ&pound;_id=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ£_id=23"
#data
<div>ZZ&prod;_id=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ∏_id=23"
#data
<div>ZZ&pound=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,13): named-entity-without-semicolon
#new-errors
(1:14) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ£=23"
#data
<div>ZZ&prod=23</div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| "ZZ&prod=23"
#data
<div>ZZ&AElig=</div>
#errors
#new-errors
(1:14) missing-semicolon-after-character-reference
#document
| <html>
| <head>
| <body>
| <div>
| "ZZÆ="

View file

@ -0,0 +1,559 @@
#data
<nobr>X
#errors
6: HTML start tag “nobr” in a foreign namespace context.
7: End of file seen and there were open elements.
6: Unclosed element “nobr”.
#document-fragment
svg path
#document
| <svg nobr>
| "X"
#data
<font color></font>X
#errors
12: HTML start tag “font” in a foreign namespace context.
#document-fragment
svg path
#document
| <svg font>
| color=""
| "X"
#data
<font></font>X
#errors
#document-fragment
svg path
#document
| <svg font>
| "X"
#data
<g></path>X
#errors
10: End tag “path” did not match the name of the current open element (“g”).
11: End of file seen and there were open elements.
3: Unclosed element “g”.
#document-fragment
svg path
#document
| <svg g>
| "X"
#data
</path>X
#errors
5: Stray end tag “path”.
#document-fragment
svg path
#document
| "X"
#data
</foreignObject>X
#errors
5: Stray end tag “foreignobject”.
#document-fragment
svg foreignObject
#document
| "X"
#data
</desc>X
#errors
5: Stray end tag “desc”.
#document-fragment
svg desc
#document
| "X"
#data
</title>X
#errors
5: Stray end tag “title”.
#document-fragment
svg title
#document
| "X"
#data
</svg>X
#errors
5: Stray end tag “svg”.
#document-fragment
svg svg
#document
| "X"
#data
</mfenced>X
#errors
5: Stray end tag “mfenced”.
#document-fragment
math mfenced
#document
| "X"
#data
</malignmark>X
#errors
5: Stray end tag “malignmark”.
#document-fragment
math malignmark
#document
| "X"
#data
</math>X
#errors
5: Stray end tag “math”.
#document-fragment
math math
#document
| "X"
#data
</annotation-xml>X
#errors
5: Stray end tag “annotation-xml”.
#document-fragment
math annotation-xml
#document
| "X"
#data
</mtext>X
#errors
5: Stray end tag “mtext”.
#document-fragment
math mtext
#document
| "X"
#data
</mi>X
#errors
5: Stray end tag “mi”.
#document-fragment
math mi
#document
| "X"
#data
</mo>X
#errors
5: Stray end tag “mo”.
#document-fragment
math mo
#document
| "X"
#data
</mn>X
#errors
5: Stray end tag “mn”.
#document-fragment
math mn
#document
| "X"
#data
</ms>X
#errors
5: Stray end tag “ms”.
#document-fragment
math ms
#document
| "X"
#data
<b></b><mglyph/><i></i><malignmark/><u></u><ms/>X
#errors
51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag.
52: End of file seen and there were open elements.
51: Unclosed element “ms”.
#new-errors
(1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus
#document-fragment
math ms
#document
| <b>
| <math mglyph>
| <i>
| <math malignmark>
| <u>
| <ms>
| "X"
#data
<malignmark></malignmark>
#errors
#document-fragment
math ms
#document
| <math malignmark>
#data
<div></div>
#errors
#document-fragment
math ms
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
math ms
#document
| <figure>
#data
<b></b><mglyph/><i></i><malignmark/><u></u><mn/>X
#errors
51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag.
52: End of file seen and there were open elements.
51: Unclosed element “mn”.
#new-errors
(1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus
#document-fragment
math mn
#document
| <b>
| <math mglyph>
| <i>
| <math malignmark>
| <u>
| <mn>
| "X"
#data
<malignmark></malignmark>
#errors
#document-fragment
math mn
#document
| <math malignmark>
#data
<div></div>
#errors
#document-fragment
math mn
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
math mn
#document
| <figure>
#data
<b></b><mglyph/><i></i><malignmark/><u></u><mo/>X
#errors
51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag.
52: End of file seen and there were open elements.
51: Unclosed element “mo”.
#new-errors
(1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus
#document-fragment
math mo
#document
| <b>
| <math mglyph>
| <i>
| <math malignmark>
| <u>
| <mo>
| "X"
#data
<malignmark></malignmark>
#errors
#document-fragment
math mo
#document
| <math malignmark>
#data
<div></div>
#errors
#document-fragment
math mo
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
math mo
#document
| <figure>
#data
<b></b><mglyph/><i></i><malignmark/><u></u><mi/>X
#errors
51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag.
52: End of file seen and there were open elements.
51: Unclosed element “mi”.
#new-errors
(1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus
#document-fragment
math mi
#document
| <b>
| <math mglyph>
| <i>
| <math malignmark>
| <u>
| <mi>
| "X"
#data
<malignmark></malignmark>
#errors
#document-fragment
math mi
#document
| <math malignmark>
#data
<div></div>
#errors
#document-fragment
math mi
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
math mi
#document
| <figure>
#data
<b></b><mglyph/><i></i><malignmark/><u></u><mtext/>X
#errors
51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag.
52: End of file seen and there were open elements.
51: Unclosed element “mtext”.
#new-errors
(1:44-1:52) non-void-html-element-start-tag-with-trailing-solidus
#document-fragment
math mtext
#document
| <b>
| <math mglyph>
| <i>
| <math malignmark>
| <u>
| <mtext>
| "X"
#data
<malignmark></malignmark>
#errors
#document-fragment
math mtext
#document
| <math malignmark>
#data
<div></div>
#errors
#document-fragment
math mtext
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
math mtext
#document
| <figure>
#data
<div></div>
#errors
5: HTML start tag “div” in a foreign namespace context.
#document-fragment
math annotation-xml
#document
| <math div>
#data
<figure></figure>
#errors
#document-fragment
math annotation-xml
#document
| <math figure>
#data
<div></div>
#errors
5: HTML start tag “div” in a foreign namespace context.
#document-fragment
math math
#document
| <math div>
#data
<figure></figure>
#errors
#document-fragment
math math
#document
| <math figure>
#data
<div></div>
#errors
#document-fragment
svg foreignObject
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
svg foreignObject
#document
| <figure>
#data
<div></div>
#errors
#document-fragment
svg title
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
svg title
#document
| <figure>
#data
<figure></figure>
#errors
#document-fragment
svg desc
#document
| <figure>
#data
<div><h1>X</h1></div>
#errors
5: HTML start tag “div” in a foreign namespace context.
9: HTML start tag “h1” in a foreign namespace context.
#document-fragment
svg svg
#document
| <svg div>
| <svg h1>
| "X"
#data
<div></div>
#errors
5: HTML start tag “div” in a foreign namespace context.
#document-fragment
svg svg
#document
| <svg div>
#data
<div></div>
#errors
#document-fragment
svg desc
#document
| <div>
#data
<figure></figure>
#errors
#document-fragment
svg desc
#document
| <figure>
#data
<plaintext><foo>
#errors
(1,16): expected-closing-tag-but-got-eof
#document-fragment
svg desc
#document
| <plaintext>
| "<foo>"
#data
<frameset>X
#errors
6: Stray start tag “frameset”.
#document-fragment
svg desc
#document
| "X"
#data
<head>X
#errors
6: Stray start tag “head”.
#document-fragment
svg desc
#document
| "X"
#data
<body>X
#errors
6: Stray start tag “body”.
#document-fragment
svg desc
#document
| "X"
#data
<html>X
#errors
6: Stray start tag “html”.
#document-fragment
svg desc
#document
| "X"
#data
<html class="foo">X
#errors
6: Stray start tag “html”.
#document-fragment
svg desc
#document
| "X"
#data
<body class="foo">X
#errors
6: Stray start tag “body”.
#document-fragment
svg desc
#document
| "X"

View file

@ -0,0 +1,302 @@
#data
<div<div>
#errors
(1,9): expected-doctype-but-got-start-tag
(1,9): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div<div>
#data
<div foo<bar=''>
#errors
(1,9): invalid-character-in-attribute-name
(1,16): expected-doctype-but-got-start-tag
(1,16): expected-closing-tag-but-got-eof
#new-errors
(1:9) unexpected-character-in-attribute-name
#document
| <html>
| <head>
| <body>
| <div>
| foo<bar=""
#data
<div foo=`bar`>
#errors
(1,10): equals-in-unquoted-attribute-value
(1,14): unexpected-character-in-unquoted-attribute-value
(1,15): expected-doctype-but-got-start-tag
(1,15): expected-closing-tag-but-got-eof
#new-errors
(1:10) unexpected-character-in-unquoted-attribute-value
(1:14) unexpected-character-in-unquoted-attribute-value
#document
| <html>
| <head>
| <body>
| <div>
| foo="`bar`"
#data
<div \"foo=''>
#errors
(1,7): invalid-character-in-attribute-name
(1,14): expected-doctype-but-got-start-tag
(1,14): expected-closing-tag-but-got-eof
#new-errors
(1:7) unexpected-character-in-attribute-name
#document
| <html>
| <head>
| <body>
| <div>
| \"foo=""
#data
<a href='\nbar'></a>
#errors
(1,16): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <a>
| href="\nbar"
#data
<!DOCTYPE html>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
#data
&lang;&rang;
#errors
(1,6): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "⟨⟩"
#data
&apos;
#errors
(1,6): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "'"
#data
&ImaginaryI;
#errors
(1,12): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| ""
#data
&Kopf;
#errors
(1,6): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "𝕂"
#data
&notinva;
#errors
(1,9): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "∉"
#data
<?import namespace="foo" implementation="#bar">
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,47): expected-doctype-but-got-eof
#new-errors
(1:2) unexpected-question-mark-instead-of-tag-name
#document
| <!-- ?import namespace="foo" implementation="#bar" -->
| <html>
| <head>
| <body>
#data
<!--foo--bar-->
#errors
(1,10): unexpected-char-in-comment
(1,15): expected-doctype-but-got-eof
#document
| <!-- foo--bar -->
| <html>
| <head>
| <body>
#data
<![CDATA[x]]>
#errors
(1,2): expected-dashes-or-doctype
(1,13): expected-doctype-but-got-eof
#new-errors
(1:9) cdata-in-html-content
#document
| <!-- [CDATA[x]] -->
| <html>
| <head>
| <body>
#data
<textarea><!--</textarea>--></textarea>
#errors
(1,10): expected-doctype-but-got-start-tag
(1,39): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <textarea>
| "<!--"
| "-->"
#data
<textarea><!--</textarea>-->
#errors
(1,10): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <textarea>
| "<!--"
| "-->"
#data
<style><!--</style>--></style>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,30): unexpected-end-tag
#document
| <html>
| <head>
| <style>
| "<!--"
| <body>
| "-->"
#data
<style><!--</style>-->
#errors
(1,7): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <style>
| "<!--"
| <body>
| "-->"
#data
<ul><li>A </li> <li>B</li></ul>
#errors
(1,4): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ul>
| <li>
| "A "
| " "
| <li>
| "B"
#data
<table><form><input type=hidden><input></form><div></div></table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,13): unexpected-form-in-table
(1,32): unexpected-hidden-input-in-table
(1,39): unexpected-start-tag-implies-table-voodoo
(1,46): unexpected-end-tag-implies-table-voodoo
(1,46): unexpected-end-tag
(1,51): unexpected-start-tag-implies-table-voodoo
(1,57): unexpected-end-tag-implies-table-voodoo
#document
| <html>
| <head>
| <body>
| <input>
| <div>
| <table>
| <form>
| <input>
| type="hidden"
#data
<i>A<b>B<p></i>C</b>D
#errors
(1,3): expected-doctype-but-got-start-tag
(1,15): adoption-agency-1.3
(1,20): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <i>
| "A"
| <b>
| "B"
| <b>
| <p>
| <b>
| <i>
| "C"
| "D"
#data
<div></div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
#data
<svg></svg>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <svg svg>
#data
<math></math>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <math math>

View file

@ -0,0 +1,54 @@
#data
<button>1</foo>
#errors
(1,8): expected-doctype-but-got-start-tag
(1,15): unexpected-end-tag
(1,15): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <button>
| "1"
#data
<foo>1<p>2</foo>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,16): unexpected-end-tag
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <foo>
| "1"
| <p>
| "2"
#data
<dd>1</foo>
#errors
(1,4): expected-doctype-but-got-start-tag
(1,11): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <dd>
| "1"
#data
<foo>1<dd>2</foo>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,17): unexpected-end-tag
(1,17): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <foo>
| "1"
| <dd>
| "2"

View file

@ -0,0 +1,49 @@
#data
<isindex>
#errors
(1,9): expected-doctype-but-got-start-tag
(1,9): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <isindex>
#data
<isindex name="A" action="B" prompt="C" foo="D">
#errors
(1,48): expected-doctype-but-got-start-tag
(1,48): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <isindex>
| action="B"
| foo="D"
| name="A"
| prompt="C"
#data
<form><isindex>
#errors
(1,6): expected-doctype-but-got-start-tag
(1,15): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <form>
| <isindex>
#data
<!doctype html><isindex>x</isindex>x
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <isindex>
| "x"
| "x"

View file

@ -0,0 +1,46 @@
#data
<!doctype html><p>foo<main>bar<p>baz
#errors
(1,36): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <main>
| "bar"
| <p>
| "baz"
#data
<!doctype html><main><p>foo</main>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <main>
| <p>
| "foo"
| "bar"
#data
<!DOCTYPE html>xxx<svg><x><g><a><main><b>
#errors
* (1,42) unexpected HTML-like start tag token in foreign content
* (1,42) unexpected end of file
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "xxx"
| <svg svg>
| <svg x>
| <svg g>
| <svg a>
| <svg main>
| <b>

View file

@ -0,0 +1,81 @@
#data
<math><tr><td><mo><tr>
#errors
#document-fragment
td
#document
| <math math>
| <math tr>
| <math td>
| <math mo>
#data
<math><tr><td><mo><tr>
#errors
#document-fragment
tr
#document
| <math math>
| <math tr>
| <math td>
| <math mo>
#data
<math><thead><mo><tbody>
#errors
#document-fragment
thead
#document
| <math math>
| <math thead>
| <math mo>
#data
<math><tfoot><mo><tbody>
#errors
#document-fragment
tfoot
#document
| <math math>
| <math tfoot>
| <math mo>
#data
<math><tbody><mo><tfoot>
#errors
#document-fragment
tbody
#document
| <math math>
| <math tbody>
| <math mo>
#data
<math><tbody><mo></table>
#errors
#document-fragment
tbody
#document
| <math math>
| <math tbody>
| <math mo>
#data
<math><thead><mo></table>
#errors
#document-fragment
tbody
#document
| <math math>
| <math thead>
| <math mo>
#data
<math><tfoot><mo></table>
#errors
#document-fragment
tbody
#document
| <math math>
| <math tfoot>
| <math mo>

View file

@ -0,0 +1,257 @@
#data
<menuitem>
#errors
10: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
10: End of file seen and there were open elements.
10: Unclosed element “menuitem”.
#document
| <html>
| <head>
| <body>
| <menuitem>
#data
</menuitem>
#errors
11: End tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
11: Stray end tag “menuitem”.
#document
| <html>
| <head>
| <body>
#data
<!DOCTYPE html><body><menuitem>A
#errors
32: End of file seen and there were open elements.
31: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| "A"
#data
<!DOCTYPE html><body><menuitem>A<menuitem>B
#errors
43: End of file seen and there were open elements.
42: Unclosed element “menuitem”.
31: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| "A"
| <menuitem>
| "B"
#data
<!DOCTYPE html><body><menuitem>A<menu>B</menu>
#errors
46: End of file seen and there were open elements.
31: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| "A"
| <menu>
| "B"
#data
<!DOCTYPE html><body><menuitem>A<hr>B
#errors
37: End of file seen and there were open elements.
31: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| "A"
| <hr>
| "B"
#data
<!DOCTYPE html><li><menuitem><li>
#errors
33: End tag “li” implied, but there were open elements.
29: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <li>
| <menuitem>
| <li>
#data
<!DOCTYPE html><menuitem><p></menuitem>x
#errors
39: Stray end tag “menuitem”.
40: End of file seen and there were open elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| <p>
| "x"
#data
<!DOCTYPE html><p><b></p><menuitem>
#errors
25: End tag “p” seen, but there were open elements.
21: Unclosed element “b”.
35: End of file seen and there were open elements.
35: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <b>
| <b>
| <menuitem>
#data
<!DOCTYPE html><menuitem><asdf></menuitem>x
#errors
42: End tag “menuitem” seen, but there were open elements.
31: Unclosed element “asdf”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| <asdf>
| "x"
#data
<!DOCTYPE html></menuitem>
#errors
26: Stray end tag “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
#data
<!DOCTYPE html><html></menuitem>
#errors
26: Stray end tag “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
#data
<!DOCTYPE html><head></menuitem>
#errors
26: Stray end tag “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
#data
<!DOCTYPE html><select><menuitem></select>
#errors
33: Stray start tag “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!DOCTYPE html><option><menuitem>
#errors
33: End of file seen and there were open elements.
33: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <option>
| <menuitem>
#data
<!DOCTYPE html><menuitem><option>
#errors
33: End of file seen and there were open elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| <option>
#data
<!DOCTYPE html><menuitem></body>
#errors
32: End tag for “body” seen, but there were unclosed elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
#data
<!DOCTYPE html><menuitem></html>
#errors
32: End tag for “html” seen, but there were unclosed elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
#data
<!DOCTYPE html><menuitem><p>
#errors
28: End of file seen and there were open elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| <p>
#data
<!DOCTYPE html><menuitem><li>
#errors
29: End of file seen and there were open elements.
25: Unclosed element “menuitem”.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <menuitem>
| <li>

View file

@ -0,0 +1,16 @@
#data
<body><table><tr><td><svg><td><foreignObject><span></td>Foo
#errors
#document
| <html>
| <head>
| <body>
| "Foo"
| <table>
| <tbody>
| <tr>
| <td>
| <svg svg>
| <svg td>
| <svg foreignObject>
| <span>

View file

@ -0,0 +1,237 @@
#data
<head><noscript><!doctype html><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 31 Unexpected DOCTYPE. Ignored.
#script-off
#document
| <html>
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript><html class="foo"><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 34 html needs to be the first start tag.
#script-off
#document
| <html>
| class="foo"
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript></noscript>
#errors
(1,6): expected-doctype-but-got-tag
#script-off
#document
| <html>
| <head>
| <noscript>
| <body>
#data
<head><noscript> </noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| " "
| <body>
#data
<head><noscript><!--foo--></noscript>
#errors
(1,6): expected-doctype-but-got-tag
#script-off
#document
| <html>
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript><basefont><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <basefont>
| <!-- foo -->
| <body>
#data
<head><noscript><bgsound><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <bgsound>
| <!-- foo -->
| <body>
#data
<head><noscript><link><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <link>
| <!-- foo -->
| <body>
#data
<head><noscript><meta><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <meta>
| <!-- foo -->
| <body>
#data
<head><noscript><noframes>XXX</noscript></noframes></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <noframes>
| "XXX</noscript>"
| <body>
#data
<head><noscript><style>XXX</style></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
#script-off
#document
| <html>
| <head>
| <noscript>
| <style>
| "XXX"
| <body>
#data
<head><noscript></br><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 21 Element br not allowed in a inhead-noscript context
Line: 1 Col: 21 Unexpected end tag (br). Treated as br element.
Line: 1 Col: 42 Unexpected end tag (noscript). Ignored.
#script-off
#document
| <html>
| <head>
| <noscript>
| <body>
| <br>
| <!-- foo -->
#data
<head><noscript><head class="foo"><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 34 Unexpected start tag (head).
#script-off
#document
| <html>
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript><noscript class="foo"><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 34 Unexpected start tag (noscript).
#script-off
#document
| <html>
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript></p><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 20 Unexpected end tag (p). Ignored.
#script-off
#document
| <html>
| <head>
| <noscript>
| <!-- foo -->
| <body>
#data
<head><noscript><p><!--foo--></noscript>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 19 Element p not allowed in a inhead-noscript context
Line: 1 Col: 40 Unexpected end tag (noscript). Ignored.
#script-off
#document
| <html>
| <head>
| <noscript>
| <body>
| <p>
| <!-- foo -->
#data
<head><noscript>XXX<!--foo--></noscript></head>
#errors
Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
Line: 1 Col: 19 Unexpected non-space character. Expected inhead-noscript content
Line: 1 Col: 30 Unexpected end tag (noscript). Ignored.
Line: 1 Col: 37 Unexpected end tag (head). Ignored.
#script-off
#document
| <html>
| <head>
| <noscript>
| <body>
| "XXX"
| <!-- foo -->
#data
<head><noscript>
#errors
(1,6): expected-doctype-but-got-tag
(1,6): eof-in-head-noscript
#script-off
#document
| <html>
| <head>
| <noscript>
| <body>

View file

@ -0,0 +1,46 @@
#data
<input type="hidden"><frameset>
#errors
(1,21): expected-doctype-but-got-start-tag
(1,31): unexpected-start-tag
(1,31): eof-in-frameset
#document
| <html>
| <head>
| <frameset>
#data
<!DOCTYPE html><table><caption><svg>foo</table>bar
#errors
(1,47): unexpected-end-tag
(1,47): end-table-tag-in-caption
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <caption>
| <svg svg>
| "foo"
| "bar"
#data
<table><tr><td><svg><desc><td></desc><circle>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,30): unexpected-cell-end-tag
(1,37): unexpected-end-tag
(1,45): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <svg svg>
| <svg desc>
| <td>
| <circle>

Binary file not shown.

View file

@ -0,0 +1,301 @@
#data
<html><ruby>a<rb>b<rb></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rb>
| "b"
| <rb>
#data
<html><ruby>a<rb>b<rt></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rb>
| "b"
| <rt>
#data
<html><ruby>a<rb>b<rtc></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rb>
| "b"
| <rtc>
#data
<html><ruby>a<rb>b<rp></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rb>
| "b"
| <rp>
#data
<html><ruby>a<rb>b<span></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
(1,31): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rb>
| "b"
| <span>
#data
<html><ruby>a<rt>b<rb></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rt>
| "b"
| <rb>
#data
<html><ruby>a<rt>b<rt></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rt>
| "b"
| <rt>
#data
<html><ruby>a<rt>b<rtc></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rt>
| "b"
| <rtc>
#data
<html><ruby>a<rt>b<rp></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rt>
| "b"
| <rp>
#data
<html><ruby>a<rt>b<span></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
(1,31): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rt>
| "b"
| <span>
#data
<html><ruby>a<rtc>b<rb></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rtc>
| "b"
| <rb>
#data
<html><ruby>a<rtc>b<rt>c<rt>d</ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rtc>
| "b"
| <rt>
| "c"
| <rt>
| "d"
#data
<html><ruby>a<rtc>b<rtc></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rtc>
| "b"
| <rtc>
#data
<html><ruby>a<rtc>b<rp></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rtc>
| "b"
| <rp>
#data
<html><ruby>a<rtc>b<span></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rtc>
| "b"
| <span>
#data
<html><ruby>a<rp>b<rb></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rp>
| "b"
| <rb>
#data
<html><ruby>a<rp>b<rt></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rp>
| "b"
| <rt>
#data
<html><ruby>a<rp>b<rtc></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rp>
| "b"
| <rtc>
#data
<html><ruby>a<rp>b<rp></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rp>
| "b"
| <rp>
#data
<html><ruby>a<rp>b<span></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
(1,31): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| "a"
| <rp>
| "b"
| <span>
#data
<html><ruby><rtc><ruby>a<rb>b<rt></ruby></ruby></html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <ruby>
| <rtc>
| <ruby>
| "a"
| <rb>
| "b"
| <rt>

View file

@ -0,0 +1,385 @@
#data
FOO<script>'Hello'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'Hello'"
| "BAR"
#data
FOO<script></script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "BAR"
#data
FOO<script></script >BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "BAR"
#data
FOO<script></script/>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,21): self-closing-flag-on-end-tag
#new-errors
(1:21) end-tag-with-trailing-solidus
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "BAR"
#data
FOO<script></script/ >BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,20): unexpected-character-after-solidus-in-tag
#new-errors
(1:21) unexpected-solidus-in-tag
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "BAR"
#data
FOO<script type="text/plain"></scriptx>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,42): expected-named-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "</scriptx>BAR"
#data
FOO<script></script foo=">" dd>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,31): attributes-in-end-tag
#new-errors
(1:31) end-tag-with-attributes
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "BAR"
#data
FOO<script>'<'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<'"
| "BAR"
#data
FOO<script>'<!'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!'"
| "BAR"
#data
FOO<script>'<!-'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-'"
| "BAR"
#data
FOO<script>'<!--'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!--'"
| "BAR"
#data
FOO<script>'<!---'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!---'"
| "BAR"
#data
FOO<script>'<!-->'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-->'"
| "BAR"
#data
FOO<script>'<!-->'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-->'"
| "BAR"
#data
FOO<script>'<!-- potato'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-- potato'"
| "BAR"
#data
FOO<script>'<!-- <sCrIpt'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-- <sCrIpt'"
| "BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt>'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,56): expected-script-data-but-got-eof
(1,56): expected-named-closing-tag-but-got-eof
#new-errors
(1:57) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt>'</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt> -'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,58): expected-script-data-but-got-eof
(1,58): expected-named-closing-tag-but-got-eof
#new-errors
(1:59) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt> -'</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt> --'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,59): expected-script-data-but-got-eof
(1,59): expected-named-closing-tag-but-got-eof
#new-errors
(1:60) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt> --'</script>BAR"
#data
FOO<script>'<!-- <sCrIpt> -->'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "'<!-- <sCrIpt> -->'"
| "BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt> --!>'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,61): expected-script-data-but-got-eof
(1,61): expected-named-closing-tag-but-got-eof
#new-errors
(1:62) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt> --!>'</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt> -- >'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,61): expected-script-data-but-got-eof
(1,61): expected-named-closing-tag-but-got-eof
#new-errors
(1:62) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt> -- >'</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt '</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,56): expected-script-data-but-got-eof
(1,56): expected-named-closing-tag-but-got-eof
#new-errors
(1:57) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt '</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,56): expected-script-data-but-got-eof
(1,56): expected-named-closing-tag-but-got-eof
#new-errors
(1:57) eof-in-script-html-comment-like-text
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt/'</script>BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt\'</script>BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt\'"
| "BAR"
#data
FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR</script>QUX
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| type="text/plain"
| "'<!-- <sCrIpt/'</script>BAR"
| "QUX"
#data
FOO<script><!--<script>-></script>--></script>QUX
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <script>
| "<!--<script>-></script>-->"
| "QUX"

View file

@ -0,0 +1,16 @@
#data
<p><b id="A"><script>document.getElementById("A").id = "B"</script></p>TEXT</b>
#errors
#script-on
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| id="B"
| <script>
| "document.getElementById("A").id = "B""
| <b>
| id="A"
| "TEXT"

View file

@ -0,0 +1,27 @@
#data
<p><font size=4><font size=4><font size=4><script>document.getElementsByTagName("font")[2].setAttribute("size", "5");</script><font size=4><p>X
#errors
#script-on
#document
| <html>
| <head>
| <body>
| <p>
| <font>
| size="4"
| <font>
| size="4"
| <font>
| size="5"
| <script>
| "document.getElementsByTagName("font")[2].setAttribute("size", "5");"
| <font>
| size="4"
| <p>
| <font>
| size="4"
| <font>
| size="4"
| <font>
| size="4"
| "X"

View file

@ -0,0 +1,30 @@
#data
1<script>document.write("2")</script>3
#errors
#script-on
#document
| <html>
| <head>
| <body>
| "1"
| <script>
| "document.write("2")"
| "23"
#data
1<script>document.write("<script>document.write('2')</scr"+ "ipt><script>document.write('3')</scr" + "ipt>")</script>4
#errors
#script-on
#document
| <html>
| <head>
| <body>
| "1"
| <script>
| "document.write("<script>document.write('2')</scr"+ "ipt><script>document.write('3')</scr" + "ipt>")"
| <script>
| "document.write('2')"
| "2"
| <script>
| "document.write('3')"
| "34"

View file

@ -0,0 +1,286 @@
#data
<table><th>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
(1,11): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <th>
#data
<table><td>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
(1,11): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
#data
<table><col foo='bar'>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,22): eof-in-table
#document
| <html>
| <head>
| <body>
| <table>
| <colgroup>
| <col>
| foo="bar"
#data
<table><colgroup></html>foo
#errors
(1,7): expected-doctype-but-got-start-tag
(1,24): unexpected-end-tag
(1,27): foster-parenting-character-in-table
(1,27): foster-parenting-character-in-table
(1,27): foster-parenting-character-in-table
(1,27): eof-in-table
#document
| <html>
| <head>
| <body>
| "foo"
| <table>
| <colgroup>
#data
<table></table><p>foo
#errors
(1,7): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <table>
| <p>
| "foo"
#data
<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,14): unexpected-end-tag
(1,24): unexpected-end-tag
(1,30): unexpected-end-tag
(1,41): unexpected-end-tag
(1,48): unexpected-end-tag
(1,56): unexpected-end-tag
(1,61): unexpected-end-tag
(1,69): unexpected-end-tag
(1,74): unexpected-end-tag
(1,82): unexpected-end-tag
(1,87): unexpected-end-tag
(1,91): unexpected-cell-in-table-body
(1,91): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
#data
<table><select><option>3</select></table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,15): unexpected-start-tag-implies-table-voodoo
#document
| <html>
| <head>
| <body>
| <select>
| <option>
| "3"
| <table>
#data
<table><select><table></table></select></table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,15): unexpected-start-tag-implies-table-voodoo
(1,22): unexpected-table-element-start-tag-in-select-in-table
(1,22): unexpected-start-tag-implies-end-tag
(1,39): unexpected-end-tag
(1,47): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <select>
| <table>
| <table>
#data
<table><select></table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,15): unexpected-start-tag-implies-table-voodoo
(1,23): unexpected-table-element-end-tag-in-select-in-table
#document
| <html>
| <head>
| <body>
| <select>
| <table>
#data
<table><select><option>A<tr><td>B</td></tr></table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,15): unexpected-start-tag-implies-table-voodoo
(1,28): unexpected-table-element-start-tag-in-select-in-table
#document
| <html>
| <head>
| <body>
| <select>
| <option>
| "A"
| <table>
| <tbody>
| <tr>
| <td>
| "B"
#data
<table><td></body></caption></col></colgroup></html>foo
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
(1,18): unexpected-end-tag
(1,28): unexpected-end-tag
(1,34): unexpected-end-tag
(1,45): unexpected-end-tag
(1,52): unexpected-end-tag
(1,55): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| "foo"
#data
<table><td>A</table>B
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| "A"
| "B"
#data
<table><tr><caption>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,20): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <caption>
#data
<table><tr></body></caption></col></colgroup></html></td></th><td>foo
#errors
(1,7): expected-doctype-but-got-start-tag
(1,18): unexpected-end-tag-in-table-row
(1,28): unexpected-end-tag-in-table-row
(1,34): unexpected-end-tag-in-table-row
(1,45): unexpected-end-tag-in-table-row
(1,52): unexpected-end-tag-in-table-row
(1,57): unexpected-end-tag-in-table-row
(1,62): unexpected-end-tag-in-table-row
(1,69): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| "foo"
#data
<table><td><tr>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
(1,15): eof-in-table
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <tr>
#data
<table><td><button><td>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,11): unexpected-cell-in-table-body
(1,23): unexpected-cell-end-tag
(1,23): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <button>
| <td>
#data
<table><tr><td><svg><desc><td>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,30): unexpected-cell-end-tag
(1,30): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <svg svg>
| <svg desc>
| <td>

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,849 @@
#data
<!DOCTYPE html><svg></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
#data
<!DOCTYPE html><svg></svg><![CDATA[a]]>
#errors
(1,28) expected-dashes-or-doctype
#new-errors
(1:35) cdata-in-html-content
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <!-- [CDATA[a]] -->
#data
<!DOCTYPE html><body><svg></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
#data
<!DOCTYPE html><body><select><svg></svg></select>
#errors
(1,34) unexpected-start-tag-in-select
(1,40) unexpected-end-tag-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!DOCTYPE html><body><select><option><svg></svg></option></select>
#errors
(1,42) unexpected-start-tag-in-select
(1,48) unexpected-end-tag-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
| <option>
#data
<!DOCTYPE html><body><table><svg></svg></table>
#errors
(1,33) foster-parenting-start-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <table>
#data
<!DOCTYPE html><body><table><svg><g>foo</g></svg></table>
#errors
(1,33) foster-parenting-start-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <table>
#data
<!DOCTYPE html><body><table><svg><g>foo</g><g>bar</g></svg></table>
#errors
(1,33) foster-parenting-start-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <table>
#data
<!DOCTYPE html><body><table><tbody><svg><g>foo</g><g>bar</g></svg></tbody></table>
#errors
(1,40) foster-parenting-start-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <table>
| <tbody>
#data
<!DOCTYPE html><body><table><tbody><tr><svg><g>foo</g><g>bar</g></svg></tr></tbody></table>
#errors
(1,44) foster-parenting-start-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <table>
| <tbody>
| <tr>
#data
<!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg></td></tr></tbody></table>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
#data
<!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg><p>baz</td></tr></tbody></table>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
#data
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g></svg><p>baz</caption></table>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <caption>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
#data
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
#errors
(1,65) unexpected-html-element-in-foreign-content
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <caption>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
| <p>
| "quux"
#data
<!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g>baz</table><p>quux
#errors
(1,73) unexpected-end-tag
(1,73) expected-one-end-tag-but-got-another
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <caption>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| "baz"
| <p>
| "quux"
#data
<!DOCTYPE html><body><table><colgroup><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
#errors
(1,43) foster-parenting-start-tag svg
(1,66) unexpected HTML-like start tag token in foreign content
(1,66) foster-parenting-start-tag
(1,67) foster-parenting-character
(1,68) foster-parenting-character
(1,69) foster-parenting-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
| <table>
| <colgroup>
| <p>
| "quux"
#data
<!DOCTYPE html><body><table><tr><td><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
#errors
(1,49) unexpected-start-tag-in-select
(1,52) unexpected-start-tag-in-select
(1,59) unexpected-end-tag-in-select
(1,62) unexpected-start-tag-in-select
(1,69) unexpected-end-tag-in-select
(1,72) unexpected-start-tag-in-select
(1,83) unexpected-table-element-end-tag-in-select-in-table
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <select>
| "foobarbaz"
| <p>
| "quux"
#data
<!DOCTYPE html><body><table><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
#errors
(1,36) unexpected-start-tag-implies-table-voodoo
(1,41) unexpected-start-tag-in-select
(1,44) unexpected-start-tag-in-select
(1,51) unexpected-end-tag-in-select
(1,54) unexpected-start-tag-in-select
(1,61) unexpected-end-tag-in-select
(1,64) unexpected-start-tag-in-select
(1,75) unexpected-table-element-end-tag-in-select-in-table
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
| "foobarbaz"
| <table>
| <p>
| "quux"
#data
<!DOCTYPE html><body></body></html><svg><g>foo</g><g>bar</g><p>baz
#errors
(1,40) expected-eof-but-got-start-tag
(1,63) unexpected-html-element-in-foreign-content
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
#data
<!DOCTYPE html><body></body><svg><g>foo</g><g>bar</g><p>baz
#errors
(1,33) unexpected-start-tag-after-body
(1,56) unexpected-html-element-in-foreign-content
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg g>
| "foo"
| <svg g>
| "bar"
| <p>
| "baz"
#data
<!DOCTYPE html><frameset><svg><g></g><g></g><p><span>
#errors
(1,30) unexpected-start-tag-in-frameset
(1,33) unexpected-start-tag-in-frameset
(1,37) unexpected-end-tag-in-frameset
(1,40) unexpected-start-tag-in-frameset
(1,44) unexpected-end-tag-in-frameset
(1,47) unexpected-start-tag-in-frameset
(1,53) unexpected-start-tag-in-frameset
(1,53) eof-in-frameset
#document
| <!DOCTYPE html>
| <html>
| <head>
| <frameset>
#data
<!DOCTYPE html><frameset></frameset><svg><g></g><g></g><p><span>
#errors
(1,41) unexpected-start-tag-after-frameset
(1,44) unexpected-start-tag-after-frameset
(1,48) unexpected-end-tag-after-frameset
(1,51) unexpected-start-tag-after-frameset
(1,55) unexpected-end-tag-after-frameset
(1,58) unexpected-start-tag-after-frameset
(1,64) unexpected-start-tag-after-frameset
#document
| <!DOCTYPE html>
| <html>
| <head>
| <frameset>
#data
<!DOCTYPE html><body xlink:href=foo><svg xlink:href=foo></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| xlink:href="foo"
| <svg svg>
| xlink href="foo"
#data
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo></g></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| xlink:href="foo"
| xml:lang="en"
| <svg svg>
| <svg g>
| xlink href="foo"
| xml lang="en"
#data
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo /></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| xlink:href="foo"
| xml:lang="en"
| <svg svg>
| <svg g>
| xlink href="foo"
| xml lang="en"
#data
<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo />bar</svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| xlink:href="foo"
| xml:lang="en"
| <svg svg>
| <svg g>
| xlink href="foo"
| xml lang="en"
| "bar"
#data
<svg></path>
#errors
(1,5) expected-doctype-but-got-start-tag
(1,12) unexpected-end-tag
(1,12) unexpected-end-tag
(1,12) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <svg svg>
#data
<div><svg></div>a
#errors
(1,5) expected-doctype-but-got-start-tag
(1,16) unexpected-end-tag
(1,16) end-tag-too-early
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| "a"
#data
<div><svg><path></div>a
#errors
(1,5) expected-doctype-but-got-start-tag
(1,22) unexpected-end-tag
(1,22) end-tag-too-early
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| <svg path>
| "a"
#data
<div><svg><path></svg><path>
#errors
(1,5) expected-doctype-but-got-start-tag
(1,22) unexpected-end-tag
(1,28) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| <svg path>
| <path>
#data
<div><svg><path><foreignObject><math></div>a
#errors
(1,5) expected-doctype-but-got-start-tag
(1,43) unexpected-end-tag
(1,43) end-tag-too-early
(1,44) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| <svg path>
| <svg foreignObject>
| <math math>
| "a"
#data
<div><svg><path><foreignObject><p></div>a
#errors
(1,5) expected-doctype-but-got-start-tag
(1,40) end-tag-too-early
(1,41) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| <svg path>
| <svg foreignObject>
| <p>
| "a"
#data
<!DOCTYPE html><svg><desc><div><svg><ul>a
#errors
(1,40) unexpected-html-element-in-foreign-content
(1,41) expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg desc>
| <div>
| <svg svg>
| <ul>
| "a"
#data
<!DOCTYPE html><svg><desc><svg><ul>a
#errors
(1,35) unexpected-html-element-in-foreign-content
(1,36) expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg desc>
| <svg svg>
| <ul>
| "a"
#data
<!DOCTYPE html><p><svg><desc><p>
#errors
(1,32) expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <svg svg>
| <svg desc>
| <p>
#data
<!DOCTYPE html><p><svg><title><p>
#errors
(1,33) expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <svg svg>
| <svg title>
| <p>
#data
<div><svg><path><foreignObject><p></foreignObject><p>
#errors
(1,5) expected-doctype-but-got-start-tag
(1,50) unexpected-end-tag
(1,53) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <svg svg>
| <svg path>
| <svg foreignObject>
| <p>
| <p>
#data
<math><mi><div><object><div><span></span></div></object></div></mi><mi>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,71) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mi>
| <div>
| <object>
| <div>
| <span>
| <math mi>
#data
<math><mi><svg><foreignObject><div><div></div></div></foreignObject></svg></mi><mi>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,83) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mi>
| <svg svg>
| <svg foreignObject>
| <div>
| <div>
| <math mi>
#data
<svg><script></script><path>
#errors
(1,5) expected-doctype-but-got-start-tag
(1,28) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <svg svg>
| <svg script>
| <svg path>
#data
<table><svg></svg><tr>
#errors
(1,7) expected-doctype-but-got-start-tag
(1,12) unexpected-start-tag-implies-table-voodoo
(1,22) eof-in-table
#document
| <html>
| <head>
| <body>
| <svg svg>
| <table>
| <tbody>
| <tr>
#data
<math><mi><mglyph>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,18) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mi>
| <math mglyph>
#data
<math><mi><malignmark>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,22) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mi>
| <math malignmark>
#data
<math><mo><mglyph>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,18) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mo>
| <math mglyph>
#data
<math><mo><malignmark>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,22) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mo>
| <math malignmark>
#data
<math><mn><mglyph>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,18) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mn>
| <math mglyph>
#data
<math><mn><malignmark>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,22) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mn>
| <math malignmark>
#data
<math><ms><mglyph>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,18) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math ms>
| <math mglyph>
#data
<math><ms><malignmark>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,22) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math ms>
| <math malignmark>
#data
<math><mtext><mglyph>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,21) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mtext>
| <math mglyph>
#data
<math><mtext><malignmark>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,25) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math mtext>
| <math malignmark>
#data
<math><annotation-xml><svg></svg></annotation-xml><mi>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,54) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math annotation-xml>
| <svg svg>
| <math mi>
#data
<math><annotation-xml><svg><foreignObject><div><math><mi></mi></math><span></span></div></foreignObject><path></path></svg></annotation-xml><mi>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,144) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math annotation-xml>
| <svg svg>
| <svg foreignObject>
| <div>
| <math math>
| <math mi>
| <span>
| <svg path>
| <math mi>
#data
<math><annotation-xml><svg><foreignObject><math><mi><svg></svg></mi><mo></mo></math><span></span></foreignObject><path></path></svg></annotation-xml><mi>
#errors
(1,6) expected-doctype-but-got-start-tag
(1,153) expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <math math>
| <math annotation-xml>
| <svg svg>
| <svg foreignObject>
| <math math>
| <math mi>
| <svg svg>
| <math mo>
| <span>
| <svg path>
| <math mi>

View file

@ -0,0 +1,523 @@
#data
<!DOCTYPE html><body><svg attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' diffuseConstant='' edgeMode='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| attributeName=""
| attributeType=""
| baseFrequency=""
| baseProfile=""
| calcMode=""
| clipPathUnits=""
| diffuseConstant=""
| edgeMode=""
| filterUnits=""
| glyphRef=""
| gradientTransform=""
| gradientUnits=""
| kernelMatrix=""
| kernelUnitLength=""
| keyPoints=""
| keySplines=""
| keyTimes=""
| lengthAdjust=""
| limitingConeAngle=""
| markerHeight=""
| markerUnits=""
| markerWidth=""
| maskContentUnits=""
| maskUnits=""
| numOctaves=""
| pathLength=""
| patternContentUnits=""
| patternTransform=""
| patternUnits=""
| pointsAtX=""
| pointsAtY=""
| pointsAtZ=""
| preserveAlpha=""
| preserveAspectRatio=""
| primitiveUnits=""
| refX=""
| refY=""
| repeatCount=""
| repeatDur=""
| requiredExtensions=""
| requiredFeatures=""
| specularConstant=""
| specularExponent=""
| spreadMethod=""
| startOffset=""
| stdDeviation=""
| stitchTiles=""
| surfaceScale=""
| systemLanguage=""
| tableValues=""
| targetX=""
| targetY=""
| textLength=""
| viewBox=""
| viewTarget=""
| xChannelSelector=""
| yChannelSelector=""
| zoomAndPan=""
#data
<!DOCTYPE html><BODY><SVG ATTRIBUTENAME='' ATTRIBUTETYPE='' BASEFREQUENCY='' BASEPROFILE='' CALCMODE='' CLIPPATHUNITS='' DIFFUSECONSTANT='' EDGEMODE='' FILTERUNITS='' GLYPHREF='' GRADIENTTRANSFORM='' GRADIENTUNITS='' KERNELMATRIX='' KERNELUNITLENGTH='' KEYPOINTS='' KEYSPLINES='' KEYTIMES='' LENGTHADJUST='' LIMITINGCONEANGLE='' MARKERHEIGHT='' MARKERUNITS='' MARKERWIDTH='' MASKCONTENTUNITS='' MASKUNITS='' NUMOCTAVES='' PATHLENGTH='' PATTERNCONTENTUNITS='' PATTERNTRANSFORM='' PATTERNUNITS='' POINTSATX='' POINTSATY='' POINTSATZ='' PRESERVEALPHA='' PRESERVEASPECTRATIO='' PRIMITIVEUNITS='' REFX='' REFY='' REPEATCOUNT='' REPEATDUR='' REQUIREDEXTENSIONS='' REQUIREDFEATURES='' SPECULARCONSTANT='' SPECULAREXPONENT='' SPREADMETHOD='' STARTOFFSET='' STDDEVIATION='' STITCHTILES='' SURFACESCALE='' SYSTEMLANGUAGE='' TABLEVALUES='' TARGETX='' TARGETY='' TEXTLENGTH='' VIEWBOX='' VIEWTARGET='' XCHANNELSELECTOR='' YCHANNELSELECTOR='' ZOOMANDPAN=''></SVG>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| attributeName=""
| attributeType=""
| baseFrequency=""
| baseProfile=""
| calcMode=""
| clipPathUnits=""
| diffuseConstant=""
| edgeMode=""
| filterUnits=""
| glyphRef=""
| gradientTransform=""
| gradientUnits=""
| kernelMatrix=""
| kernelUnitLength=""
| keyPoints=""
| keySplines=""
| keyTimes=""
| lengthAdjust=""
| limitingConeAngle=""
| markerHeight=""
| markerUnits=""
| markerWidth=""
| maskContentUnits=""
| maskUnits=""
| numOctaves=""
| pathLength=""
| patternContentUnits=""
| patternTransform=""
| patternUnits=""
| pointsAtX=""
| pointsAtY=""
| pointsAtZ=""
| preserveAlpha=""
| preserveAspectRatio=""
| primitiveUnits=""
| refX=""
| refY=""
| repeatCount=""
| repeatDur=""
| requiredExtensions=""
| requiredFeatures=""
| specularConstant=""
| specularExponent=""
| spreadMethod=""
| startOffset=""
| stdDeviation=""
| stitchTiles=""
| surfaceScale=""
| systemLanguage=""
| tableValues=""
| targetX=""
| targetY=""
| textLength=""
| viewBox=""
| viewTarget=""
| xChannelSelector=""
| yChannelSelector=""
| zoomAndPan=""
#data
<!DOCTYPE html><body><svg attributename='' attributetype='' basefrequency='' baseprofile='' calcmode='' clippathunits='' diffuseconstant='' edgemode='' filterunits='' filterres='' glyphref='' gradienttransform='' gradientunits='' kernelmatrix='' kernelunitlength='' keypoints='' keysplines='' keytimes='' lengthadjust='' limitingconeangle='' markerheight='' markerunits='' markerwidth='' maskcontentunits='' maskunits='' numoctaves='' pathlength='' patterncontentunits='' patterntransform='' patternunits='' pointsatx='' pointsaty='' pointsatz='' preservealpha='' preserveaspectratio='' primitiveunits='' refx='' refy='' repeatcount='' repeatdur='' requiredextensions='' requiredfeatures='' specularconstant='' specularexponent='' spreadmethod='' startoffset='' stddeviation='' stitchtiles='' surfacescale='' systemlanguage='' tablevalues='' targetx='' targety='' textlength='' viewbox='' viewtarget='' xchannelselector='' ychannelselector='' zoomandpan=''></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| attributeName=""
| attributeType=""
| baseFrequency=""
| baseProfile=""
| calcMode=""
| clipPathUnits=""
| diffuseConstant=""
| edgeMode=""
| filterUnits=""
| filterres=""
| glyphRef=""
| gradientTransform=""
| gradientUnits=""
| kernelMatrix=""
| kernelUnitLength=""
| keyPoints=""
| keySplines=""
| keyTimes=""
| lengthAdjust=""
| limitingConeAngle=""
| markerHeight=""
| markerUnits=""
| markerWidth=""
| maskContentUnits=""
| maskUnits=""
| numOctaves=""
| pathLength=""
| patternContentUnits=""
| patternTransform=""
| patternUnits=""
| pointsAtX=""
| pointsAtY=""
| pointsAtZ=""
| preserveAlpha=""
| preserveAspectRatio=""
| primitiveUnits=""
| refX=""
| refY=""
| repeatCount=""
| repeatDur=""
| requiredExtensions=""
| requiredFeatures=""
| specularConstant=""
| specularExponent=""
| spreadMethod=""
| startOffset=""
| stdDeviation=""
| stitchTiles=""
| surfaceScale=""
| systemLanguage=""
| tableValues=""
| targetX=""
| targetY=""
| textLength=""
| viewBox=""
| viewTarget=""
| xChannelSelector=""
| yChannelSelector=""
| zoomAndPan=""
#data
<!DOCTYPE html><body><math attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' diffuseConstant='' edgeMode='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></math>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <math math>
| attributename=""
| attributetype=""
| basefrequency=""
| baseprofile=""
| calcmode=""
| clippathunits=""
| diffuseconstant=""
| edgemode=""
| filterunits=""
| glyphref=""
| gradienttransform=""
| gradientunits=""
| kernelmatrix=""
| kernelunitlength=""
| keypoints=""
| keysplines=""
| keytimes=""
| lengthadjust=""
| limitingconeangle=""
| markerheight=""
| markerunits=""
| markerwidth=""
| maskcontentunits=""
| maskunits=""
| numoctaves=""
| pathlength=""
| patterncontentunits=""
| patterntransform=""
| patternunits=""
| pointsatx=""
| pointsaty=""
| pointsatz=""
| preservealpha=""
| preserveaspectratio=""
| primitiveunits=""
| refx=""
| refy=""
| repeatcount=""
| repeatdur=""
| requiredextensions=""
| requiredfeatures=""
| specularconstant=""
| specularexponent=""
| spreadmethod=""
| startoffset=""
| stddeviation=""
| stitchtiles=""
| surfacescale=""
| systemlanguage=""
| tablevalues=""
| targetx=""
| targety=""
| textlength=""
| viewbox=""
| viewtarget=""
| xchannelselector=""
| ychannelselector=""
| zoomandpan=""
#data
<!DOCTYPE html><body><svg contentScriptType='' contentStyleType='' externalResourcesRequired='' filterRes=''></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| contentscripttype=""
| contentstyletype=""
| externalresourcesrequired=""
| filterres=""
#data
<!DOCTYPE html><body><svg CONTENTSCRIPTTYPE='' CONTENTSTYLETYPE='' EXTERNALRESOURCESREQUIRED='' FILTERRES=''></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| contentscripttype=""
| contentstyletype=""
| externalresourcesrequired=""
| filterres=""
#data
<!DOCTYPE html><body><svg contentscripttype='' contentstyletype='' externalresourcesrequired='' filterres=''></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| contentscripttype=""
| contentstyletype=""
| externalresourcesrequired=""
| filterres=""
#data
<!DOCTYPE html><body><math contentScriptType='' contentStyleType='' externalResourcesRequired='' filterRes=''></math>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <math math>
| contentscripttype=""
| contentstyletype=""
| externalresourcesrequired=""
| filterres=""
#data
<!DOCTYPE html><body><svg><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg altGlyph>
| <svg altGlyphDef>
| <svg altGlyphItem>
| <svg animateColor>
| <svg animateMotion>
| <svg animateTransform>
| <svg clipPath>
| <svg feBlend>
| <svg feColorMatrix>
| <svg feComponentTransfer>
| <svg feComposite>
| <svg feConvolveMatrix>
| <svg feDiffuseLighting>
| <svg feDisplacementMap>
| <svg feDistantLight>
| <svg feFlood>
| <svg feFuncA>
| <svg feFuncB>
| <svg feFuncG>
| <svg feFuncR>
| <svg feGaussianBlur>
| <svg feImage>
| <svg feMerge>
| <svg feMergeNode>
| <svg feMorphology>
| <svg feOffset>
| <svg fePointLight>
| <svg feSpecularLighting>
| <svg feSpotLight>
| <svg feTile>
| <svg feTurbulence>
| <svg foreignObject>
| <svg glyphRef>
| <svg linearGradient>
| <svg radialGradient>
| <svg textPath>
#data
<!DOCTYPE html><body><svg><altglyph /><altglyphdef /><altglyphitem /><animatecolor /><animatemotion /><animatetransform /><clippath /><feblend /><fecolormatrix /><fecomponenttransfer /><fecomposite /><feconvolvematrix /><fediffuselighting /><fedisplacementmap /><fedistantlight /><feflood /><fefunca /><fefuncb /><fefuncg /><fefuncr /><fegaussianblur /><feimage /><femerge /><femergenode /><femorphology /><feoffset /><fepointlight /><fespecularlighting /><fespotlight /><fetile /><feturbulence /><foreignobject /><glyphref /><lineargradient /><radialgradient /><textpath /></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg altGlyph>
| <svg altGlyphDef>
| <svg altGlyphItem>
| <svg animateColor>
| <svg animateMotion>
| <svg animateTransform>
| <svg clipPath>
| <svg feBlend>
| <svg feColorMatrix>
| <svg feComponentTransfer>
| <svg feComposite>
| <svg feConvolveMatrix>
| <svg feDiffuseLighting>
| <svg feDisplacementMap>
| <svg feDistantLight>
| <svg feFlood>
| <svg feFuncA>
| <svg feFuncB>
| <svg feFuncG>
| <svg feFuncR>
| <svg feGaussianBlur>
| <svg feImage>
| <svg feMerge>
| <svg feMergeNode>
| <svg feMorphology>
| <svg feOffset>
| <svg fePointLight>
| <svg feSpecularLighting>
| <svg feSpotLight>
| <svg feTile>
| <svg feTurbulence>
| <svg foreignObject>
| <svg glyphRef>
| <svg linearGradient>
| <svg radialGradient>
| <svg textPath>
#data
<!DOCTYPE html><BODY><SVG><ALTGLYPH /><ALTGLYPHDEF /><ALTGLYPHITEM /><ANIMATECOLOR /><ANIMATEMOTION /><ANIMATETRANSFORM /><CLIPPATH /><FEBLEND /><FECOLORMATRIX /><FECOMPONENTTRANSFER /><FECOMPOSITE /><FECONVOLVEMATRIX /><FEDIFFUSELIGHTING /><FEDISPLACEMENTMAP /><FEDISTANTLIGHT /><FEFLOOD /><FEFUNCA /><FEFUNCB /><FEFUNCG /><FEFUNCR /><FEGAUSSIANBLUR /><FEIMAGE /><FEMERGE /><FEMERGENODE /><FEMORPHOLOGY /><FEOFFSET /><FEPOINTLIGHT /><FESPECULARLIGHTING /><FESPOTLIGHT /><FETILE /><FETURBULENCE /><FOREIGNOBJECT /><GLYPHREF /><LINEARGRADIENT /><RADIALGRADIENT /><TEXTPATH /></SVG>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg altGlyph>
| <svg altGlyphDef>
| <svg altGlyphItem>
| <svg animateColor>
| <svg animateMotion>
| <svg animateTransform>
| <svg clipPath>
| <svg feBlend>
| <svg feColorMatrix>
| <svg feComponentTransfer>
| <svg feComposite>
| <svg feConvolveMatrix>
| <svg feDiffuseLighting>
| <svg feDisplacementMap>
| <svg feDistantLight>
| <svg feFlood>
| <svg feFuncA>
| <svg feFuncB>
| <svg feFuncG>
| <svg feFuncR>
| <svg feGaussianBlur>
| <svg feImage>
| <svg feMerge>
| <svg feMergeNode>
| <svg feMorphology>
| <svg feOffset>
| <svg fePointLight>
| <svg feSpecularLighting>
| <svg feSpotLight>
| <svg feTile>
| <svg feTurbulence>
| <svg foreignObject>
| <svg glyphRef>
| <svg linearGradient>
| <svg radialGradient>
| <svg textPath>
#data
<!DOCTYPE html><body><math><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></math>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <math math>
| <math altglyph>
| <math altglyphdef>
| <math altglyphitem>
| <math animatecolor>
| <math animatemotion>
| <math animatetransform>
| <math clippath>
| <math feblend>
| <math fecolormatrix>
| <math fecomponenttransfer>
| <math fecomposite>
| <math feconvolvematrix>
| <math fediffuselighting>
| <math fedisplacementmap>
| <math fedistantlight>
| <math feflood>
| <math fefunca>
| <math fefuncb>
| <math fefuncg>
| <math fefuncr>
| <math fegaussianblur>
| <math feimage>
| <math femerge>
| <math femergenode>
| <math femorphology>
| <math feoffset>
| <math fepointlight>
| <math fespecularlighting>
| <math fespotlight>
| <math fetile>
| <math feturbulence>
| <math foreignobject>
| <math glyphref>
| <math lineargradient>
| <math radialgradient>
| <math textpath>
#data
<!DOCTYPE html><body><svg><solidColor /></svg>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <svg svg>
| <svg solidcolor>

View file

@ -0,0 +1,62 @@
#data
<!DOCTYPE html><body><p>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| "foo"
| <math math>
| <math mtext>
| <i>
| "baz"
| <math annotation-xml>
| <svg svg>
| <svg desc>
| <b>
| "eggs"
| <svg g>
| <svg foreignObject>
| <p>
| "spam"
| <table>
| <tbody>
| <tr>
| <td>
| <img>
| <svg g>
| "quux"
| "bar"
#data
<!DOCTYPE html><body>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "foo"
| <math math>
| <math mtext>
| <i>
| "baz"
| <math annotation-xml>
| <svg svg>
| <svg desc>
| <b>
| "eggs"
| <svg g>
| <svg foreignObject>
| <p>
| "spam"
| <table>
| <tbody>
| <tr>
| <td>
| <img>
| <svg g>
| "quux"
| "bar"

View file

@ -0,0 +1,75 @@
#data
<!DOCTYPE html><html><body><xyz:abc></xyz:abc>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <xyz:abc>
#data
<!DOCTYPE html><html><body><xyz:abc></xyz:abc><span></span>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <xyz:abc>
| <span>
#data
<!DOCTYPE html><html><html abc:def=gh><xyz:abc></xyz:abc>
#errors
(1,38): non-html-root
#document
| <!DOCTYPE html>
| <html>
| abc:def="gh"
| <head>
| <body>
| <xyz:abc>
#data
<!DOCTYPE html><html xml:lang=bar><html xml:lang=foo>
#errors
(1,53): non-html-root
#document
| <!DOCTYPE html>
| <html>
| xml:lang="bar"
| <head>
| <body>
#data
<!DOCTYPE html><html 123=456>
#errors
#document
| <!DOCTYPE html>
| <html>
| 123="456"
| <head>
| <body>
#data
<!DOCTYPE html><html 123=456><html 789=012>
#errors
(1,43): non-html-root
#document
| <!DOCTYPE html>
| <html>
| 123="456"
| 789="012"
| <head>
| <body>
#data
<!DOCTYPE html><html><body 789=012>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| 789="012"

View file

@ -0,0 +1,216 @@
#data
<!DOCTYPE html><p><b><i><u></p> <p>X
#errors
(1,31): unexpected-end-tag
(1,36): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <b>
| <i>
| <u>
| <b>
| <i>
| <u>
| " "
| <p>
| "X"
#data
<p><b><i><u></p>
<p>X
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): unexpected-end-tag
(2,4): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| <i>
| <u>
| <b>
| <i>
| <u>
| "
"
| <p>
| "X"
#data
<!doctype html></html> <head>
#errors
(1,29): expected-eof-but-got-start-tag
(1,29): unexpected-start-tag-ignored
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " "
#data
<!doctype html></body><meta>
#errors
(1,28): unexpected-start-tag-after-body
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <meta>
#data
<html></html><!-- foo -->
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <!-- foo -->
#data
<!doctype html></body><title>X</title>
#errors
(1,29): unexpected-start-tag-after-body
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <title>
| "X"
#data
<!doctype html><table> X<meta></table>
#errors
(1,23): foster-parenting-character
(1,24): foster-parenting-character
(1,30): foster-parenting-start-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " X"
| <meta>
| <table>
#data
<!doctype html><table> x</table>
#errors
(1,23): foster-parenting-character
(1,24): foster-parenting-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " x"
| <table>
#data
<!doctype html><table> x </table>
#errors
(1,23): foster-parenting-character
(1,24): foster-parenting-character
(1,25): foster-parenting-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " x "
| <table>
#data
<!doctype html><table><tr> x</table>
#errors
(1,27): foster-parenting-character
(1,28): foster-parenting-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " x"
| <table>
| <tbody>
| <tr>
#data
<!doctype html><table>X<style> <tr>x </style> </table>
#errors
(1,23): foster-parenting-character
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "X"
| <table>
| <style>
| " <tr>x "
| " "
#data
<!doctype html><div><table><a>foo</a> <tr><td>bar</td> </tr></table></div>
#errors
(1,30): foster-parenting-start-tag
(1,31): foster-parenting-character
(1,32): foster-parenting-character
(1,33): foster-parenting-character
(1,37): foster-parenting-end-tag
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <div>
| <a>
| "foo"
| <table>
| " "
| <tbody>
| <tr>
| <td>
| "bar"
| " "
#data
<frame></frame></frame><frameset><frame><frameset><frame></frameset><noframes></frameset><noframes>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,7): unexpected-start-tag-ignored
(1,15): unexpected-end-tag
(1,23): unexpected-end-tag
(1,33): unexpected-start-tag
(1,99): expected-named-closing-tag-but-got-eof
(1,99): eof-in-frameset
#document
| <html>
| <head>
| <frameset>
| <frame>
| <frameset>
| <frame>
| <noframes>
| "</frameset><noframes>"
#data
<!DOCTYPE html><object></html>
#errors
(1,30): expected-body-in-scope
(1,30): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <object>

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,179 @@
#data
<!doctype html><table><tbody><select><tr>
#errors
(1,37): unexpected-start-tag-implies-table-voodoo
(1,41): unexpected-table-element-start-tag-in-select-in-table
(1,41): eof-in-table
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
| <table>
| <tbody>
| <tr>
#data
<!doctype html><table><tr><select><td>
#errors
(1,34): unexpected-start-tag-implies-table-voodoo
(1,38): unexpected-table-element-start-tag-in-select-in-table
(1,38): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
| <table>
| <tbody>
| <tr>
| <td>
#data
<!doctype html><table><tr><td><select><td>
#errors
(1,42): unexpected-table-element-start-tag-in-select-in-table
(1,42): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| <select>
| <td>
#data
<!doctype html><table><tr><th><select><td>
#errors
(1,42): unexpected-table-element-start-tag-in-select-in-table
(1,42): expected-closing-tag-but-got-eof
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <th>
| <select>
| <td>
#data
<!doctype html><table><caption><select><tr>
#errors
(1,43): unexpected-table-element-start-tag-in-select-in-table
(1,43): eof-in-table
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <caption>
| <select>
| <tbody>
| <tr>
#data
<!doctype html><select><tr>
#errors
(1,27): unexpected-start-tag-in-select
(1,27): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><td>
#errors
(1,27): unexpected-start-tag-in-select
(1,27): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><th>
#errors
(1,27): unexpected-start-tag-in-select
(1,27): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><tbody>
#errors
(1,30): unexpected-start-tag-in-select
(1,30): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><thead>
#errors
(1,30): unexpected-start-tag-in-select
(1,30): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><tfoot>
#errors
(1,30): unexpected-start-tag-in-select
(1,30): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><select><caption>
#errors
(1,32): unexpected-start-tag-in-select
(1,32): eof-in-select
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <select>
#data
<!doctype html><table><tr></table>a
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| "a"

Some files were not shown because too many files have changed in this diff Show more