diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index aec2d340..d619fb2c 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -1,401 +1,131 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals -import logging -import re +import packaging.version -import html5lib -from html5lib.sanitizer import HTMLSanitizer -from html5lib.serializer.htmlserializer import HTMLSerializer - -from . import callbacks as linkify_callbacks -from .encoding import force_unicode -from .sanitizer import BleachSanitizer +from bleach.linkifier import ( + DEFAULT_CALLBACKS, + Linker, +) +from bleach.sanitizer import ( + ALLOWED_ATTRIBUTES, + ALLOWED_PROTOCOLS, + ALLOWED_STYLES, + ALLOWED_TAGS, + Cleaner, +) -VERSION = (1, 4, 2) -__version__ = '.'.join([str(n) for n in VERSION]) - -__all__ = ['clean', 'linkify'] - -log = logging.getLogger('bleach') - -ALLOWED_TAGS = [ - 'a', - 'abbr', - 'acronym', - 'b', - 'blockquote', - 'code', - 'em', - 'i', - 'li', - 'ol', - 'strong', - 'ul', -] - -ALLOWED_ATTRIBUTES = { - 'a': ['href', 'title'], - 'abbr': ['title'], - 'acronym': ['title'], -} - -ALLOWED_STYLES = [] - -ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] - -TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az - ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat - cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk - dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg - gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il - im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp - kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk - ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne - net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post - pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl - sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to - tp tr travel tt 
tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws - xn xxx ye yt yu za zm zw""".split() - -# Make sure that .com doesn't get matched by .co first -TLDS.reverse() - -PROTOCOLS = HTMLSanitizer.acceptable_protocols - -url_re = re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? - # /path/zz (excluding "unsafe" chars from RFC 1738, - # except for # and ~, which happen in practice) - """.format('|'.join(PROTOCOLS), '|'.join(TLDS)), - re.IGNORECASE | re.VERBOSE | re.UNICODE) - -proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE) - -punct_re = re.compile(r'([\.,]+)$') - -email_re = re.compile( - r"""(? tag replaced by the text within it - adj = replace_nodes(tree, _text, node, - current_child) - current_child -= 1 - # pull back current_child by 1 to scan the - # new nodes again. - else: - text = force_unicode(attrs.pop('_text')) - for attr_key, attr_val in attrs.items(): - node.set(attr_key, attr_val) - - for n in reversed(list(node)): - node.remove(n) - text = parser.parseFragment(text) - node.text = text.text - for n in text: - node.append(n) - _seen.add(node) - - elif current_child >= 0: - if node.tag == ETREE_TAG('pre') and skip_pre: - linkify_nodes(node, False) - elif not (node in _seen): - linkify_nodes(node, True) - - current_child += 1 - - def email_repl(match): - addr = match.group(0).replace('"', '"') - link = { - '_text': addr, - 'href': 'mailto:{0!s}'.format(addr), - } - link = apply_callbacks(link, True) - - if link is None: - return addr - - _href = link.pop('href') - _text = link.pop('_text') - - repl = '{2!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - return repl.format(_href, attribs, _text) - - def link_repl(match): - url = match.group(0) - open_brackets = close_brackets = 0 - if url.startswith('('): - _wrapping = strip_wrapping_parentheses(url) - url, open_brackets, close_brackets = _wrapping - end = '' - m = re.search(punct_re, url) - if m: - end = m.group(0) - url = 
url[0:m.start()] - if re.search(proto_re, url): - href = url - else: - href = ''.join(['http://', url]) - - link = { - '_text': url, - 'href': href, - } - - link = apply_callbacks(link, True) - - if link is None: - return '(' * open_brackets + url + ')' * close_brackets - - _text = link.pop('_text') - _href = link.pop('href') - - repl = '{0!s}{3!s}{4!s}{5!s}' - attr = '{0!s}="{1!s}"' - attribs = ' '.join(attr.format(k, v) for k, v in link.items()) - - return repl.format('(' * open_brackets, - _href, attribs, _text, end, - ')' * close_brackets) - - try: - linkify_nodes(forest) - except RuntimeError as e: - # If we hit the max recursion depth, just return what we've got. - log.exception('Probable recursion error: {0!r}'.format(e)) - - return _render(forest) - - -def _render(tree): - """Try rendering as HTML, then XML, then give up.""" - return force_unicode(_serialize(tree)) - - -def _serialize(domtree): - walker = html5lib.treewalkers.getTreeWalker('etree') - stream = walker(domtree) - serializer = HTMLSerializer(quote_attr_values=True, - alphabetical_attributes=True, - omit_optional_tags=False) - return serializer.render(stream) + linker = Linker(callbacks=callbacks, skip_tags=skip_tags, parse_email=parse_email) + return linker.linkify(text) diff --git a/lib/bleach/_vendor/README.rst b/lib/bleach/_vendor/README.rst new file mode 100644 index 00000000..e53aede0 --- /dev/null +++ b/lib/bleach/_vendor/README.rst @@ -0,0 +1,61 @@ +======================= +Vendored library policy +======================= + +To simplify Bleach development, we're now vendoring certain libraries that +we use. + +Vendored libraries must follow these rules: + +1. Vendored libraries must be pure Python--no compiling. +2. Source code for the library is included in this directory. +3. License must be included in this repo and in the Bleach distribution. +4. Requirements of the library become requirements of Bleach. +5. No modifications to the library may be made.
+ + +Adding/Updating a vendored library +================================== + +To vendor a library or update a version: + +1. Update ``vendor.txt`` with the library, version, and hash. You can use + `hashin <https://pypi.org/project/hashin/>`_. +2. Remove all old files and directories of the old version. +3. Run ``pip_install_vendor.sh`` and check in everything it produced, including + the ``.dist-info`` directory and contents. +4. Update the bleach minor version in the next release. + + +Reviewing a change involving a vendored library +=============================================== + +To verify a vendored library addition/update: + +1. Pull down the branch. +2. Delete all the old files and directories of the old version. +3. Run ``pip_install_vendor.sh``. +4. Run ``git diff`` and verify there are no changes. + + +NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be necessary to reproduce the dist-info. + + +Removing/Unvendoring a vendored library +======================================= + +A vendored library might be removed for any of the following reasons: + +* it violates the vendoring policy (e.g. an incompatible license + change) +* a suitable replacement is found +* bleach has the resources to test and QA new bleach releases against + multiple versions of the previously vendored library + +To unvendor a library: + +1. Remove the library and its hashes from ``vendor.txt``. +2. Remove library files and directories from this directory. +3. Run ``pip_install_vendor.sh`` and check that the previously vendored library, including + the ``.dist-info`` directory and contents, is not installed. +4. Update the bleach minor version in the next release.
diff --git a/lib/bleach/_vendor/__init__.py b/lib/bleach/_vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst b/lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst new file mode 100644 index 00000000..90401390 --- /dev/null +++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/AUTHORS.rst @@ -0,0 +1,66 @@ +Credits +======= + +``html5lib`` is written and maintained by: + +- James Graham +- Sam Sneddon +- Łukasz Langa +- Will Kahn-Greene + + +Patches and suggestions +----------------------- +(In chronological order, by first commit:) + +- Anne van Kesteren +- Lachlan Hunt +- lantis63 +- Sam Ruby +- Thomas Broyer +- Tim Fletcher +- Mark Pilgrim +- Ryan King +- Philip Taylor +- Edward Z. Yang +- fantasai +- Philip Jägenstedt +- Ms2ger +- Mohammad Taha Jahangir +- Andy Wingo +- Andreas Madsack +- Karim Valiev +- Juan Carlos Garcia Segovia +- Mike West +- Marc DM +- Simon Sapin +- Michael[tm] Smith +- Ritwik Gupta +- Marc Abramowitz +- Tony Lopes +- lilbludevil +- Kevin +- Drew Hubl +- Austin Kumbera +- Jim Baker +- Jon Dufresne +- Donald Stufft +- Alex Gaynor +- Nik Nyby +- Jakub Wilk +- Sigmund Cherem +- Gabi Davar +- Florian Mounier +- neumond +- Vitalik Verhovodov +- Kovid Goyal +- Adam Chainz +- John Vandenberg +- Eric Amorde +- Benedikt Morbach +- Jonathan Vanasco +- Tom Most +- Ville Skyttä +- Hugo van Kemenade +- Mark Vasilkov + diff --git a/lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER b/lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER new file mode 100644 index 00000000..a1b589e3 --- /dev/null +++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA b/lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA new file mode 100644 index 00000000..ee83c1f8 --- /dev/null +++ b/lib/bleach/_vendor/html5lib-1.1.dist-info/METADATA @@ -0,0 +1,552 @@ +Metadata-Version: 2.1 +Name: html5lib +Version: 1.1 +Summary: 
HTML parser based on the WHATWG HTML specification +Home-page: https://github.com/html5lib/html5lib-python +Maintainer: James Graham +Maintainer-email: james@hoppipolla.co.uk +License: MIT License +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Text Processing :: Markup :: HTML +Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.* +Requires-Dist: six (>=1.9) +Requires-Dist: webencodings +Provides-Extra: all +Requires-Dist: genshi ; extra == 'all' +Requires-Dist: chardet (>=2.2) ; extra == 'all' +Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all' +Provides-Extra: chardet +Requires-Dist: chardet (>=2.2) ; extra == 'chardet' +Provides-Extra: genshi +Requires-Dist: genshi ; extra == 'genshi' +Provides-Extra: lxml +Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml' + +html5lib +======== + +.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master + :target: https://travis-ci.org/html5lib/html5lib-python + + +html5lib is a pure-python library for parsing HTML. It is designed to +conform to the WHATWG HTML specification, as is implemented by all major +web browsers. 
+ + +Usage +----- + +Simple usage follows this pattern: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + document = html5lib.parse(f) + +or: + +.. code-block:: python + + import html5lib + document = html5lib.parse("

Hello World!") + +By default, the ``document`` will be an ``xml.etree`` element instance. +Whenever possible, html5lib chooses the accelerated ``ElementTree`` +implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x). + +Two other tree types are supported: ``xml.dom.minidom`` and +``lxml.etree``. To use an alternative format, specify the name of +a treebuilder: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + lxml_etree_document = html5lib.parse(f, treebuilder="lxml") + +When using with ``urllib2`` (Python 2), the charset from HTTP should be +pass into html5lib as follows: + +.. code-block:: python + + from contextlib import closing + from urllib2 import urlopen + import html5lib + + with closing(urlopen("http://example.com/")) as f: + document = html5lib.parse(f, transport_encoding=f.info().getparam("charset")) + +When using with ``urllib.request`` (Python 3), the charset from HTTP +should be pass into html5lib as follows: + +.. code-block:: python + + from urllib.request import urlopen + import html5lib + + with urlopen("http://example.com/") as f: + document = html5lib.parse(f, transport_encoding=f.info().get_content_charset()) + +To have more control over the parser, create a parser object explicitly. +For instance, to make the parser raise exceptions on parse errors, use: + +.. code-block:: python + + import html5lib + with open("mydocument.html", "rb") as f: + parser = html5lib.HTMLParser(strict=True) + document = parser.parse(f) + +When you're instantiating parser objects explicitly, pass a treebuilder +class as the ``tree`` keyword argument to use an alternative document +format: + +.. code-block:: python + + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + minidom_document = parser.parse("

Hello World!") + +More documentation is available at https://html5lib.readthedocs.io/. + + +Installation +------------ + +html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install: + +.. code-block:: bash + + $ pip install html5lib + +The goal is to support a (non-strict) superset of the versions that `pip +supports +`_. + +Optional Dependencies +--------------------- + +The following third-party libraries may be used for additional +functionality: + +- ``lxml`` is supported as a tree format (for both building and + walking) under CPython (but *not* PyPy where it is known to cause + segfaults); + +- ``genshi`` has a treewalker (but not builder); and + +- ``chardet`` can be used as a fallback when character encoding cannot + be determined. + + +Bugs +---- + +Please report any bugs on the `issue tracker +`_. + + +Tests +----- + +Unit tests require the ``pytest`` and ``mock`` libraries and can be +run using the ``py.test`` command in the root directory. + +Test data are contained in a separate `html5lib-tests +`_ repository and included +as a submodule, thus for git checkouts they must be initialized:: + + $ git submodule init + $ git submodule update + +If you have all compatible Python implementations available on your +system, you can run tests on all of them using the ``tox`` utility, +which can be found on PyPI. + + +Questions? +---------- + +There's a mailing list available for support on Google Groups, +`html5lib-discuss `_, +though you may get a quicker response asking on IRC in `#whatwg on +irc.freenode.net `_. + +Change Log +---------- + +1.1 +~~~ + +UNRELEASED + +Breaking changes: + +* Drop support for Python 3.3. (#358) +* Drop support for Python 3.4. (#421) + +Deprecations: + +* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and + ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach + `. Please let us know if Bleach doesn't suffice for your + use. 
(#443) + +Other changes: + +* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure + ``html5lib`` keeps working in future Python versions. (#403) +* Drop optional ``datrie`` dependency. (#442) + + +1.0.1 +~~~~~ + +Released on December 7, 2017 + +Breaking changes: + +* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) +* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + +Features: + +* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) +* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!) +* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) +* Support Python 3.6. (#333) (Thank you, Jon Dufresne!) +* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) +* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!) +* Semver-compliant version number. + +Bug fixes: + +* Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) +* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) +* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) +* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) +* Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) +* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) +* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + +1.0 +~~~ + +Released and unreleased on December 7, 2017. Badly packaged release. + + +0.999999999/1.0b10 +~~~~~~~~~~~~~~~~~~ + +Released on July 15, 2016 + +* Fix attribute order going to the tree builder to be document order + instead of reverse document order(!). 
+ + +0.99999999/1.0b9 +~~~~~~~~~~~~~~~~ + +Released on July 14, 2016 + +* **Added ordereddict as a mandatory dependency on Python 2.6.** + +* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all`` + extras that will do the right thing based on the specific + interpreter implementation. + +* Now requires the ``mock`` package for the testsuite. + +* Cease supporting DATrie under PyPy. + +* **Remove PullDOM support, as this hasn't ever been properly + tested, doesn't entirely work, and as far as I can tell is + completely unused by anyone.** + +* Move testsuite to ``py.test``. + +* **Fix #124: move to webencodings for decoding the input byte stream; + this makes html5lib compliant with the Encoding Standard, and + introduces a required dependency on webencodings.** + +* **Cease supporting Python 3.2 (in both CPython and PyPy forms).** + +* **Fix comments containing double-dash with lxml 3.5 and above.** + +* **Use scripting disabled by default (as we don't implement + scripting).** + +* **Fix #11, avoiding the XSS bug potentially caused by serializer + allowing attribute values to be escaped out of in old browser versions, + changing the quote_attr_values option on serializer to take one of + three values, "always" (the old True value), "legacy" (the new option, + and the new default), and "spec" (the old False value, and the old + default).** + +* **Fix #72 by rewriting the sanitizer to apply only to treewalkers + (instead of the tokenizer); as such, this will require amending all + callers of it to use it via the treewalker API.** + +* **Drop support of charade, now that chardet is supported once more.** + +* **Replace the charset keyword argument on parse and related methods + with a set of keyword arguments: override_encoding, transport_encoding, + same_origin_parent_encoding, likely_encoding, and default_encoding.** + +* **Move filters._base, treebuilder._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the 
sanitizer package. Merge sanitizer.sanitize into the + sanitizer.htmlsanitizer module and move that to sanitizer. This means + anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. + + +0.9999999/1.0b8 +~~~~~~~~~~~~~~~ + +Released on September 10, 2015 + +* Fix #195: fix the sanitizer to drop broken URLs (it threw an + exception between 0.9999 and 0.999999). + + +0.999999/1.0b7 +~~~~~~~~~~~~~~ + +Released on July 7, 2015 + +* Fix #189: fix the sanitizer to allow relative URLs again (as it did + prior to 0.9999/1.0b5). + + +0.99999/1.0b6 +~~~~~~~~~~~~~ + +Released on April 30, 2015 + +* Fix #188: fix the sanitizer to not throw an exception when sanitizing + bogus data URLs. + + +0.9999/1.0b5 +~~~~~~~~~~~~ + +Released on April 29, 2015 + +* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how + this sounds, this has no known security implications. No known version + of IE (5.5 to current), Firefox (3 to current), Safari (6 to current), + Chrome (1 to current), or Opera (12 to current) will run any script + provided in these attributes. + +* Pass error message to the ParseError exception in strict parsing mode. + +* Allow data URIs in the sanitizer, with a whitelist of content-types. + +* Add support for Python implementations that don't support lone + surrogates (read: Jython). Fixes #2. + +* Remove localization of error messages. This functionality was totally + unused (and untested that everything was localizable), so we may as + well follow numerous browsers in not supporting translating technical + strings. + +* Expose treewalkers.pprint as a public API. + +* Add a documentEncoding property to HTML5Parser, fix #121. 
+ + +0.999 +~~~~~ + +Released on December 23, 2013 + +* Fix #127: add work-around for CPython issue #20007: .read(0) on + http.client.HTTPResponse drops the rest of the content. + +* Fix #115: lxml treewalker can now deal with fragments containing, at + their root level, text nodes with non-ASCII characters on Python 2. + + +0.99 +~~~~ + +Released on September 10, 2013 + +* No library changes from 1.0b3; released as 0.99 as pip has changed + behaviour from 1.4 to avoid installing pre-release versions per + PEP 440. + + +1.0b3 +~~~~~ + +Released on July 24, 2013 + +* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any + implementation using it should be moved to + ``NonRecursiveTreeWalker``, as everything bundled with html5lib has + for years. + +* Fix #67 so that ``BufferedStream`` to correctly returns a bytes + object, thereby fixing any case where html5lib is passed a + non-seekable RawIOBase-like object. + + +1.0b2 +~~~~~ + +Released on June 27, 2013 + +* Removed reordering of attributes within the serializer. There is now + an ``alphabetical_attributes`` option which preserves the previous + behaviour through a new filter. This allows attribute order to be + preserved through html5lib if the tree builder preserves order. + +* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by + ``treeadapters.sax.to_sax`` which is generic and supports any + treewalker; it also resolves all known bugs with ``dom2sax``. + +* Fix treewalker assertions on hitting bytes strings on + Python 2. Previous to 1.0b1, treewalkers coped with mixed + bytes/unicode data on Python 2; this reintroduces this prior + behaviour on Python 2. Behaviour is unchanged on Python 3. + + +1.0b1 +~~~~~ + +Released on May 17, 2013 + +* Implementation updated to implement the `HTML specification + `_ as of 5th May + 2013 (`SVN `_ revision r7867). + +* Python 3.2+ supported in a single codebase using the ``six`` library. + +* Removed support for Python 2.5 and older. 
+ +* Removed the deprecated Beautiful Soup 3 treebuilder. + ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that + since it doesn't support namespaces, foreign content like SVG and + MathML is parsed incorrectly. + +* Removed ``simpletree`` from the package. The default tree builder is + now ``etree`` (using the ``xml.etree.cElementTree`` implementation if + available, and ``xml.etree.ElementTree`` otherwise). + +* Removed the ``XHTMLSerializer`` as it never actually guaranteed its + output was well-formed XML, and hence provided little of use. + +* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no + longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will + return the default DOM treebuilder, which uses ``xml.dom.minidom``. + +* Optional heuristic character encoding detection now based on + ``charade`` for Python 2.6 - 3.3 compatibility. + +* Optional ``Genshi`` treewalker support fixed. + +* Many bugfixes, including: + + * #33: null in attribute value breaks XML AttValue; + + * #4: nested, indirect descendant,