diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index fcc27457..2a436d34 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -1,6 +1,5 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
+"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
+
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
@@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
+documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.1"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.10.0"
+__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
+
+from collections import Counter
import os
import re
import sys
import traceback
import warnings
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+ raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
@@ -42,28 +46,49 @@ from .element import (
NavigableString,
PageElement,
ProcessingInstruction,
+ PYTHON_SPECIFIC_ENCODINGS,
ResultSet,
+ Script,
+ Stylesheet,
SoupStrainer,
Tag,
+ TemplateString,
)
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+ """The warning issued when BeautifulSoup has to guess what parser to
+ use -- probably because no parser was specified in the constructor.
+ """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+ """The warning issued when BeautifulSoup is given 'markup' that
+ actually looks like a resource locator -- a URL or a path to a file
+ on disk.
+ """
+
class BeautifulSoup(Tag):
- """
- This class defines the basic interface called by the tree builders.
+ """A data structure representing a parsed HTML or XML document.
- These methods will be called by the parser:
- reset()
- feed(markup)
+ Most of the methods you'll call on a BeautifulSoup object are inherited from
+ PageElement or Tag.
+
+ Internally, this class defines the basic interface called by the
+ tree builders when converting an HTML/XML document into a data
+ structure. The interface abstracts away the differences between
+ parsers. To write a new tree builder, you'll need to understand
+ these methods as a whole.
+
+ These methods will be called by the BeautifulSoup constructor:
+ * reset()
+ * feed(markup)
The tree builder may call these methods from its feed() implementation:
- handle_starttag(name, attrs) # See note about return value
- handle_endtag(name)
- handle_data(data) # Appends to the current data node
- endData(containerClass) # Ends the current data node
+ * handle_starttag(name, attrs) # See note about return value
+ * handle_endtag(name)
+ * handle_data(data) # Appends to the current data node
+ * endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
+
+ # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+ # a Tag with a .name. This name makes it clear the BeautifulSoup
+ # object isn't a real markup tag.
ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
+
+ # A string containing all ASCII whitespace characters, used in
+ # endData() to detect data chunks that seem 'empty'.
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
- markup to be parsed.
+ markup to be parsed.
- :param features: Desirable features of the parser to be used. This
- may be the name of a specific parser ("lxml", "lxml-xml",
- "html.parser", or "html5lib") or it may be the type of markup
- to be used ("html", "html5", "xml"). It's recommended that you
- name a specific parser, so that Beautiful Soup gives you the
- same results across platforms and virtual environments.
+ :param features: Desirable features of the parser to be
+ used. This may be the name of a specific parser ("lxml",
+ "lxml-xml", "html.parser", or "html5lib") or it may be the
+ type of markup to be used ("html", "html5", "xml"). It's
+ recommended that you name a specific parser, so that
+ Beautiful Soup gives you the same results across platforms
+ and virtual environments.
:param builder: A TreeBuilder subclass to instantiate (or
- instance to use) instead of looking one up based on
- `features`. You only need to use this if you've implemented a
- custom TreeBuilder.
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
- matching the SoupStrainer will be considered. This is useful
- when parsing part of a document that would otherwise be too
- large to fit into memory.
+ matching the SoupStrainer will be considered. This is useful
+ when parsing part of a document that would otherwise be too
+ large to fit into memory.
:param from_encoding: A string indicating the encoding of the
- document to be parsed. Pass this in if Beautiful Soup is
- guessing wrongly about the document's encoding.
+ document to be parsed. Pass this in if Beautiful Soup is
+ guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
- encodings known to be wrong. Pass this in if you don't know
- the document's encoding but you know Beautiful Soup's guess is
- wrong.
+ encodings known to be wrong. Pass this in if you don't know
+ the document's encoding but you know Beautiful Soup's guess is
+ wrong.
:param element_classes: A dictionary mapping BeautifulSoup
- classes like Tag and NavigableString to other classes you'd
- like to be instantiated instead as the parse tree is
- built. This is useful for using subclasses to modify the
- default behavior of Tag or NavigableString.
+ classes like Tag and NavigableString, to other classes you'd
+ like to be instantiated instead as the parse tree is
+ built. This is useful for subclassing Tag or NavigableString
+ to modify default behavior.
:param kwargs: For backwards compatibility purposes, the
- constructor accepts certain keyword arguments used in
- Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4; they will result in a warning and then be ignored.
-
- Apart from this, any keyword arguments passed into the BeautifulSoup
- constructor are propagated to the TreeBuilder constructor. This
- makes it possible to configure a TreeBuilder beyond saying
- which one to use.
-
+ constructor accepts certain keyword arguments used in
+ Beautiful Soup 3. None of these arguments do anything in
+ Beautiful Soup 4; they will result in a warning and then be
+ ignored.
+
+ Apart from this, any keyword arguments passed into the
+ BeautifulSoup constructor are propagated to the TreeBuilder
+ constructor. This makes it possible to configure a
+ TreeBuilder by passing in arguments, not just by saying which
+ one to use.
"""
-
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
@@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
- ):
+ ) and markup:
+ # The user did not tell us which TreeBuilder to use,
+ # and we had to guess. Issue a warning.
if builder.is_xml:
markup_type = "XML"
else:
@@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
parser=builder.NAME,
markup_type=markup_type
)
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+ warnings.warn(
+ self.NO_PARSER_SPECIFIED_WARNING % values,
+ GuessedAtParserWarning, stacklevel=2
+ )
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
else:
possible_filename = markup
is_file = False
+ is_directory = False
try:
is_file = os.path.exists(possible_filename)
+ if is_file:
+ is_directory = os.path.isdir(possible_filename)
except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
- if is_file:
- if isinstance(markup, str):
- markup = markup.encode("utf8")
+ if is_directory:
+ warnings.warn(
+ '"%s" looks like a directory name, not markup. You may'
+ ' want to open a file found in this directory and pass'
+ ' the filehandle into Beautiful Soup.' % (
+ self._decode_markup(markup)
+ ),
+ MarkupResemblesLocatorWarning
+ )
+ elif is_file:
warnings.warn(
'"%s" looks like a filename, not markup. You should'
' probably open this file and pass the filehandle into'
- ' Beautiful Soup.' % markup)
+ ' Beautiful Soup.' % self._decode_markup(markup),
+ MarkupResemblesLocatorWarning
+ )
self._check_markup_is_url(markup)
rejections = []
@@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
self.builder.soup = None
def __copy__(self):
+ """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
@@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
d['builder'] = None
return d
- @staticmethod
- def _check_markup_is_url(markup):
- """
- Check if markup looks like it's actually a url and raise a warning
- if so. Markup can be unicode or str (py2) / bytes (py3).
+ @classmethod
+ def _decode_markup(cls, markup):
+ """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+ TODO: warnings.warn had this problem back in 2010 but it might not
+ anymore.
+ """
+ if isinstance(markup, bytes):
+ decoded = markup.decode('utf-8', 'replace')
+ else:
+ decoded = markup
+ return decoded
+
+ @classmethod
+ def _check_markup_is_url(cls, markup):
+ """Error-handling method to raise a warning if incoming markup looks
+ like a URL.
+
+ :param markup: A string.
"""
if isinstance(markup, bytes):
space = b' '
@@ -364,18 +428,20 @@ class BeautifulSoup(Tag):
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
- if isinstance(markup, bytes):
- decoded_markup = markup.decode('utf-8', 'replace')
- else:
- decoded_markup = markup
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an'
' HTTP client. You should probably use an HTTP client like'
' requests to get the document behind the URL, and feed'
- ' that document to Beautiful Soup.' % decoded_markup
+ ' that document to Beautiful Soup.' % cls._decode_markup(
+ markup
+ ),
+ MarkupResemblesLocatorWarning
)
def _feed(self):
+ """Internal method that parses previously set markup, creating a large
+ number of Tag and NavigableString objects.
+ """
# Convert the document to Unicode.
self.builder.reset()
@@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
self.popTag()
def reset(self):
+ """Reset this object to a state as though it had never parsed any
+ markup.
+ """
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.current_data = []
self.currentTag = None
self.tagStack = []
+ self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
+ self.string_container_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
sourceline=None, sourcepos=None, **kwattrs):
- """Create a new tag associated with this soup."""
+ """Create a new Tag associated with this BeautifulSoup object.
+
+ :param name: The name of the new Tag.
+ :param namespace: The URI of the new Tag's XML namespace, if any.
+ :param prefix: The prefix for the new Tag's XML namespace, if any.
+ :param attrs: A dictionary of this Tag's attribute values; can
+ be used instead of `kwattrs` for attributes like 'class'
+ that are reserved words in Python.
+ :param sourceline: The line number where this tag was
+ (purportedly) found in its source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was (purportedly) found.
+ :param kwattrs: Keyword arguments for the new Tag's attribute values.
+
+ """
kwattrs.update(attrs)
return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
sourceline=sourceline, sourcepos=sourcepos
)
- def new_string(self, s, subclass=None):
- """Create a new NavigableString associated with this soup."""
- subclass = subclass or self.element_classes.get(
- NavigableString, NavigableString
+ def string_container(self, base_class=None):
+ container = base_class or NavigableString
+
+ # There may be a general override of NavigableString.
+ container = self.element_classes.get(
+ container, container
)
- return subclass(s)
- def insert_before(self, successor):
+ # On top of that, we may be inside a tag that needs a special
+ # container class.
+ if self.string_container_stack and container is NavigableString:
+ container = self.builder.string_containers.get(
+ self.string_container_stack[-1].name, container
+ )
+ return container
+
+ def new_string(self, s, subclass=None):
+ """Create a new NavigableString associated with this BeautifulSoup
+ object.
+ """
+ container = self.string_container(subclass)
+ return container(s)
+
+ def insert_before(self, *args):
+ """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+ it because there is nothing before or after it in the parse tree.
+ """
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
- def insert_after(self, successor):
+ def insert_after(self, *args):
+ """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+ it because there is nothing before or after it in the parse tree.
+ """
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
+ """Internal method called by _popToTag when a tag is closed."""
tag = self.tagStack.pop()
+ if tag.name in self.open_tag_counter:
+ self.open_tag_counter[tag.name] -= 1
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
- #print "Pop", tag.name
+ if self.string_container_stack and tag == self.string_container_stack[-1]:
+ self.string_container_stack.pop()
+ #print("Pop", tag.name)
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
- #print "Push", tag.name
+ """Internal method called by handle_starttag when a tag is opened."""
+ #print("Push", tag.name)
if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
+ if tag.name != self.ROOT_TAG_NAME:
+ self.open_tag_counter[tag.name] += 1
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
+ if tag.name in self.builder.string_containers:
+ self.string_container_stack.append(tag)
def endData(self, containerClass=None):
-
- # Default container is NavigableString.
- containerClass = containerClass or NavigableString
-
- # The user may want us to instantiate some alias for the
- # container class.
- containerClass = self.element_classes.get(
- containerClass, containerClass
- )
-
+ """Method called by the TreeBuilder when the end of a data segment
+ occurs.
+ """
if self.current_data:
current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
@@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
not self.parse_only.search(current_data)):
return
+ containerClass = self.string_container(containerClass)
o = containerClass(current_data)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, most_recent_element=None):
- """Add an object to the parse tree."""
+ """Method called by the TreeBuilder to integrate an object into the parse tree."""
if parent is None:
parent = self.currentTag
if most_recent_element is not None:
@@ -545,10 +656,19 @@ class BeautifulSoup(Tag):
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
- stack up to but *not* including the most recent instqance of
- the given tag."""
- #print "Popping to %s" % name
+ instance of the given tag.
+
+ If there are no open tags with the given name, nothing will be
+ popped.
+
+ :param name: Pop up to the most recent tag with this name.
+ :param nsprefix: The namespace prefix that goes with `name`.
+ :param inclusivePop: If this is false, pops the tag stack up
+ to but *not* including the most recent instance of the
+ given tag.
+
+ """
+ #print("Popping to %s" % name)
if name == self.ROOT_TAG_NAME:
# The BeautifulSoup object itself can never be popped.
return
@@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
stack_size = len(self.tagStack)
for i in range(stack_size - 1, 0, -1):
+ if not self.open_tag_counter.get(name):
+ break
t = self.tagStack[i]
if (name == t.name and nsprefix == t.prefix):
if inclusivePop:
@@ -568,15 +690,22 @@ class BeautifulSoup(Tag):
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
sourcepos=None):
- """Push a start tag on to the stack.
+ """Called by the tree builder when a new tag is encountered.
- If this method returns None, the tag was rejected by the
+ :param name: Name of the tag.
+ :param nsprefix: Namespace prefix for the tag.
+ :param attrs: A dictionary of attribute values.
+ :param sourceline: The line number where this tag was found in its
+ source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was found.
+
+ If this method returns None, the tag was rejected by an active
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
-
- # print "Start tag %s: %s" % (name, attrs)
+ # print("Start tag %s: %s" % (name, attrs))
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
@@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
return tag
def handle_endtag(self, name, nsprefix=None):
- #print "End tag: " + name
+ """Called by the tree builder when an ending tag is encountered.
+
+ :param name: Name of the tag.
+ :param nsprefix: Namespace prefix for the tag.
+ """
+ #print("End tag: " + name)
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
+ """Called by the tree builder when a chunk of textual data is encountered."""
self.current_data.append(data)
-
+
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Returns a string or Unicode representation of this document.
- To get Unicode, pass None for encoding."""
+ """Returns a string or Unicode representation of the parse tree
+ as an HTML or XML document.
+ :param pretty_print: If this is True, indentation will be used to
+ make the document more readable.
+ :param eventual_encoding: The encoding of the final document.
+ If this is None, the document will be a Unicode string.
+ """
if self.is_xml:
# Print the XML declaration
encoding_part = ''
+ if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+ # This is a special Python encoding; it can't actually
+ # go into an XML document because it means nothing
+ # outside of Python.
+ eventual_encoding = None
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
@@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
@@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):
class StopParsing(Exception):
+ """Exception raised by a TreeBuilder if it's unable to continue parsing."""
pass
class FeatureNotFound(ValueError):
+ """Exception raised by the BeautifulSoup constructor if no parser with the
+ requested features is found.
+ """
pass
-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print(soup.prettify())
+ print((soup.prettify()))
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index 03a4c1e0..bd44905e 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -7,8 +7,11 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ Stylesheet,
+ Script,
+ TemplateString,
nonwhitespace_re
- )
+)
__all__ = [
'HTMLTreeBuilder',
@@ -27,18 +30,33 @@ HTML_5 = 'html5'
class TreeBuilderRegistry(object):
-
+ """A way of looking up TreeBuilder subclasses by their name or by desired
+ features.
+ """
+
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
+ """Register a treebuilder based on its advertised features.
+
+ :param treebuilder_class: A subclass of TreeBuilder. Its .features
+ attribute should list its features.
+ """
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
+ """Look up a TreeBuilder subclass with the desired features.
+
+ :param features: A list of features to look for. If none are
+ provided, the most recently registered TreeBuilder subclass
+ will be used.
+ :return: A TreeBuilder subclass, or None if there's no
+ registered subclass with all the requested features.
+ """
if len(self.builders) == 0:
# There are no builders at all.
return None
@@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
+ """Turn a textual document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
@@ -96,7 +114,12 @@ class TreeBuilder(object):
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ # Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ # The textual contents of tags with these names should be
+ # instantiated with some class other than NavigableString.
+ DEFAULT_STRING_CONTAINERS = {}
USE_DEFAULT = object()
@@ -105,30 +128,39 @@ class TreeBuilder(object):
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
- store_line_numbers=USE_DEFAULT):
+ store_line_numbers=USE_DEFAULT,
+ string_containers=USE_DEFAULT,
+ ):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
- TreeBuilder will not turn any values for attributes like
- 'class' into lists. Setting this do a dictionary will
- customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
- for an example.
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
- Internally, these are called "CDATA list attributes", but that
- probably doesn't make sense to an end-user, so the argument name
- is `multi_valued_attributes`.
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
- the way
tags are treated in HTML. Tags in this list
- will have
+ the way <pre> tags are treated in HTML. Tags in this list
+ are immune from pretty-printing; their contents will always be
+ output as-is.
+
+ :param string_containers: A dictionary mapping tag names to
+ the classes that should be instantiated to contain the textual
+ contents of those tags. The default is to use NavigableString
+ for every tag, no matter what the name. You can override the
+ default by changing DEFAULT_STRING_CONTAINERS.
:param store_line_numbers: If the parser keeps track of the
- line numbers and positions of the original markup, that
- information will, by default, be stored in each corresponding
- `Tag` object. You can turn this off by passing
- store_line_numbers=False. If the parser you're using doesn't
- keep track of this information, then setting store_line_numbers=True
- will do nothing.
+ line numbers and positions of the original markup, that
+ information will, by default, be stored in each corresponding
+ `Tag` object. You can turn this off by passing
+ store_line_numbers=False. If the parser you're using doesn't
+ keep track of this information, then setting store_line_numbers=True
+ will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
@@ -139,15 +171,25 @@ class TreeBuilder(object):
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
- self.store_line_numbers = store_line_numbers
+ self.store_line_numbers = store_line_numbers
+ if string_containers == self.USE_DEFAULT:
+ string_containers = self.DEFAULT_STRING_CONTAINERS
+ self.string_containers = string_containers
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
+
+ :param soup: A BeautifulSoup object.
"""
self.soup = soup
def reset(self):
+ """Do any work necessary to reset the underlying parser
+ for a new document.
+
+ By default, this does nothing.
+ """
pass
def can_be_empty_element(self, tag_name):
@@ -159,23 +201,57 @@ class TreeBuilder(object):
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
- will be presented as "<p></p>", not "<p/>".
+ will be presented as "<p></p>", not "<p/>" or "<p>".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- " " will become " ", and "bar " will
+ empty-element tag if and only if it has no children.
+ " " will become " ", and "bar " will
be left alone.
+
+ :param tag_name: The name of a markup tag.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in self.soup.
+
+ This method is not implemented in TreeBuilder; it must be
+ implemented in subclasses.
+
+ :return: None.
+ """
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding. NOTE: This argument is not used by the
+ calling code and can probably be removed.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
+
+ By default, the only strategy is to parse the markup
+ as-is. See `LXMLTreeBuilderForXML` and
+ `HTMLParserTreeBuilder` for implementations that take into
+ account the quirks of particular parsers.
+ """
yield markup, None, None, False
def test_fragment_to_document(self, fragment):
@@ -188,16 +264,36 @@ class TreeBuilder(object):
results against other HTML fragments.
This method should not be used outside of tests.
+
+ :param fragment: A string -- fragment of HTML.
+ :return: A string -- a full HTML document.
"""
return fragment
def set_up_substitutions(self, tag):
+ """Set up any substitutions that will need to be performed on
+ a `Tag` when it's output as a string.
+
+ By default, this does nothing. See `HTMLTreeBuilder` for a
+ case where this is used.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
- """Replaces class="foo bar" with class=["foo", "bar"]
+ """When an attribute value is associated with a tag that can
+ have multiple values for that attribute, convert the string
+ value to a list of strings.
- Modifies its input in place.
+ Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+ NOTE: This method modifies its input in place.
+
+ :param tag_name: The name of a tag.
+ :param attrs: A dictionary containing the tag's attributes.
+ Any appropriate attribute values will be modified in place.
"""
if not attrs:
return attrs
@@ -225,7 +321,11 @@ class TreeBuilder(object):
return attrs
class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
+ """A Beautiful Soup treebuilder that listens for SAX events.
+
+ This is not currently used for anything, but it demonstrates
+ how a simple TreeBuilder would work.
+ """
def feed(self, markup):
raise NotImplementedError()
@@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
- #print "Start %s, %r" % (name, attrs)
+ #print("Start %s, %r" % (name, attrs))
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
- #print "End %s" % name
+ #print("End %s" % name)
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
@@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
+ # The HTML standard defines an unusual content model for these tags.
+ # We represent this by using a string class other than NavigableString
+ # inside these tags.
+ #
+ # I made this list by going through the HTML spec
+ # (https://html.spec.whatwg.org/#metadata-content) and looking for
+ # "metadata content" elements that can contain strings.
+ #
+    # TODO: Arguably <title> could go here but it seems
+ # qualitatively different from the other tags.
+ DEFAULT_STRING_CONTAINERS = {
+ 'style': Stylesheet,
+ 'script': Script,
+ 'template': TemplateString,
+ }
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
@@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
+ """Replace the declared encoding in a tag with a placeholder,
+ to be substituted when the tag is output to a string.
+
+ An HTML document may come in to Beautiful Soup as one
+ encoding, but exit in a different encoding, and the tag
+ needs to be changed to reflect this.
+
+ :param tag: A `Tag`
+ :return: Whether or not a substitution was performed.
+ """
# We are only interested in tags
if tag.name != 'meta':
return False
@@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
- # I'm fairly sure this is not the best way to do this.
- this_module = sys.modules['bs4.builder']
+ this_module = sys.modules[__name__]
for name in module.__all__:
obj = getattr(module, name)
@@ -363,6 +488,9 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
+ """An Exception to be raised when the underlying parser simply
+ refuses to parse the given markup.
+ """
def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
@@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index 43199189..69aefd72 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -39,7 +39,18 @@ except ImportError as e:
new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
+ """Use html5lib to build a tree.
+
+ Note that this TreeBuilder does not support some features common
+ to HTML TreeBuilders. Some of these features could theoretically
+ be implemented, but at the very least it's quite difficult,
+ because html5lib moves the parse tree around as it's being built.
+
+ * This TreeBuilder doesn't use different subclasses of NavigableString
+ based on the name of the tag in which the string was found.
+
+ * You can't use a SoupStrainer to parse only part of a document.
+ """
NAME = "html5lib"
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
+ # TODO: What are **kwargs exactly? Should they be passed in
+ # here in addition to/instead of being passed to the BeautifulSoup
+ # constructor?
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser
@@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
-
if attributes is not None and len(attributes) > 0:
-
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
@@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
- # print "MOVE", self.element.contents
- # print "FROM", self.element
- # print "TO", new_parent.element
+ # print("MOVE", self.element.contents)
+ # print("FROM", self.element)
+ # print("TO", new_parent.element)
element = self.element
new_parent_element = new_parent.element
@@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
element.contents = []
element.next_element = final_next_element
- # print "DONE WITH MOVE"
- # print "FROM", self.element
- # print "TO", new_parent_element
+ # print("DONE WITH MOVE")
+ # print("FROM", self.element)
+ # print("TO", new_parent_element)
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index f1b473fe..70e9be84 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -8,7 +8,7 @@ __all__ = [
'HTMLParserTreeBuilder',
]
-from future.moves.html.parser import HTMLParser
+from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
@@ -53,8 +53,30 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
+ """A subclass of the Python standard library's HTMLParser class, which
+ listens for HTMLParser events and translates them into calls
+ to Beautiful Soup's tree construction API.
+ """
+ # Strategies for handling duplicate attributes
+ IGNORE = 'ignore'
+ REPLACE = 'replace'
+
def __init__(self, *args, **kwargs):
+ """Constructor.
+
+ :param on_duplicate_attribute: A strategy for what to do if a
+ tag includes the same attribute more than once. Accepted
+ values are: REPLACE (replace earlier values with later
+ ones, the default), IGNORE (keep the earliest value
+ encountered), or a callable. A callable must take three
+ arguments: the dictionary of attributes already processed,
+ the name of the duplicate attribute, and the most recent value
+ encountered.
+ """
+ self.on_duplicate_attribute = kwargs.pop(
+ 'on_duplicate_attribute', self.REPLACE
+ )
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
@@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element = []
def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although this
- requirement doesn't appear to be documented.
+ """In Python 3, HTMLParser subclasses must implement error(), although
+ this requirement doesn't appear to be documented.
- In Python 2, HTMLParser implements error() as raising an exception.
+ In Python 2, HTMLParser implements error() by raising an exception,
+ which we don't want to do.
- In any event, this method is called only on very strange markup and our best strategy
- is to pretend it didn't happen and keep going.
+ In any event, this method is called only on very strange
+ markup and our best strategy is to pretend it didn't happen
+ and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
- # This is only called when the markup looks like
-        # <tag/>.
+ """Handle an incoming empty-element tag.
+        This is only called when the markup looks like <tag/>.
+
+ :param name: Name of the tag.
+ :param attrs: Dictionary of the tag's attributes.
+ """
# is_startend() tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag and we want to call
@@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_endtag(name)
def handle_starttag(self, name, attrs, handle_empty_element=True):
+ """Handle an opening tag, e.g. ''
+
+ :param name: Name of the tag.
+ :param attrs: Dictionary of the tag's attributes.
+ :param handle_empty_element: True if this tag is known to be
+ an empty-element tag (i.e. there is not expected to be any
+ closing tag).
+ """
# XXX namespace
attr_dict = {}
for key, value in attrs:
@@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
# for consistency with the other tree builders.
if value is None:
value = ''
- attr_dict[key] = value
+ if key in attr_dict:
+ # A single attribute shows up multiple times in this
+ # tag. How to handle it depends on the
+ # on_duplicate_attribute setting.
+ on_dupe = self.on_duplicate_attribute
+ if on_dupe == self.IGNORE:
+ pass
+ elif on_dupe in (None, self.REPLACE):
+ attr_dict[key] = value
+ else:
+ on_dupe(attr_dict, key, value)
+ else:
+ attr_dict[key] = value
attrvalue = '""'
- #print "START", name
+ #print("START", name)
sourceline, sourcepos = self.getpos()
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline,
@@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element.append(name)
def handle_endtag(self, name, check_already_closed=True):
- #print "END", name
+ """Handle a closing tag, e.g. ' '
+
+ :param name: A tag name.
+ :param check_already_closed: True if this tag is expected to
+ be the closing portion of an empty-element tag,
+          e.g. '<br/>'.
+ """
+ #print("END", name)
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
# We've already called handle_endtag() for it, so just
# check it off the list.
- # print "ALREADY CLOSED", name
+ #print("ALREADY CLOSED", name)
self.already_closed_empty_element.remove(name)
else:
self.soup.handle_endtag(name)
def handle_data(self, data):
+ """Handle some textual data that shows up between tags."""
self.soup.handle_data(data)
def handle_charref(self, name):
+ """Handle a numeric character reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Character number, possibly in hexadecimal.
+ """
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
@@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_entityref(self, name):
+ """Handle a named entity reference by converting it to the
+ corresponding Unicode character(s) and treating it as textual
+ data.
+
+ :param name: Name of the entity reference.
+ """
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
@@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_comment(self, data):
+ """Handle an HTML comment.
+
+ :param data: The text of the comment.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
+ """Handle a DOCTYPE declaration.
+
+ :param data: The text of the declaration.
+ """
self.soup.endData()
- if data.startswith("DOCTYPE "):
- data = data[len("DOCTYPE "):]
- elif data == 'DOCTYPE':
- # i.e. ""
- data = ''
+ data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
+ """Handle a declaration of unknown type -- probably a CDATA block.
+
+ :param data: The text of the declaration.
+ """
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
@@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData(cls)
def handle_pi(self, data):
+ """Handle a processing instruction.
+
+ :param data: The text of the instruction.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+ """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
+ found in the Python standard library.
+ """
is_xml = False
picklable = True
NAME = HTMLPARSER
@@ -221,36 +303,88 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
-
+
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ """Constructor.
+
+ :param parser_args: Positional arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param parser_kwargs: Keyword arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param kwargs: Keyword arguments for the superclass constructor.
+ """
+ # Some keyword arguments will be pulled out of kwargs and placed
+ # into parser_kwargs.
+ extra_parser_kwargs = dict()
+ for arg in ('on_duplicate_attribute',):
+ if arg in kwargs:
+ value = kwargs.pop(arg)
+ extra_parser_kwargs[arg] = value
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
+ parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
-
+
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
+
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
if isinstance(markup, str):
+ # Parse Unicode as-is.
yield (markup, None, None, False)
return
+ # Ask UnicodeDammit to sniff the most likely encoding.
+
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the HTML5
+ # spec. (See the EncodingDetector class for details.)
+ known_definite_encodings = [user_specified_encoding]
+
+ # This was found in the document; treat it as a slightly lower-priority
+ # user encoding.
+ user_encodings = [document_declared_encoding]
+
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True,
- exclude_encodings=exclude_encodings)
+ dammit = UnicodeDammit(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=True,
+ exclude_encodings=exclude_encodings
+ )
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in self.soup.
+ """
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py
index f5257963..11c9a696 100644
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
+ # See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
+
+ :param soup: A `BeautifulSoup`.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
@@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document.
This might be useful later on when creating CSS selectors.
+
+ :param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
@@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value
def default_parser(self, encoding):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
+ """Find the default parser for the given encoding.
+
+ :param encoding: A string.
+ :return: Either a parser object or a class, which
+ will be instantiated with default arguments.
+ """
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
+ """Instantiate an appropriate parser for the given encoding.
+
+ :param encoding: A string.
+ :return: A parser object such as an `etree.XMLParser`.
+ """
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False, encoding=encoding)
+ parser = parser(
+ target=self, strip_cdata=False, recover=True, encoding=encoding
+ )
return parser
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
@@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
- """
- :yield: A series of 4-tuples.
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ lxml really wants to get a bytestring and convert it to
+ Unicode itself. So instead of using UnicodeDammit to convert
+ the bytestring to Unicode using different encodings, this
+ implementation uses EncodingDetector to iterate over the
+ encodings, and tell lxml to try to parse the document as each
+ one in turn.
+
+ :param markup: Some markup -- hopefully a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
- Each 4-tuple represents a strategy for parsing the document.
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
@@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
- try_encodings = [user_specified_encoding, document_declared_encoding]
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the HTML5
+ # spec. (See the EncodingDetector class for details.)
+ known_definite_encodings = [user_specified_encoding]
+
+ # This was found in the document; treat it as a slightly lower-priority
+ # user encoding.
+ user_encodings = [document_declared_encoding]
detector = EncodingDetector(
- markup, try_encodings, is_html, exclude_encodings)
+ markup, known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings, is_html=is_html,
+ exclude_encodings=exclude_encodings
+ )
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
diff --git a/lib/bs4/check_block.py b/lib/bs4/check_block.py
deleted file mode 100644
index a60a7b74..00000000
--- a/lib/bs4/check_block.py
+++ /dev/null
@@ -1,4 +0,0 @@
-import requests
-data = requests.get("https://www.crummy.com/").content
-from bs4 import _s
-data = [x for x in _s(data).block_text()]
diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py
index e8cdd147..e017408b 100644
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -9,9 +9,9 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
+from html.entities import codepoint2name
+from collections import defaultdict
import codecs
-from future.moves.html.entities import codepoint2name
-from future.builtins import chr
import re
import logging
import string
@@ -44,6 +44,9 @@ except ImportError:
return None
# Available from http://cjkpython.i18n.org/.
+#
+# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
+# is GPL-licensed. Check whether this is still necessary.
try:
import iconv_codec
except ImportError:
@@ -51,7 +54,7 @@ except ImportError:
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
-xml_encoding = '^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
@@ -63,35 +66,2368 @@ encoding_res[str] = {
'xml' : re.compile(xml_encoding, re.I)
}
-class EntitySubstitution(object):
+try:
+ from html.entities import html5
+except ImportError:
+ # This is a copy of html.entities.html5 from Python 3.9. There's
+ # no equivalent table in Python 2, so we'll just provide a copy
+ # here.
+ html5 = {
+ 'Aacute': '\xc1',
+ 'aacute': '\xe1',
+ 'Aacute;': '\xc1',
+ 'aacute;': '\xe1',
+ 'Abreve;': '\u0102',
+ 'abreve;': '\u0103',
+ 'ac;': '\u223e',
+ 'acd;': '\u223f',
+ 'acE;': '\u223e\u0333',
+ 'Acirc': '\xc2',
+ 'acirc': '\xe2',
+ 'Acirc;': '\xc2',
+ 'acirc;': '\xe2',
+ 'acute': '\xb4',
+ 'acute;': '\xb4',
+ 'Acy;': '\u0410',
+ 'acy;': '\u0430',
+ 'AElig': '\xc6',
+ 'aelig': '\xe6',
+ 'AElig;': '\xc6',
+ 'aelig;': '\xe6',
+ 'af;': '\u2061',
+ 'Afr;': '\U0001d504',
+ 'afr;': '\U0001d51e',
+ 'Agrave': '\xc0',
+ 'agrave': '\xe0',
+ 'Agrave;': '\xc0',
+ 'agrave;': '\xe0',
+ 'alefsym;': '\u2135',
+ 'aleph;': '\u2135',
+ 'Alpha;': '\u0391',
+ 'alpha;': '\u03b1',
+ 'Amacr;': '\u0100',
+ 'amacr;': '\u0101',
+ 'amalg;': '\u2a3f',
+ 'AMP': '&',
+ 'amp': '&',
+ 'AMP;': '&',
+ 'amp;': '&',
+ 'And;': '\u2a53',
+ 'and;': '\u2227',
+ 'andand;': '\u2a55',
+ 'andd;': '\u2a5c',
+ 'andslope;': '\u2a58',
+ 'andv;': '\u2a5a',
+ 'ang;': '\u2220',
+ 'ange;': '\u29a4',
+ 'angle;': '\u2220',
+ 'angmsd;': '\u2221',
+ 'angmsdaa;': '\u29a8',
+ 'angmsdab;': '\u29a9',
+ 'angmsdac;': '\u29aa',
+ 'angmsdad;': '\u29ab',
+ 'angmsdae;': '\u29ac',
+ 'angmsdaf;': '\u29ad',
+ 'angmsdag;': '\u29ae',
+ 'angmsdah;': '\u29af',
+ 'angrt;': '\u221f',
+ 'angrtvb;': '\u22be',
+ 'angrtvbd;': '\u299d',
+ 'angsph;': '\u2222',
+ 'angst;': '\xc5',
+ 'angzarr;': '\u237c',
+ 'Aogon;': '\u0104',
+ 'aogon;': '\u0105',
+ 'Aopf;': '\U0001d538',
+ 'aopf;': '\U0001d552',
+ 'ap;': '\u2248',
+ 'apacir;': '\u2a6f',
+ 'apE;': '\u2a70',
+ 'ape;': '\u224a',
+ 'apid;': '\u224b',
+ 'apos;': "'",
+ 'ApplyFunction;': '\u2061',
+ 'approx;': '\u2248',
+ 'approxeq;': '\u224a',
+ 'Aring': '\xc5',
+ 'aring': '\xe5',
+ 'Aring;': '\xc5',
+ 'aring;': '\xe5',
+ 'Ascr;': '\U0001d49c',
+ 'ascr;': '\U0001d4b6',
+ 'Assign;': '\u2254',
+ 'ast;': '*',
+ 'asymp;': '\u2248',
+ 'asympeq;': '\u224d',
+ 'Atilde': '\xc3',
+ 'atilde': '\xe3',
+ 'Atilde;': '\xc3',
+ 'atilde;': '\xe3',
+ 'Auml': '\xc4',
+ 'auml': '\xe4',
+ 'Auml;': '\xc4',
+ 'auml;': '\xe4',
+ 'awconint;': '\u2233',
+ 'awint;': '\u2a11',
+ 'backcong;': '\u224c',
+ 'backepsilon;': '\u03f6',
+ 'backprime;': '\u2035',
+ 'backsim;': '\u223d',
+ 'backsimeq;': '\u22cd',
+ 'Backslash;': '\u2216',
+ 'Barv;': '\u2ae7',
+ 'barvee;': '\u22bd',
+ 'Barwed;': '\u2306',
+ 'barwed;': '\u2305',
+ 'barwedge;': '\u2305',
+ 'bbrk;': '\u23b5',
+ 'bbrktbrk;': '\u23b6',
+ 'bcong;': '\u224c',
+ 'Bcy;': '\u0411',
+ 'bcy;': '\u0431',
+ 'bdquo;': '\u201e',
+ 'becaus;': '\u2235',
+ 'Because;': '\u2235',
+ 'because;': '\u2235',
+ 'bemptyv;': '\u29b0',
+ 'bepsi;': '\u03f6',
+ 'bernou;': '\u212c',
+ 'Bernoullis;': '\u212c',
+ 'Beta;': '\u0392',
+ 'beta;': '\u03b2',
+ 'beth;': '\u2136',
+ 'between;': '\u226c',
+ 'Bfr;': '\U0001d505',
+ 'bfr;': '\U0001d51f',
+ 'bigcap;': '\u22c2',
+ 'bigcirc;': '\u25ef',
+ 'bigcup;': '\u22c3',
+ 'bigodot;': '\u2a00',
+ 'bigoplus;': '\u2a01',
+ 'bigotimes;': '\u2a02',
+ 'bigsqcup;': '\u2a06',
+ 'bigstar;': '\u2605',
+ 'bigtriangledown;': '\u25bd',
+ 'bigtriangleup;': '\u25b3',
+ 'biguplus;': '\u2a04',
+ 'bigvee;': '\u22c1',
+ 'bigwedge;': '\u22c0',
+ 'bkarow;': '\u290d',
+ 'blacklozenge;': '\u29eb',
+ 'blacksquare;': '\u25aa',
+ 'blacktriangle;': '\u25b4',
+ 'blacktriangledown;': '\u25be',
+ 'blacktriangleleft;': '\u25c2',
+ 'blacktriangleright;': '\u25b8',
+ 'blank;': '\u2423',
+ 'blk12;': '\u2592',
+ 'blk14;': '\u2591',
+ 'blk34;': '\u2593',
+ 'block;': '\u2588',
+ 'bne;': '=\u20e5',
+ 'bnequiv;': '\u2261\u20e5',
+ 'bNot;': '\u2aed',
+ 'bnot;': '\u2310',
+ 'Bopf;': '\U0001d539',
+ 'bopf;': '\U0001d553',
+ 'bot;': '\u22a5',
+ 'bottom;': '\u22a5',
+ 'bowtie;': '\u22c8',
+ 'boxbox;': '\u29c9',
+ 'boxDL;': '\u2557',
+ 'boxDl;': '\u2556',
+ 'boxdL;': '\u2555',
+ 'boxdl;': '\u2510',
+ 'boxDR;': '\u2554',
+ 'boxDr;': '\u2553',
+ 'boxdR;': '\u2552',
+ 'boxdr;': '\u250c',
+ 'boxH;': '\u2550',
+ 'boxh;': '\u2500',
+ 'boxHD;': '\u2566',
+ 'boxHd;': '\u2564',
+ 'boxhD;': '\u2565',
+ 'boxhd;': '\u252c',
+ 'boxHU;': '\u2569',
+ 'boxHu;': '\u2567',
+ 'boxhU;': '\u2568',
+ 'boxhu;': '\u2534',
+ 'boxminus;': '\u229f',
+ 'boxplus;': '\u229e',
+ 'boxtimes;': '\u22a0',
+ 'boxUL;': '\u255d',
+ 'boxUl;': '\u255c',
+ 'boxuL;': '\u255b',
+ 'boxul;': '\u2518',
+ 'boxUR;': '\u255a',
+ 'boxUr;': '\u2559',
+ 'boxuR;': '\u2558',
+ 'boxur;': '\u2514',
+ 'boxV;': '\u2551',
+ 'boxv;': '\u2502',
+ 'boxVH;': '\u256c',
+ 'boxVh;': '\u256b',
+ 'boxvH;': '\u256a',
+ 'boxvh;': '\u253c',
+ 'boxVL;': '\u2563',
+ 'boxVl;': '\u2562',
+ 'boxvL;': '\u2561',
+ 'boxvl;': '\u2524',
+ 'boxVR;': '\u2560',
+ 'boxVr;': '\u255f',
+ 'boxvR;': '\u255e',
+ 'boxvr;': '\u251c',
+ 'bprime;': '\u2035',
+ 'Breve;': '\u02d8',
+ 'breve;': '\u02d8',
+ 'brvbar': '\xa6',
+ 'brvbar;': '\xa6',
+ 'Bscr;': '\u212c',
+ 'bscr;': '\U0001d4b7',
+ 'bsemi;': '\u204f',
+ 'bsim;': '\u223d',
+ 'bsime;': '\u22cd',
+ 'bsol;': '\\',
+ 'bsolb;': '\u29c5',
+ 'bsolhsub;': '\u27c8',
+ 'bull;': '\u2022',
+ 'bullet;': '\u2022',
+ 'bump;': '\u224e',
+ 'bumpE;': '\u2aae',
+ 'bumpe;': '\u224f',
+ 'Bumpeq;': '\u224e',
+ 'bumpeq;': '\u224f',
+ 'Cacute;': '\u0106',
+ 'cacute;': '\u0107',
+ 'Cap;': '\u22d2',
+ 'cap;': '\u2229',
+ 'capand;': '\u2a44',
+ 'capbrcup;': '\u2a49',
+ 'capcap;': '\u2a4b',
+ 'capcup;': '\u2a47',
+ 'capdot;': '\u2a40',
+ 'CapitalDifferentialD;': '\u2145',
+ 'caps;': '\u2229\ufe00',
+ 'caret;': '\u2041',
+ 'caron;': '\u02c7',
+ 'Cayleys;': '\u212d',
+ 'ccaps;': '\u2a4d',
+ 'Ccaron;': '\u010c',
+ 'ccaron;': '\u010d',
+ 'Ccedil': '\xc7',
+ 'ccedil': '\xe7',
+ 'Ccedil;': '\xc7',
+ 'ccedil;': '\xe7',
+ 'Ccirc;': '\u0108',
+ 'ccirc;': '\u0109',
+ 'Cconint;': '\u2230',
+ 'ccups;': '\u2a4c',
+ 'ccupssm;': '\u2a50',
+ 'Cdot;': '\u010a',
+ 'cdot;': '\u010b',
+ 'cedil': '\xb8',
+ 'cedil;': '\xb8',
+ 'Cedilla;': '\xb8',
+ 'cemptyv;': '\u29b2',
+ 'cent': '\xa2',
+ 'cent;': '\xa2',
+ 'CenterDot;': '\xb7',
+ 'centerdot;': '\xb7',
+ 'Cfr;': '\u212d',
+ 'cfr;': '\U0001d520',
+ 'CHcy;': '\u0427',
+ 'chcy;': '\u0447',
+ 'check;': '\u2713',
+ 'checkmark;': '\u2713',
+ 'Chi;': '\u03a7',
+ 'chi;': '\u03c7',
+ 'cir;': '\u25cb',
+ 'circ;': '\u02c6',
+ 'circeq;': '\u2257',
+ 'circlearrowleft;': '\u21ba',
+ 'circlearrowright;': '\u21bb',
+ 'circledast;': '\u229b',
+ 'circledcirc;': '\u229a',
+ 'circleddash;': '\u229d',
+ 'CircleDot;': '\u2299',
+ 'circledR;': '\xae',
+ 'circledS;': '\u24c8',
+ 'CircleMinus;': '\u2296',
+ 'CirclePlus;': '\u2295',
+ 'CircleTimes;': '\u2297',
+ 'cirE;': '\u29c3',
+ 'cire;': '\u2257',
+ 'cirfnint;': '\u2a10',
+ 'cirmid;': '\u2aef',
+ 'cirscir;': '\u29c2',
+ 'ClockwiseContourIntegral;': '\u2232',
+ 'CloseCurlyDoubleQuote;': '\u201d',
+ 'CloseCurlyQuote;': '\u2019',
+ 'clubs;': '\u2663',
+ 'clubsuit;': '\u2663',
+ 'Colon;': '\u2237',
+ 'colon;': ':',
+ 'Colone;': '\u2a74',
+ 'colone;': '\u2254',
+ 'coloneq;': '\u2254',
+ 'comma;': ',',
+ 'commat;': '@',
+ 'comp;': '\u2201',
+ 'compfn;': '\u2218',
+ 'complement;': '\u2201',
+ 'complexes;': '\u2102',
+ 'cong;': '\u2245',
+ 'congdot;': '\u2a6d',
+ 'Congruent;': '\u2261',
+ 'Conint;': '\u222f',
+ 'conint;': '\u222e',
+ 'ContourIntegral;': '\u222e',
+ 'Copf;': '\u2102',
+ 'copf;': '\U0001d554',
+ 'coprod;': '\u2210',
+ 'Coproduct;': '\u2210',
+ 'COPY': '\xa9',
+ 'copy': '\xa9',
+ 'COPY;': '\xa9',
+ 'copy;': '\xa9',
+ 'copysr;': '\u2117',
+ 'CounterClockwiseContourIntegral;': '\u2233',
+ 'crarr;': '\u21b5',
+ 'Cross;': '\u2a2f',
+ 'cross;': '\u2717',
+ 'Cscr;': '\U0001d49e',
+ 'cscr;': '\U0001d4b8',
+ 'csub;': '\u2acf',
+ 'csube;': '\u2ad1',
+ 'csup;': '\u2ad0',
+ 'csupe;': '\u2ad2',
+ 'ctdot;': '\u22ef',
+ 'cudarrl;': '\u2938',
+ 'cudarrr;': '\u2935',
+ 'cuepr;': '\u22de',
+ 'cuesc;': '\u22df',
+ 'cularr;': '\u21b6',
+ 'cularrp;': '\u293d',
+ 'Cup;': '\u22d3',
+ 'cup;': '\u222a',
+ 'cupbrcap;': '\u2a48',
+ 'CupCap;': '\u224d',
+ 'cupcap;': '\u2a46',
+ 'cupcup;': '\u2a4a',
+ 'cupdot;': '\u228d',
+ 'cupor;': '\u2a45',
+ 'cups;': '\u222a\ufe00',
+ 'curarr;': '\u21b7',
+ 'curarrm;': '\u293c',
+ 'curlyeqprec;': '\u22de',
+ 'curlyeqsucc;': '\u22df',
+ 'curlyvee;': '\u22ce',
+ 'curlywedge;': '\u22cf',
+ 'curren': '\xa4',
+ 'curren;': '\xa4',
+ 'curvearrowleft;': '\u21b6',
+ 'curvearrowright;': '\u21b7',
+ 'cuvee;': '\u22ce',
+ 'cuwed;': '\u22cf',
+ 'cwconint;': '\u2232',
+ 'cwint;': '\u2231',
+ 'cylcty;': '\u232d',
+ 'Dagger;': '\u2021',
+ 'dagger;': '\u2020',
+ 'daleth;': '\u2138',
+ 'Darr;': '\u21a1',
+ 'dArr;': '\u21d3',
+ 'darr;': '\u2193',
+ 'dash;': '\u2010',
+ 'Dashv;': '\u2ae4',
+ 'dashv;': '\u22a3',
+ 'dbkarow;': '\u290f',
+ 'dblac;': '\u02dd',
+ 'Dcaron;': '\u010e',
+ 'dcaron;': '\u010f',
+ 'Dcy;': '\u0414',
+ 'dcy;': '\u0434',
+ 'DD;': '\u2145',
+ 'dd;': '\u2146',
+ 'ddagger;': '\u2021',
+ 'ddarr;': '\u21ca',
+ 'DDotrahd;': '\u2911',
+ 'ddotseq;': '\u2a77',
+ 'deg': '\xb0',
+ 'deg;': '\xb0',
+ 'Del;': '\u2207',
+ 'Delta;': '\u0394',
+ 'delta;': '\u03b4',
+ 'demptyv;': '\u29b1',
+ 'dfisht;': '\u297f',
+ 'Dfr;': '\U0001d507',
+ 'dfr;': '\U0001d521',
+ 'dHar;': '\u2965',
+ 'dharl;': '\u21c3',
+ 'dharr;': '\u21c2',
+ 'DiacriticalAcute;': '\xb4',
+ 'DiacriticalDot;': '\u02d9',
+ 'DiacriticalDoubleAcute;': '\u02dd',
+ 'DiacriticalGrave;': '`',
+ 'DiacriticalTilde;': '\u02dc',
+ 'diam;': '\u22c4',
+ 'Diamond;': '\u22c4',
+ 'diamond;': '\u22c4',
+ 'diamondsuit;': '\u2666',
+ 'diams;': '\u2666',
+ 'die;': '\xa8',
+ 'DifferentialD;': '\u2146',
+ 'digamma;': '\u03dd',
+ 'disin;': '\u22f2',
+ 'div;': '\xf7',
+ 'divide': '\xf7',
+ 'divide;': '\xf7',
+ 'divideontimes;': '\u22c7',
+ 'divonx;': '\u22c7',
+ 'DJcy;': '\u0402',
+ 'djcy;': '\u0452',
+ 'dlcorn;': '\u231e',
+ 'dlcrop;': '\u230d',
+ 'dollar;': '$',
+ 'Dopf;': '\U0001d53b',
+ 'dopf;': '\U0001d555',
+ 'Dot;': '\xa8',
+ 'dot;': '\u02d9',
+ 'DotDot;': '\u20dc',
+ 'doteq;': '\u2250',
+ 'doteqdot;': '\u2251',
+ 'DotEqual;': '\u2250',
+ 'dotminus;': '\u2238',
+ 'dotplus;': '\u2214',
+ 'dotsquare;': '\u22a1',
+ 'doublebarwedge;': '\u2306',
+ 'DoubleContourIntegral;': '\u222f',
+ 'DoubleDot;': '\xa8',
+ 'DoubleDownArrow;': '\u21d3',
+ 'DoubleLeftArrow;': '\u21d0',
+ 'DoubleLeftRightArrow;': '\u21d4',
+ 'DoubleLeftTee;': '\u2ae4',
+ 'DoubleLongLeftArrow;': '\u27f8',
+ 'DoubleLongLeftRightArrow;': '\u27fa',
+ 'DoubleLongRightArrow;': '\u27f9',
+ 'DoubleRightArrow;': '\u21d2',
+ 'DoubleRightTee;': '\u22a8',
+ 'DoubleUpArrow;': '\u21d1',
+ 'DoubleUpDownArrow;': '\u21d5',
+ 'DoubleVerticalBar;': '\u2225',
+ 'DownArrow;': '\u2193',
+ 'Downarrow;': '\u21d3',
+ 'downarrow;': '\u2193',
+ 'DownArrowBar;': '\u2913',
+ 'DownArrowUpArrow;': '\u21f5',
+ 'DownBreve;': '\u0311',
+ 'downdownarrows;': '\u21ca',
+ 'downharpoonleft;': '\u21c3',
+ 'downharpoonright;': '\u21c2',
+ 'DownLeftRightVector;': '\u2950',
+ 'DownLeftTeeVector;': '\u295e',
+ 'DownLeftVector;': '\u21bd',
+ 'DownLeftVectorBar;': '\u2956',
+ 'DownRightTeeVector;': '\u295f',
+ 'DownRightVector;': '\u21c1',
+ 'DownRightVectorBar;': '\u2957',
+ 'DownTee;': '\u22a4',
+ 'DownTeeArrow;': '\u21a7',
+ 'drbkarow;': '\u2910',
+ 'drcorn;': '\u231f',
+ 'drcrop;': '\u230c',
+ 'Dscr;': '\U0001d49f',
+ 'dscr;': '\U0001d4b9',
+ 'DScy;': '\u0405',
+ 'dscy;': '\u0455',
+ 'dsol;': '\u29f6',
+ 'Dstrok;': '\u0110',
+ 'dstrok;': '\u0111',
+ 'dtdot;': '\u22f1',
+ 'dtri;': '\u25bf',
+ 'dtrif;': '\u25be',
+ 'duarr;': '\u21f5',
+ 'duhar;': '\u296f',
+ 'dwangle;': '\u29a6',
+ 'DZcy;': '\u040f',
+ 'dzcy;': '\u045f',
+ 'dzigrarr;': '\u27ff',
+ 'Eacute': '\xc9',
+ 'eacute': '\xe9',
+ 'Eacute;': '\xc9',
+ 'eacute;': '\xe9',
+ 'easter;': '\u2a6e',
+ 'Ecaron;': '\u011a',
+ 'ecaron;': '\u011b',
+ 'ecir;': '\u2256',
+ 'Ecirc': '\xca',
+ 'ecirc': '\xea',
+ 'Ecirc;': '\xca',
+ 'ecirc;': '\xea',
+ 'ecolon;': '\u2255',
+ 'Ecy;': '\u042d',
+ 'ecy;': '\u044d',
+ 'eDDot;': '\u2a77',
+ 'Edot;': '\u0116',
+ 'eDot;': '\u2251',
+ 'edot;': '\u0117',
+ 'ee;': '\u2147',
+ 'efDot;': '\u2252',
+ 'Efr;': '\U0001d508',
+ 'efr;': '\U0001d522',
+ 'eg;': '\u2a9a',
+ 'Egrave': '\xc8',
+ 'egrave': '\xe8',
+ 'Egrave;': '\xc8',
+ 'egrave;': '\xe8',
+ 'egs;': '\u2a96',
+ 'egsdot;': '\u2a98',
+ 'el;': '\u2a99',
+ 'Element;': '\u2208',
+ 'elinters;': '\u23e7',
+ 'ell;': '\u2113',
+ 'els;': '\u2a95',
+ 'elsdot;': '\u2a97',
+ 'Emacr;': '\u0112',
+ 'emacr;': '\u0113',
+ 'empty;': '\u2205',
+ 'emptyset;': '\u2205',
+ 'EmptySmallSquare;': '\u25fb',
+ 'emptyv;': '\u2205',
+ 'EmptyVerySmallSquare;': '\u25ab',
+ 'emsp13;': '\u2004',
+ 'emsp14;': '\u2005',
+ 'emsp;': '\u2003',
+ 'ENG;': '\u014a',
+ 'eng;': '\u014b',
+ 'ensp;': '\u2002',
+ 'Eogon;': '\u0118',
+ 'eogon;': '\u0119',
+ 'Eopf;': '\U0001d53c',
+ 'eopf;': '\U0001d556',
+ 'epar;': '\u22d5',
+ 'eparsl;': '\u29e3',
+ 'eplus;': '\u2a71',
+ 'epsi;': '\u03b5',
+ 'Epsilon;': '\u0395',
+ 'epsilon;': '\u03b5',
+ 'epsiv;': '\u03f5',
+ 'eqcirc;': '\u2256',
+ 'eqcolon;': '\u2255',
+ 'eqsim;': '\u2242',
+ 'eqslantgtr;': '\u2a96',
+ 'eqslantless;': '\u2a95',
+ 'Equal;': '\u2a75',
+ 'equals;': '=',
+ 'EqualTilde;': '\u2242',
+ 'equest;': '\u225f',
+ 'Equilibrium;': '\u21cc',
+ 'equiv;': '\u2261',
+ 'equivDD;': '\u2a78',
+ 'eqvparsl;': '\u29e5',
+ 'erarr;': '\u2971',
+ 'erDot;': '\u2253',
+ 'Escr;': '\u2130',
+ 'escr;': '\u212f',
+ 'esdot;': '\u2250',
+ 'Esim;': '\u2a73',
+ 'esim;': '\u2242',
+ 'Eta;': '\u0397',
+ 'eta;': '\u03b7',
+ 'ETH': '\xd0',
+ 'eth': '\xf0',
+ 'ETH;': '\xd0',
+ 'eth;': '\xf0',
+ 'Euml': '\xcb',
+ 'euml': '\xeb',
+ 'Euml;': '\xcb',
+ 'euml;': '\xeb',
+ 'euro;': '\u20ac',
+ 'excl;': '!',
+ 'exist;': '\u2203',
+ 'Exists;': '\u2203',
+ 'expectation;': '\u2130',
+ 'ExponentialE;': '\u2147',
+ 'exponentiale;': '\u2147',
+ 'fallingdotseq;': '\u2252',
+ 'Fcy;': '\u0424',
+ 'fcy;': '\u0444',
+ 'female;': '\u2640',
+ 'ffilig;': '\ufb03',
+ 'fflig;': '\ufb00',
+ 'ffllig;': '\ufb04',
+ 'Ffr;': '\U0001d509',
+ 'ffr;': '\U0001d523',
+ 'filig;': '\ufb01',
+ 'FilledSmallSquare;': '\u25fc',
+ 'FilledVerySmallSquare;': '\u25aa',
+ 'fjlig;': 'fj',
+ 'flat;': '\u266d',
+ 'fllig;': '\ufb02',
+ 'fltns;': '\u25b1',
+ 'fnof;': '\u0192',
+ 'Fopf;': '\U0001d53d',
+ 'fopf;': '\U0001d557',
+ 'ForAll;': '\u2200',
+ 'forall;': '\u2200',
+ 'fork;': '\u22d4',
+ 'forkv;': '\u2ad9',
+ 'Fouriertrf;': '\u2131',
+ 'fpartint;': '\u2a0d',
+ 'frac12': '\xbd',
+ 'frac12;': '\xbd',
+ 'frac13;': '\u2153',
+ 'frac14': '\xbc',
+ 'frac14;': '\xbc',
+ 'frac15;': '\u2155',
+ 'frac16;': '\u2159',
+ 'frac18;': '\u215b',
+ 'frac23;': '\u2154',
+ 'frac25;': '\u2156',
+ 'frac34': '\xbe',
+ 'frac34;': '\xbe',
+ 'frac35;': '\u2157',
+ 'frac38;': '\u215c',
+ 'frac45;': '\u2158',
+ 'frac56;': '\u215a',
+ 'frac58;': '\u215d',
+ 'frac78;': '\u215e',
+ 'frasl;': '\u2044',
+ 'frown;': '\u2322',
+ 'Fscr;': '\u2131',
+ 'fscr;': '\U0001d4bb',
+ 'gacute;': '\u01f5',
+ 'Gamma;': '\u0393',
+ 'gamma;': '\u03b3',
+ 'Gammad;': '\u03dc',
+ 'gammad;': '\u03dd',
+ 'gap;': '\u2a86',
+ 'Gbreve;': '\u011e',
+ 'gbreve;': '\u011f',
+ 'Gcedil;': '\u0122',
+ 'Gcirc;': '\u011c',
+ 'gcirc;': '\u011d',
+ 'Gcy;': '\u0413',
+ 'gcy;': '\u0433',
+ 'Gdot;': '\u0120',
+ 'gdot;': '\u0121',
+ 'gE;': '\u2267',
+ 'ge;': '\u2265',
+ 'gEl;': '\u2a8c',
+ 'gel;': '\u22db',
+ 'geq;': '\u2265',
+ 'geqq;': '\u2267',
+ 'geqslant;': '\u2a7e',
+ 'ges;': '\u2a7e',
+ 'gescc;': '\u2aa9',
+ 'gesdot;': '\u2a80',
+ 'gesdoto;': '\u2a82',
+ 'gesdotol;': '\u2a84',
+ 'gesl;': '\u22db\ufe00',
+ 'gesles;': '\u2a94',
+ 'Gfr;': '\U0001d50a',
+ 'gfr;': '\U0001d524',
+ 'Gg;': '\u22d9',
+ 'gg;': '\u226b',
+ 'ggg;': '\u22d9',
+ 'gimel;': '\u2137',
+ 'GJcy;': '\u0403',
+ 'gjcy;': '\u0453',
+ 'gl;': '\u2277',
+ 'gla;': '\u2aa5',
+ 'glE;': '\u2a92',
+ 'glj;': '\u2aa4',
+ 'gnap;': '\u2a8a',
+ 'gnapprox;': '\u2a8a',
+ 'gnE;': '\u2269',
+ 'gne;': '\u2a88',
+ 'gneq;': '\u2a88',
+ 'gneqq;': '\u2269',
+ 'gnsim;': '\u22e7',
+ 'Gopf;': '\U0001d53e',
+ 'gopf;': '\U0001d558',
+ 'grave;': '`',
+ 'GreaterEqual;': '\u2265',
+ 'GreaterEqualLess;': '\u22db',
+ 'GreaterFullEqual;': '\u2267',
+ 'GreaterGreater;': '\u2aa2',
+ 'GreaterLess;': '\u2277',
+ 'GreaterSlantEqual;': '\u2a7e',
+ 'GreaterTilde;': '\u2273',
+ 'Gscr;': '\U0001d4a2',
+ 'gscr;': '\u210a',
+ 'gsim;': '\u2273',
+ 'gsime;': '\u2a8e',
+ 'gsiml;': '\u2a90',
+ 'GT': '>',
+ 'gt': '>',
+ 'GT;': '>',
+ 'Gt;': '\u226b',
+ 'gt;': '>',
+ 'gtcc;': '\u2aa7',
+ 'gtcir;': '\u2a7a',
+ 'gtdot;': '\u22d7',
+ 'gtlPar;': '\u2995',
+ 'gtquest;': '\u2a7c',
+ 'gtrapprox;': '\u2a86',
+ 'gtrarr;': '\u2978',
+ 'gtrdot;': '\u22d7',
+ 'gtreqless;': '\u22db',
+ 'gtreqqless;': '\u2a8c',
+ 'gtrless;': '\u2277',
+ 'gtrsim;': '\u2273',
+ 'gvertneqq;': '\u2269\ufe00',
+ 'gvnE;': '\u2269\ufe00',
+ 'Hacek;': '\u02c7',
+ 'hairsp;': '\u200a',
+ 'half;': '\xbd',
+ 'hamilt;': '\u210b',
+ 'HARDcy;': '\u042a',
+ 'hardcy;': '\u044a',
+ 'hArr;': '\u21d4',
+ 'harr;': '\u2194',
+ 'harrcir;': '\u2948',
+ 'harrw;': '\u21ad',
+ 'Hat;': '^',
+ 'hbar;': '\u210f',
+ 'Hcirc;': '\u0124',
+ 'hcirc;': '\u0125',
+ 'hearts;': '\u2665',
+ 'heartsuit;': '\u2665',
+ 'hellip;': '\u2026',
+ 'hercon;': '\u22b9',
+ 'Hfr;': '\u210c',
+ 'hfr;': '\U0001d525',
+ 'HilbertSpace;': '\u210b',
+ 'hksearow;': '\u2925',
+ 'hkswarow;': '\u2926',
+ 'hoarr;': '\u21ff',
+ 'homtht;': '\u223b',
+ 'hookleftarrow;': '\u21a9',
+ 'hookrightarrow;': '\u21aa',
+ 'Hopf;': '\u210d',
+ 'hopf;': '\U0001d559',
+ 'horbar;': '\u2015',
+ 'HorizontalLine;': '\u2500',
+ 'Hscr;': '\u210b',
+ 'hscr;': '\U0001d4bd',
+ 'hslash;': '\u210f',
+ 'Hstrok;': '\u0126',
+ 'hstrok;': '\u0127',
+ 'HumpDownHump;': '\u224e',
+ 'HumpEqual;': '\u224f',
+ 'hybull;': '\u2043',
+ 'hyphen;': '\u2010',
+ 'Iacute': '\xcd',
+ 'iacute': '\xed',
+ 'Iacute;': '\xcd',
+ 'iacute;': '\xed',
+ 'ic;': '\u2063',
+ 'Icirc': '\xce',
+ 'icirc': '\xee',
+ 'Icirc;': '\xce',
+ 'icirc;': '\xee',
+ 'Icy;': '\u0418',
+ 'icy;': '\u0438',
+ 'Idot;': '\u0130',
+ 'IEcy;': '\u0415',
+ 'iecy;': '\u0435',
+ 'iexcl': '\xa1',
+ 'iexcl;': '\xa1',
+ 'iff;': '\u21d4',
+ 'Ifr;': '\u2111',
+ 'ifr;': '\U0001d526',
+ 'Igrave': '\xcc',
+ 'igrave': '\xec',
+ 'Igrave;': '\xcc',
+ 'igrave;': '\xec',
+ 'ii;': '\u2148',
+ 'iiiint;': '\u2a0c',
+ 'iiint;': '\u222d',
+ 'iinfin;': '\u29dc',
+ 'iiota;': '\u2129',
+ 'IJlig;': '\u0132',
+ 'ijlig;': '\u0133',
+ 'Im;': '\u2111',
+ 'Imacr;': '\u012a',
+ 'imacr;': '\u012b',
+ 'image;': '\u2111',
+ 'ImaginaryI;': '\u2148',
+ 'imagline;': '\u2110',
+ 'imagpart;': '\u2111',
+ 'imath;': '\u0131',
+ 'imof;': '\u22b7',
+ 'imped;': '\u01b5',
+ 'Implies;': '\u21d2',
+ 'in;': '\u2208',
+ 'incare;': '\u2105',
+ 'infin;': '\u221e',
+ 'infintie;': '\u29dd',
+ 'inodot;': '\u0131',
+ 'Int;': '\u222c',
+ 'int;': '\u222b',
+ 'intcal;': '\u22ba',
+ 'integers;': '\u2124',
+ 'Integral;': '\u222b',
+ 'intercal;': '\u22ba',
+ 'Intersection;': '\u22c2',
+ 'intlarhk;': '\u2a17',
+ 'intprod;': '\u2a3c',
+ 'InvisibleComma;': '\u2063',
+ 'InvisibleTimes;': '\u2062',
+ 'IOcy;': '\u0401',
+ 'iocy;': '\u0451',
+ 'Iogon;': '\u012e',
+ 'iogon;': '\u012f',
+ 'Iopf;': '\U0001d540',
+ 'iopf;': '\U0001d55a',
+ 'Iota;': '\u0399',
+ 'iota;': '\u03b9',
+ 'iprod;': '\u2a3c',
+ 'iquest': '\xbf',
+ 'iquest;': '\xbf',
+ 'Iscr;': '\u2110',
+ 'iscr;': '\U0001d4be',
+ 'isin;': '\u2208',
+ 'isindot;': '\u22f5',
+ 'isinE;': '\u22f9',
+ 'isins;': '\u22f4',
+ 'isinsv;': '\u22f3',
+ 'isinv;': '\u2208',
+ 'it;': '\u2062',
+ 'Itilde;': '\u0128',
+ 'itilde;': '\u0129',
+ 'Iukcy;': '\u0406',
+ 'iukcy;': '\u0456',
+ 'Iuml': '\xcf',
+ 'iuml': '\xef',
+ 'Iuml;': '\xcf',
+ 'iuml;': '\xef',
+ 'Jcirc;': '\u0134',
+ 'jcirc;': '\u0135',
+ 'Jcy;': '\u0419',
+ 'jcy;': '\u0439',
+ 'Jfr;': '\U0001d50d',
+ 'jfr;': '\U0001d527',
+ 'jmath;': '\u0237',
+ 'Jopf;': '\U0001d541',
+ 'jopf;': '\U0001d55b',
+ 'Jscr;': '\U0001d4a5',
+ 'jscr;': '\U0001d4bf',
+ 'Jsercy;': '\u0408',
+ 'jsercy;': '\u0458',
+ 'Jukcy;': '\u0404',
+ 'jukcy;': '\u0454',
+ 'Kappa;': '\u039a',
+ 'kappa;': '\u03ba',
+ 'kappav;': '\u03f0',
+ 'Kcedil;': '\u0136',
+ 'kcedil;': '\u0137',
+ 'Kcy;': '\u041a',
+ 'kcy;': '\u043a',
+ 'Kfr;': '\U0001d50e',
+ 'kfr;': '\U0001d528',
+ 'kgreen;': '\u0138',
+ 'KHcy;': '\u0425',
+ 'khcy;': '\u0445',
+ 'KJcy;': '\u040c',
+ 'kjcy;': '\u045c',
+ 'Kopf;': '\U0001d542',
+ 'kopf;': '\U0001d55c',
+ 'Kscr;': '\U0001d4a6',
+ 'kscr;': '\U0001d4c0',
+ 'lAarr;': '\u21da',
+ 'Lacute;': '\u0139',
+ 'lacute;': '\u013a',
+ 'laemptyv;': '\u29b4',
+ 'lagran;': '\u2112',
+ 'Lambda;': '\u039b',
+ 'lambda;': '\u03bb',
+ 'Lang;': '\u27ea',
+ 'lang;': '\u27e8',
+ 'langd;': '\u2991',
+ 'langle;': '\u27e8',
+ 'lap;': '\u2a85',
+ 'Laplacetrf;': '\u2112',
+ 'laquo': '\xab',
+ 'laquo;': '\xab',
+ 'Larr;': '\u219e',
+ 'lArr;': '\u21d0',
+ 'larr;': '\u2190',
+ 'larrb;': '\u21e4',
+ 'larrbfs;': '\u291f',
+ 'larrfs;': '\u291d',
+ 'larrhk;': '\u21a9',
+ 'larrlp;': '\u21ab',
+ 'larrpl;': '\u2939',
+ 'larrsim;': '\u2973',
+ 'larrtl;': '\u21a2',
+ 'lat;': '\u2aab',
+ 'lAtail;': '\u291b',
+ 'latail;': '\u2919',
+ 'late;': '\u2aad',
+ 'lates;': '\u2aad\ufe00',
+ 'lBarr;': '\u290e',
+ 'lbarr;': '\u290c',
+ 'lbbrk;': '\u2772',
+ 'lbrace;': '{',
+ 'lbrack;': '[',
+ 'lbrke;': '\u298b',
+ 'lbrksld;': '\u298f',
+ 'lbrkslu;': '\u298d',
+ 'Lcaron;': '\u013d',
+ 'lcaron;': '\u013e',
+ 'Lcedil;': '\u013b',
+ 'lcedil;': '\u013c',
+ 'lceil;': '\u2308',
+ 'lcub;': '{',
+ 'Lcy;': '\u041b',
+ 'lcy;': '\u043b',
+ 'ldca;': '\u2936',
+ 'ldquo;': '\u201c',
+ 'ldquor;': '\u201e',
+ 'ldrdhar;': '\u2967',
+ 'ldrushar;': '\u294b',
+ 'ldsh;': '\u21b2',
+ 'lE;': '\u2266',
+ 'le;': '\u2264',
+ 'LeftAngleBracket;': '\u27e8',
+ 'LeftArrow;': '\u2190',
+ 'Leftarrow;': '\u21d0',
+ 'leftarrow;': '\u2190',
+ 'LeftArrowBar;': '\u21e4',
+ 'LeftArrowRightArrow;': '\u21c6',
+ 'leftarrowtail;': '\u21a2',
+ 'LeftCeiling;': '\u2308',
+ 'LeftDoubleBracket;': '\u27e6',
+ 'LeftDownTeeVector;': '\u2961',
+ 'LeftDownVector;': '\u21c3',
+ 'LeftDownVectorBar;': '\u2959',
+ 'LeftFloor;': '\u230a',
+ 'leftharpoondown;': '\u21bd',
+ 'leftharpoonup;': '\u21bc',
+ 'leftleftarrows;': '\u21c7',
+ 'LeftRightArrow;': '\u2194',
+ 'Leftrightarrow;': '\u21d4',
+ 'leftrightarrow;': '\u2194',
+ 'leftrightarrows;': '\u21c6',
+ 'leftrightharpoons;': '\u21cb',
+ 'leftrightsquigarrow;': '\u21ad',
+ 'LeftRightVector;': '\u294e',
+ 'LeftTee;': '\u22a3',
+ 'LeftTeeArrow;': '\u21a4',
+ 'LeftTeeVector;': '\u295a',
+ 'leftthreetimes;': '\u22cb',
+ 'LeftTriangle;': '\u22b2',
+ 'LeftTriangleBar;': '\u29cf',
+ 'LeftTriangleEqual;': '\u22b4',
+ 'LeftUpDownVector;': '\u2951',
+ 'LeftUpTeeVector;': '\u2960',
+ 'LeftUpVector;': '\u21bf',
+ 'LeftUpVectorBar;': '\u2958',
+ 'LeftVector;': '\u21bc',
+ 'LeftVectorBar;': '\u2952',
+ 'lEg;': '\u2a8b',
+ 'leg;': '\u22da',
+ 'leq;': '\u2264',
+ 'leqq;': '\u2266',
+ 'leqslant;': '\u2a7d',
+ 'les;': '\u2a7d',
+ 'lescc;': '\u2aa8',
+ 'lesdot;': '\u2a7f',
+ 'lesdoto;': '\u2a81',
+ 'lesdotor;': '\u2a83',
+ 'lesg;': '\u22da\ufe00',
+ 'lesges;': '\u2a93',
+ 'lessapprox;': '\u2a85',
+ 'lessdot;': '\u22d6',
+ 'lesseqgtr;': '\u22da',
+ 'lesseqqgtr;': '\u2a8b',
+ 'LessEqualGreater;': '\u22da',
+ 'LessFullEqual;': '\u2266',
+ 'LessGreater;': '\u2276',
+ 'lessgtr;': '\u2276',
+ 'LessLess;': '\u2aa1',
+ 'lesssim;': '\u2272',
+ 'LessSlantEqual;': '\u2a7d',
+ 'LessTilde;': '\u2272',
+ 'lfisht;': '\u297c',
+ 'lfloor;': '\u230a',
+ 'Lfr;': '\U0001d50f',
+ 'lfr;': '\U0001d529',
+ 'lg;': '\u2276',
+ 'lgE;': '\u2a91',
+ 'lHar;': '\u2962',
+ 'lhard;': '\u21bd',
+ 'lharu;': '\u21bc',
+ 'lharul;': '\u296a',
+ 'lhblk;': '\u2584',
+ 'LJcy;': '\u0409',
+ 'ljcy;': '\u0459',
+ 'Ll;': '\u22d8',
+ 'll;': '\u226a',
+ 'llarr;': '\u21c7',
+ 'llcorner;': '\u231e',
+ 'Lleftarrow;': '\u21da',
+ 'llhard;': '\u296b',
+ 'lltri;': '\u25fa',
+ 'Lmidot;': '\u013f',
+ 'lmidot;': '\u0140',
+ 'lmoust;': '\u23b0',
+ 'lmoustache;': '\u23b0',
+ 'lnap;': '\u2a89',
+ 'lnapprox;': '\u2a89',
+ 'lnE;': '\u2268',
+ 'lne;': '\u2a87',
+ 'lneq;': '\u2a87',
+ 'lneqq;': '\u2268',
+ 'lnsim;': '\u22e6',
+ 'loang;': '\u27ec',
+ 'loarr;': '\u21fd',
+ 'lobrk;': '\u27e6',
+ 'LongLeftArrow;': '\u27f5',
+ 'Longleftarrow;': '\u27f8',
+ 'longleftarrow;': '\u27f5',
+ 'LongLeftRightArrow;': '\u27f7',
+ 'Longleftrightarrow;': '\u27fa',
+ 'longleftrightarrow;': '\u27f7',
+ 'longmapsto;': '\u27fc',
+ 'LongRightArrow;': '\u27f6',
+ 'Longrightarrow;': '\u27f9',
+ 'longrightarrow;': '\u27f6',
+ 'looparrowleft;': '\u21ab',
+ 'looparrowright;': '\u21ac',
+ 'lopar;': '\u2985',
+ 'Lopf;': '\U0001d543',
+ 'lopf;': '\U0001d55d',
+ 'loplus;': '\u2a2d',
+ 'lotimes;': '\u2a34',
+ 'lowast;': '\u2217',
+ 'lowbar;': '_',
+ 'LowerLeftArrow;': '\u2199',
+ 'LowerRightArrow;': '\u2198',
+ 'loz;': '\u25ca',
+ 'lozenge;': '\u25ca',
+ 'lozf;': '\u29eb',
+ 'lpar;': '(',
+ 'lparlt;': '\u2993',
+ 'lrarr;': '\u21c6',
+ 'lrcorner;': '\u231f',
+ 'lrhar;': '\u21cb',
+ 'lrhard;': '\u296d',
+ 'lrm;': '\u200e',
+ 'lrtri;': '\u22bf',
+ 'lsaquo;': '\u2039',
+ 'Lscr;': '\u2112',
+ 'lscr;': '\U0001d4c1',
+ 'Lsh;': '\u21b0',
+ 'lsh;': '\u21b0',
+ 'lsim;': '\u2272',
+ 'lsime;': '\u2a8d',
+ 'lsimg;': '\u2a8f',
+ 'lsqb;': '[',
+ 'lsquo;': '\u2018',
+ 'lsquor;': '\u201a',
+ 'Lstrok;': '\u0141',
+ 'lstrok;': '\u0142',
+ 'LT': '<',
+ 'lt': '<',
+ 'LT;': '<',
+ 'Lt;': '\u226a',
+ 'lt;': '<',
+ 'ltcc;': '\u2aa6',
+ 'ltcir;': '\u2a79',
+ 'ltdot;': '\u22d6',
+ 'lthree;': '\u22cb',
+ 'ltimes;': '\u22c9',
+ 'ltlarr;': '\u2976',
+ 'ltquest;': '\u2a7b',
+ 'ltri;': '\u25c3',
+ 'ltrie;': '\u22b4',
+ 'ltrif;': '\u25c2',
+ 'ltrPar;': '\u2996',
+ 'lurdshar;': '\u294a',
+ 'luruhar;': '\u2966',
+ 'lvertneqq;': '\u2268\ufe00',
+ 'lvnE;': '\u2268\ufe00',
+ 'macr': '\xaf',
+ 'macr;': '\xaf',
+ 'male;': '\u2642',
+ 'malt;': '\u2720',
+ 'maltese;': '\u2720',
+ 'Map;': '\u2905',
+ 'map;': '\u21a6',
+ 'mapsto;': '\u21a6',
+ 'mapstodown;': '\u21a7',
+ 'mapstoleft;': '\u21a4',
+ 'mapstoup;': '\u21a5',
+ 'marker;': '\u25ae',
+ 'mcomma;': '\u2a29',
+ 'Mcy;': '\u041c',
+ 'mcy;': '\u043c',
+ 'mdash;': '\u2014',
+ 'mDDot;': '\u223a',
+ 'measuredangle;': '\u2221',
+ 'MediumSpace;': '\u205f',
+ 'Mellintrf;': '\u2133',
+ 'Mfr;': '\U0001d510',
+ 'mfr;': '\U0001d52a',
+ 'mho;': '\u2127',
+ 'micro': '\xb5',
+ 'micro;': '\xb5',
+ 'mid;': '\u2223',
+ 'midast;': '*',
+ 'midcir;': '\u2af0',
+ 'middot': '\xb7',
+ 'middot;': '\xb7',
+ 'minus;': '\u2212',
+ 'minusb;': '\u229f',
+ 'minusd;': '\u2238',
+ 'minusdu;': '\u2a2a',
+ 'MinusPlus;': '\u2213',
+ 'mlcp;': '\u2adb',
+ 'mldr;': '\u2026',
+ 'mnplus;': '\u2213',
+ 'models;': '\u22a7',
+ 'Mopf;': '\U0001d544',
+ 'mopf;': '\U0001d55e',
+ 'mp;': '\u2213',
+ 'Mscr;': '\u2133',
+ 'mscr;': '\U0001d4c2',
+ 'mstpos;': '\u223e',
+ 'Mu;': '\u039c',
+ 'mu;': '\u03bc',
+ 'multimap;': '\u22b8',
+ 'mumap;': '\u22b8',
+ 'nabla;': '\u2207',
+ 'Nacute;': '\u0143',
+ 'nacute;': '\u0144',
+ 'nang;': '\u2220\u20d2',
+ 'nap;': '\u2249',
+ 'napE;': '\u2a70\u0338',
+ 'napid;': '\u224b\u0338',
+ 'napos;': '\u0149',
+ 'napprox;': '\u2249',
+ 'natur;': '\u266e',
+ 'natural;': '\u266e',
+ 'naturals;': '\u2115',
+ 'nbsp': '\xa0',
+ 'nbsp;': '\xa0',
+ 'nbump;': '\u224e\u0338',
+ 'nbumpe;': '\u224f\u0338',
+ 'ncap;': '\u2a43',
+ 'Ncaron;': '\u0147',
+ 'ncaron;': '\u0148',
+ 'Ncedil;': '\u0145',
+ 'ncedil;': '\u0146',
+ 'ncong;': '\u2247',
+ 'ncongdot;': '\u2a6d\u0338',
+ 'ncup;': '\u2a42',
+ 'Ncy;': '\u041d',
+ 'ncy;': '\u043d',
+ 'ndash;': '\u2013',
+ 'ne;': '\u2260',
+ 'nearhk;': '\u2924',
+ 'neArr;': '\u21d7',
+ 'nearr;': '\u2197',
+ 'nearrow;': '\u2197',
+ 'nedot;': '\u2250\u0338',
+ 'NegativeMediumSpace;': '\u200b',
+ 'NegativeThickSpace;': '\u200b',
+ 'NegativeThinSpace;': '\u200b',
+ 'NegativeVeryThinSpace;': '\u200b',
+ 'nequiv;': '\u2262',
+ 'nesear;': '\u2928',
+ 'nesim;': '\u2242\u0338',
+ 'NestedGreaterGreater;': '\u226b',
+ 'NestedLessLess;': '\u226a',
+ 'NewLine;': '\n',
+ 'nexist;': '\u2204',
+ 'nexists;': '\u2204',
+ 'Nfr;': '\U0001d511',
+ 'nfr;': '\U0001d52b',
+ 'ngE;': '\u2267\u0338',
+ 'nge;': '\u2271',
+ 'ngeq;': '\u2271',
+ 'ngeqq;': '\u2267\u0338',
+ 'ngeqslant;': '\u2a7e\u0338',
+ 'nges;': '\u2a7e\u0338',
+ 'nGg;': '\u22d9\u0338',
+ 'ngsim;': '\u2275',
+ 'nGt;': '\u226b\u20d2',
+ 'ngt;': '\u226f',
+ 'ngtr;': '\u226f',
+ 'nGtv;': '\u226b\u0338',
+ 'nhArr;': '\u21ce',
+ 'nharr;': '\u21ae',
+ 'nhpar;': '\u2af2',
+ 'ni;': '\u220b',
+ 'nis;': '\u22fc',
+ 'nisd;': '\u22fa',
+ 'niv;': '\u220b',
+ 'NJcy;': '\u040a',
+ 'njcy;': '\u045a',
+ 'nlArr;': '\u21cd',
+ 'nlarr;': '\u219a',
+ 'nldr;': '\u2025',
+ 'nlE;': '\u2266\u0338',
+ 'nle;': '\u2270',
+ 'nLeftarrow;': '\u21cd',
+ 'nleftarrow;': '\u219a',
+ 'nLeftrightarrow;': '\u21ce',
+ 'nleftrightarrow;': '\u21ae',
+ 'nleq;': '\u2270',
+ 'nleqq;': '\u2266\u0338',
+ 'nleqslant;': '\u2a7d\u0338',
+ 'nles;': '\u2a7d\u0338',
+ 'nless;': '\u226e',
+ 'nLl;': '\u22d8\u0338',
+ 'nlsim;': '\u2274',
+ 'nLt;': '\u226a\u20d2',
+ 'nlt;': '\u226e',
+ 'nltri;': '\u22ea',
+ 'nltrie;': '\u22ec',
+ 'nLtv;': '\u226a\u0338',
+ 'nmid;': '\u2224',
+ 'NoBreak;': '\u2060',
+ 'NonBreakingSpace;': '\xa0',
+ 'Nopf;': '\u2115',
+ 'nopf;': '\U0001d55f',
+ 'not': '\xac',
+ 'Not;': '\u2aec',
+ 'not;': '\xac',
+ 'NotCongruent;': '\u2262',
+ 'NotCupCap;': '\u226d',
+ 'NotDoubleVerticalBar;': '\u2226',
+ 'NotElement;': '\u2209',
+ 'NotEqual;': '\u2260',
+ 'NotEqualTilde;': '\u2242\u0338',
+ 'NotExists;': '\u2204',
+ 'NotGreater;': '\u226f',
+ 'NotGreaterEqual;': '\u2271',
+ 'NotGreaterFullEqual;': '\u2267\u0338',
+ 'NotGreaterGreater;': '\u226b\u0338',
+ 'NotGreaterLess;': '\u2279',
+ 'NotGreaterSlantEqual;': '\u2a7e\u0338',
+ 'NotGreaterTilde;': '\u2275',
+ 'NotHumpDownHump;': '\u224e\u0338',
+ 'NotHumpEqual;': '\u224f\u0338',
+ 'notin;': '\u2209',
+ 'notindot;': '\u22f5\u0338',
+ 'notinE;': '\u22f9\u0338',
+ 'notinva;': '\u2209',
+ 'notinvb;': '\u22f7',
+ 'notinvc;': '\u22f6',
+ 'NotLeftTriangle;': '\u22ea',
+ 'NotLeftTriangleBar;': '\u29cf\u0338',
+ 'NotLeftTriangleEqual;': '\u22ec',
+ 'NotLess;': '\u226e',
+ 'NotLessEqual;': '\u2270',
+ 'NotLessGreater;': '\u2278',
+ 'NotLessLess;': '\u226a\u0338',
+ 'NotLessSlantEqual;': '\u2a7d\u0338',
+ 'NotLessTilde;': '\u2274',
+ 'NotNestedGreaterGreater;': '\u2aa2\u0338',
+ 'NotNestedLessLess;': '\u2aa1\u0338',
+ 'notni;': '\u220c',
+ 'notniva;': '\u220c',
+ 'notnivb;': '\u22fe',
+ 'notnivc;': '\u22fd',
+ 'NotPrecedes;': '\u2280',
+ 'NotPrecedesEqual;': '\u2aaf\u0338',
+ 'NotPrecedesSlantEqual;': '\u22e0',
+ 'NotReverseElement;': '\u220c',
+ 'NotRightTriangle;': '\u22eb',
+ 'NotRightTriangleBar;': '\u29d0\u0338',
+ 'NotRightTriangleEqual;': '\u22ed',
+ 'NotSquareSubset;': '\u228f\u0338',
+ 'NotSquareSubsetEqual;': '\u22e2',
+ 'NotSquareSuperset;': '\u2290\u0338',
+ 'NotSquareSupersetEqual;': '\u22e3',
+ 'NotSubset;': '\u2282\u20d2',
+ 'NotSubsetEqual;': '\u2288',
+ 'NotSucceeds;': '\u2281',
+ 'NotSucceedsEqual;': '\u2ab0\u0338',
+ 'NotSucceedsSlantEqual;': '\u22e1',
+ 'NotSucceedsTilde;': '\u227f\u0338',
+ 'NotSuperset;': '\u2283\u20d2',
+ 'NotSupersetEqual;': '\u2289',
+ 'NotTilde;': '\u2241',
+ 'NotTildeEqual;': '\u2244',
+ 'NotTildeFullEqual;': '\u2247',
+ 'NotTildeTilde;': '\u2249',
+ 'NotVerticalBar;': '\u2224',
+ 'npar;': '\u2226',
+ 'nparallel;': '\u2226',
+ 'nparsl;': '\u2afd\u20e5',
+ 'npart;': '\u2202\u0338',
+ 'npolint;': '\u2a14',
+ 'npr;': '\u2280',
+ 'nprcue;': '\u22e0',
+ 'npre;': '\u2aaf\u0338',
+ 'nprec;': '\u2280',
+ 'npreceq;': '\u2aaf\u0338',
+ 'nrArr;': '\u21cf',
+ 'nrarr;': '\u219b',
+ 'nrarrc;': '\u2933\u0338',
+ 'nrarrw;': '\u219d\u0338',
+ 'nRightarrow;': '\u21cf',
+ 'nrightarrow;': '\u219b',
+ 'nrtri;': '\u22eb',
+ 'nrtrie;': '\u22ed',
+ 'nsc;': '\u2281',
+ 'nsccue;': '\u22e1',
+ 'nsce;': '\u2ab0\u0338',
+ 'Nscr;': '\U0001d4a9',
+ 'nscr;': '\U0001d4c3',
+ 'nshortmid;': '\u2224',
+ 'nshortparallel;': '\u2226',
+ 'nsim;': '\u2241',
+ 'nsime;': '\u2244',
+ 'nsimeq;': '\u2244',
+ 'nsmid;': '\u2224',
+ 'nspar;': '\u2226',
+ 'nsqsube;': '\u22e2',
+ 'nsqsupe;': '\u22e3',
+ 'nsub;': '\u2284',
+ 'nsubE;': '\u2ac5\u0338',
+ 'nsube;': '\u2288',
+ 'nsubset;': '\u2282\u20d2',
+ 'nsubseteq;': '\u2288',
+ 'nsubseteqq;': '\u2ac5\u0338',
+ 'nsucc;': '\u2281',
+ 'nsucceq;': '\u2ab0\u0338',
+ 'nsup;': '\u2285',
+ 'nsupE;': '\u2ac6\u0338',
+ 'nsupe;': '\u2289',
+ 'nsupset;': '\u2283\u20d2',
+ 'nsupseteq;': '\u2289',
+ 'nsupseteqq;': '\u2ac6\u0338',
+ 'ntgl;': '\u2279',
+ 'Ntilde': '\xd1',
+ 'ntilde': '\xf1',
+ 'Ntilde;': '\xd1',
+ 'ntilde;': '\xf1',
+ 'ntlg;': '\u2278',
+ 'ntriangleleft;': '\u22ea',
+ 'ntrianglelefteq;': '\u22ec',
+ 'ntriangleright;': '\u22eb',
+ 'ntrianglerighteq;': '\u22ed',
+ 'Nu;': '\u039d',
+ 'nu;': '\u03bd',
+ 'num;': '#',
+ 'numero;': '\u2116',
+ 'numsp;': '\u2007',
+ 'nvap;': '\u224d\u20d2',
+ 'nVDash;': '\u22af',
+ 'nVdash;': '\u22ae',
+ 'nvDash;': '\u22ad',
+ 'nvdash;': '\u22ac',
+ 'nvge;': '\u2265\u20d2',
+ 'nvgt;': '>\u20d2',
+ 'nvHarr;': '\u2904',
+ 'nvinfin;': '\u29de',
+ 'nvlArr;': '\u2902',
+ 'nvle;': '\u2264\u20d2',
+ 'nvlt;': '<\u20d2',
+ 'nvltrie;': '\u22b4\u20d2',
+ 'nvrArr;': '\u2903',
+ 'nvrtrie;': '\u22b5\u20d2',
+ 'nvsim;': '\u223c\u20d2',
+ 'nwarhk;': '\u2923',
+ 'nwArr;': '\u21d6',
+ 'nwarr;': '\u2196',
+ 'nwarrow;': '\u2196',
+ 'nwnear;': '\u2927',
+ 'Oacute': '\xd3',
+ 'oacute': '\xf3',
+ 'Oacute;': '\xd3',
+ 'oacute;': '\xf3',
+ 'oast;': '\u229b',
+ 'ocir;': '\u229a',
+ 'Ocirc': '\xd4',
+ 'ocirc': '\xf4',
+ 'Ocirc;': '\xd4',
+ 'ocirc;': '\xf4',
+ 'Ocy;': '\u041e',
+ 'ocy;': '\u043e',
+ 'odash;': '\u229d',
+ 'Odblac;': '\u0150',
+ 'odblac;': '\u0151',
+ 'odiv;': '\u2a38',
+ 'odot;': '\u2299',
+ 'odsold;': '\u29bc',
+ 'OElig;': '\u0152',
+ 'oelig;': '\u0153',
+ 'ofcir;': '\u29bf',
+ 'Ofr;': '\U0001d512',
+ 'ofr;': '\U0001d52c',
+ 'ogon;': '\u02db',
+ 'Ograve': '\xd2',
+ 'ograve': '\xf2',
+ 'Ograve;': '\xd2',
+ 'ograve;': '\xf2',
+ 'ogt;': '\u29c1',
+ 'ohbar;': '\u29b5',
+ 'ohm;': '\u03a9',
+ 'oint;': '\u222e',
+ 'olarr;': '\u21ba',
+ 'olcir;': '\u29be',
+ 'olcross;': '\u29bb',
+ 'oline;': '\u203e',
+ 'olt;': '\u29c0',
+ 'Omacr;': '\u014c',
+ 'omacr;': '\u014d',
+ 'Omega;': '\u03a9',
+ 'omega;': '\u03c9',
+ 'Omicron;': '\u039f',
+ 'omicron;': '\u03bf',
+ 'omid;': '\u29b6',
+ 'ominus;': '\u2296',
+ 'Oopf;': '\U0001d546',
+ 'oopf;': '\U0001d560',
+ 'opar;': '\u29b7',
+ 'OpenCurlyDoubleQuote;': '\u201c',
+ 'OpenCurlyQuote;': '\u2018',
+ 'operp;': '\u29b9',
+ 'oplus;': '\u2295',
+ 'Or;': '\u2a54',
+ 'or;': '\u2228',
+ 'orarr;': '\u21bb',
+ 'ord;': '\u2a5d',
+ 'order;': '\u2134',
+ 'orderof;': '\u2134',
+ 'ordf': '\xaa',
+ 'ordf;': '\xaa',
+ 'ordm': '\xba',
+ 'ordm;': '\xba',
+ 'origof;': '\u22b6',
+ 'oror;': '\u2a56',
+ 'orslope;': '\u2a57',
+ 'orv;': '\u2a5b',
+ 'oS;': '\u24c8',
+ 'Oscr;': '\U0001d4aa',
+ 'oscr;': '\u2134',
+ 'Oslash': '\xd8',
+ 'oslash': '\xf8',
+ 'Oslash;': '\xd8',
+ 'oslash;': '\xf8',
+ 'osol;': '\u2298',
+ 'Otilde': '\xd5',
+ 'otilde': '\xf5',
+ 'Otilde;': '\xd5',
+ 'otilde;': '\xf5',
+ 'Otimes;': '\u2a37',
+ 'otimes;': '\u2297',
+ 'otimesas;': '\u2a36',
+ 'Ouml': '\xd6',
+ 'ouml': '\xf6',
+ 'Ouml;': '\xd6',
+ 'ouml;': '\xf6',
+ 'ovbar;': '\u233d',
+ 'OverBar;': '\u203e',
+ 'OverBrace;': '\u23de',
+ 'OverBracket;': '\u23b4',
+ 'OverParenthesis;': '\u23dc',
+ 'par;': '\u2225',
+ 'para': '\xb6',
+ 'para;': '\xb6',
+ 'parallel;': '\u2225',
+ 'parsim;': '\u2af3',
+ 'parsl;': '\u2afd',
+ 'part;': '\u2202',
+ 'PartialD;': '\u2202',
+ 'Pcy;': '\u041f',
+ 'pcy;': '\u043f',
+ 'percnt;': '%',
+ 'period;': '.',
+ 'permil;': '\u2030',
+ 'perp;': '\u22a5',
+ 'pertenk;': '\u2031',
+ 'Pfr;': '\U0001d513',
+ 'pfr;': '\U0001d52d',
+ 'Phi;': '\u03a6',
+ 'phi;': '\u03c6',
+ 'phiv;': '\u03d5',
+ 'phmmat;': '\u2133',
+ 'phone;': '\u260e',
+ 'Pi;': '\u03a0',
+ 'pi;': '\u03c0',
+ 'pitchfork;': '\u22d4',
+ 'piv;': '\u03d6',
+ 'planck;': '\u210f',
+ 'planckh;': '\u210e',
+ 'plankv;': '\u210f',
+ 'plus;': '+',
+ 'plusacir;': '\u2a23',
+ 'plusb;': '\u229e',
+ 'pluscir;': '\u2a22',
+ 'plusdo;': '\u2214',
+ 'plusdu;': '\u2a25',
+ 'pluse;': '\u2a72',
+ 'PlusMinus;': '\xb1',
+ 'plusmn': '\xb1',
+ 'plusmn;': '\xb1',
+ 'plussim;': '\u2a26',
+ 'plustwo;': '\u2a27',
+ 'pm;': '\xb1',
+ 'Poincareplane;': '\u210c',
+ 'pointint;': '\u2a15',
+ 'Popf;': '\u2119',
+ 'popf;': '\U0001d561',
+ 'pound': '\xa3',
+ 'pound;': '\xa3',
+ 'Pr;': '\u2abb',
+ 'pr;': '\u227a',
+ 'prap;': '\u2ab7',
+ 'prcue;': '\u227c',
+ 'prE;': '\u2ab3',
+ 'pre;': '\u2aaf',
+ 'prec;': '\u227a',
+ 'precapprox;': '\u2ab7',
+ 'preccurlyeq;': '\u227c',
+ 'Precedes;': '\u227a',
+ 'PrecedesEqual;': '\u2aaf',
+ 'PrecedesSlantEqual;': '\u227c',
+ 'PrecedesTilde;': '\u227e',
+ 'preceq;': '\u2aaf',
+ 'precnapprox;': '\u2ab9',
+ 'precneqq;': '\u2ab5',
+ 'precnsim;': '\u22e8',
+ 'precsim;': '\u227e',
+ 'Prime;': '\u2033',
+ 'prime;': '\u2032',
+ 'primes;': '\u2119',
+ 'prnap;': '\u2ab9',
+ 'prnE;': '\u2ab5',
+ 'prnsim;': '\u22e8',
+ 'prod;': '\u220f',
+ 'Product;': '\u220f',
+ 'profalar;': '\u232e',
+ 'profline;': '\u2312',
+ 'profsurf;': '\u2313',
+ 'prop;': '\u221d',
+ 'Proportion;': '\u2237',
+ 'Proportional;': '\u221d',
+ 'propto;': '\u221d',
+ 'prsim;': '\u227e',
+ 'prurel;': '\u22b0',
+ 'Pscr;': '\U0001d4ab',
+ 'pscr;': '\U0001d4c5',
+ 'Psi;': '\u03a8',
+ 'psi;': '\u03c8',
+ 'puncsp;': '\u2008',
+ 'Qfr;': '\U0001d514',
+ 'qfr;': '\U0001d52e',
+ 'qint;': '\u2a0c',
+ 'Qopf;': '\u211a',
+ 'qopf;': '\U0001d562',
+ 'qprime;': '\u2057',
+ 'Qscr;': '\U0001d4ac',
+ 'qscr;': '\U0001d4c6',
+ 'quaternions;': '\u210d',
+ 'quatint;': '\u2a16',
+ 'quest;': '?',
+ 'questeq;': '\u225f',
+ 'QUOT': '"',
+ 'quot': '"',
+ 'QUOT;': '"',
+ 'quot;': '"',
+ 'rAarr;': '\u21db',
+ 'race;': '\u223d\u0331',
+ 'Racute;': '\u0154',
+ 'racute;': '\u0155',
+ 'radic;': '\u221a',
+ 'raemptyv;': '\u29b3',
+ 'Rang;': '\u27eb',
+ 'rang;': '\u27e9',
+ 'rangd;': '\u2992',
+ 'range;': '\u29a5',
+ 'rangle;': '\u27e9',
+ 'raquo': '\xbb',
+ 'raquo;': '\xbb',
+ 'Rarr;': '\u21a0',
+ 'rArr;': '\u21d2',
+ 'rarr;': '\u2192',
+ 'rarrap;': '\u2975',
+ 'rarrb;': '\u21e5',
+ 'rarrbfs;': '\u2920',
+ 'rarrc;': '\u2933',
+ 'rarrfs;': '\u291e',
+ 'rarrhk;': '\u21aa',
+ 'rarrlp;': '\u21ac',
+ 'rarrpl;': '\u2945',
+ 'rarrsim;': '\u2974',
+ 'Rarrtl;': '\u2916',
+ 'rarrtl;': '\u21a3',
+ 'rarrw;': '\u219d',
+ 'rAtail;': '\u291c',
+ 'ratail;': '\u291a',
+ 'ratio;': '\u2236',
+ 'rationals;': '\u211a',
+ 'RBarr;': '\u2910',
+ 'rBarr;': '\u290f',
+ 'rbarr;': '\u290d',
+ 'rbbrk;': '\u2773',
+ 'rbrace;': '}',
+ 'rbrack;': ']',
+ 'rbrke;': '\u298c',
+ 'rbrksld;': '\u298e',
+ 'rbrkslu;': '\u2990',
+ 'Rcaron;': '\u0158',
+ 'rcaron;': '\u0159',
+ 'Rcedil;': '\u0156',
+ 'rcedil;': '\u0157',
+ 'rceil;': '\u2309',
+ 'rcub;': '}',
+ 'Rcy;': '\u0420',
+ 'rcy;': '\u0440',
+ 'rdca;': '\u2937',
+ 'rdldhar;': '\u2969',
+ 'rdquo;': '\u201d',
+ 'rdquor;': '\u201d',
+ 'rdsh;': '\u21b3',
+ 'Re;': '\u211c',
+ 'real;': '\u211c',
+ 'realine;': '\u211b',
+ 'realpart;': '\u211c',
+ 'reals;': '\u211d',
+ 'rect;': '\u25ad',
+ 'REG': '\xae',
+ 'reg': '\xae',
+ 'REG;': '\xae',
+ 'reg;': '\xae',
+ 'ReverseElement;': '\u220b',
+ 'ReverseEquilibrium;': '\u21cb',
+ 'ReverseUpEquilibrium;': '\u296f',
+ 'rfisht;': '\u297d',
+ 'rfloor;': '\u230b',
+ 'Rfr;': '\u211c',
+ 'rfr;': '\U0001d52f',
+ 'rHar;': '\u2964',
+ 'rhard;': '\u21c1',
+ 'rharu;': '\u21c0',
+ 'rharul;': '\u296c',
+ 'Rho;': '\u03a1',
+ 'rho;': '\u03c1',
+ 'rhov;': '\u03f1',
+ 'RightAngleBracket;': '\u27e9',
+ 'RightArrow;': '\u2192',
+ 'Rightarrow;': '\u21d2',
+ 'rightarrow;': '\u2192',
+ 'RightArrowBar;': '\u21e5',
+ 'RightArrowLeftArrow;': '\u21c4',
+ 'rightarrowtail;': '\u21a3',
+ 'RightCeiling;': '\u2309',
+ 'RightDoubleBracket;': '\u27e7',
+ 'RightDownTeeVector;': '\u295d',
+ 'RightDownVector;': '\u21c2',
+ 'RightDownVectorBar;': '\u2955',
+ 'RightFloor;': '\u230b',
+ 'rightharpoondown;': '\u21c1',
+ 'rightharpoonup;': '\u21c0',
+ 'rightleftarrows;': '\u21c4',
+ 'rightleftharpoons;': '\u21cc',
+ 'rightrightarrows;': '\u21c9',
+ 'rightsquigarrow;': '\u219d',
+ 'RightTee;': '\u22a2',
+ 'RightTeeArrow;': '\u21a6',
+ 'RightTeeVector;': '\u295b',
+ 'rightthreetimes;': '\u22cc',
+ 'RightTriangle;': '\u22b3',
+ 'RightTriangleBar;': '\u29d0',
+ 'RightTriangleEqual;': '\u22b5',
+ 'RightUpDownVector;': '\u294f',
+ 'RightUpTeeVector;': '\u295c',
+ 'RightUpVector;': '\u21be',
+ 'RightUpVectorBar;': '\u2954',
+ 'RightVector;': '\u21c0',
+ 'RightVectorBar;': '\u2953',
+ 'ring;': '\u02da',
+ 'risingdotseq;': '\u2253',
+ 'rlarr;': '\u21c4',
+ 'rlhar;': '\u21cc',
+ 'rlm;': '\u200f',
+ 'rmoust;': '\u23b1',
+ 'rmoustache;': '\u23b1',
+ 'rnmid;': '\u2aee',
+ 'roang;': '\u27ed',
+ 'roarr;': '\u21fe',
+ 'robrk;': '\u27e7',
+ 'ropar;': '\u2986',
+ 'Ropf;': '\u211d',
+ 'ropf;': '\U0001d563',
+ 'roplus;': '\u2a2e',
+ 'rotimes;': '\u2a35',
+ 'RoundImplies;': '\u2970',
+ 'rpar;': ')',
+ 'rpargt;': '\u2994',
+ 'rppolint;': '\u2a12',
+ 'rrarr;': '\u21c9',
+ 'Rrightarrow;': '\u21db',
+ 'rsaquo;': '\u203a',
+ 'Rscr;': '\u211b',
+ 'rscr;': '\U0001d4c7',
+ 'Rsh;': '\u21b1',
+ 'rsh;': '\u21b1',
+ 'rsqb;': ']',
+ 'rsquo;': '\u2019',
+ 'rsquor;': '\u2019',
+ 'rthree;': '\u22cc',
+ 'rtimes;': '\u22ca',
+ 'rtri;': '\u25b9',
+ 'rtrie;': '\u22b5',
+ 'rtrif;': '\u25b8',
+ 'rtriltri;': '\u29ce',
+ 'RuleDelayed;': '\u29f4',
+ 'ruluhar;': '\u2968',
+ 'rx;': '\u211e',
+ 'Sacute;': '\u015a',
+ 'sacute;': '\u015b',
+ 'sbquo;': '\u201a',
+ 'Sc;': '\u2abc',
+ 'sc;': '\u227b',
+ 'scap;': '\u2ab8',
+ 'Scaron;': '\u0160',
+ 'scaron;': '\u0161',
+ 'sccue;': '\u227d',
+ 'scE;': '\u2ab4',
+ 'sce;': '\u2ab0',
+ 'Scedil;': '\u015e',
+ 'scedil;': '\u015f',
+ 'Scirc;': '\u015c',
+ 'scirc;': '\u015d',
+ 'scnap;': '\u2aba',
+ 'scnE;': '\u2ab6',
+ 'scnsim;': '\u22e9',
+ 'scpolint;': '\u2a13',
+ 'scsim;': '\u227f',
+ 'Scy;': '\u0421',
+ 'scy;': '\u0441',
+ 'sdot;': '\u22c5',
+ 'sdotb;': '\u22a1',
+ 'sdote;': '\u2a66',
+ 'searhk;': '\u2925',
+ 'seArr;': '\u21d8',
+ 'searr;': '\u2198',
+ 'searrow;': '\u2198',
+ 'sect': '\xa7',
+ 'sect;': '\xa7',
+ 'semi;': ';',
+ 'seswar;': '\u2929',
+ 'setminus;': '\u2216',
+ 'setmn;': '\u2216',
+ 'sext;': '\u2736',
+ 'Sfr;': '\U0001d516',
+ 'sfr;': '\U0001d530',
+ 'sfrown;': '\u2322',
+ 'sharp;': '\u266f',
+ 'SHCHcy;': '\u0429',
+ 'shchcy;': '\u0449',
+ 'SHcy;': '\u0428',
+ 'shcy;': '\u0448',
+ 'ShortDownArrow;': '\u2193',
+ 'ShortLeftArrow;': '\u2190',
+ 'shortmid;': '\u2223',
+ 'shortparallel;': '\u2225',
+ 'ShortRightArrow;': '\u2192',
+ 'ShortUpArrow;': '\u2191',
+ 'shy': '\xad',
+ 'shy;': '\xad',
+ 'Sigma;': '\u03a3',
+ 'sigma;': '\u03c3',
+ 'sigmaf;': '\u03c2',
+ 'sigmav;': '\u03c2',
+ 'sim;': '\u223c',
+ 'simdot;': '\u2a6a',
+ 'sime;': '\u2243',
+ 'simeq;': '\u2243',
+ 'simg;': '\u2a9e',
+ 'simgE;': '\u2aa0',
+ 'siml;': '\u2a9d',
+ 'simlE;': '\u2a9f',
+ 'simne;': '\u2246',
+ 'simplus;': '\u2a24',
+ 'simrarr;': '\u2972',
+ 'slarr;': '\u2190',
+ 'SmallCircle;': '\u2218',
+ 'smallsetminus;': '\u2216',
+ 'smashp;': '\u2a33',
+ 'smeparsl;': '\u29e4',
+ 'smid;': '\u2223',
+ 'smile;': '\u2323',
+ 'smt;': '\u2aaa',
+ 'smte;': '\u2aac',
+ 'smtes;': '\u2aac\ufe00',
+ 'SOFTcy;': '\u042c',
+ 'softcy;': '\u044c',
+ 'sol;': '/',
+ 'solb;': '\u29c4',
+ 'solbar;': '\u233f',
+ 'Sopf;': '\U0001d54a',
+ 'sopf;': '\U0001d564',
+ 'spades;': '\u2660',
+ 'spadesuit;': '\u2660',
+ 'spar;': '\u2225',
+ 'sqcap;': '\u2293',
+ 'sqcaps;': '\u2293\ufe00',
+ 'sqcup;': '\u2294',
+ 'sqcups;': '\u2294\ufe00',
+ 'Sqrt;': '\u221a',
+ 'sqsub;': '\u228f',
+ 'sqsube;': '\u2291',
+ 'sqsubset;': '\u228f',
+ 'sqsubseteq;': '\u2291',
+ 'sqsup;': '\u2290',
+ 'sqsupe;': '\u2292',
+ 'sqsupset;': '\u2290',
+ 'sqsupseteq;': '\u2292',
+ 'squ;': '\u25a1',
+ 'Square;': '\u25a1',
+ 'square;': '\u25a1',
+ 'SquareIntersection;': '\u2293',
+ 'SquareSubset;': '\u228f',
+ 'SquareSubsetEqual;': '\u2291',
+ 'SquareSuperset;': '\u2290',
+ 'SquareSupersetEqual;': '\u2292',
+ 'SquareUnion;': '\u2294',
+ 'squarf;': '\u25aa',
+ 'squf;': '\u25aa',
+ 'srarr;': '\u2192',
+ 'Sscr;': '\U0001d4ae',
+ 'sscr;': '\U0001d4c8',
+ 'ssetmn;': '\u2216',
+ 'ssmile;': '\u2323',
+ 'sstarf;': '\u22c6',
+ 'Star;': '\u22c6',
+ 'star;': '\u2606',
+ 'starf;': '\u2605',
+ 'straightepsilon;': '\u03f5',
+ 'straightphi;': '\u03d5',
+ 'strns;': '\xaf',
+ 'Sub;': '\u22d0',
+ 'sub;': '\u2282',
+ 'subdot;': '\u2abd',
+ 'subE;': '\u2ac5',
+ 'sube;': '\u2286',
+ 'subedot;': '\u2ac3',
+ 'submult;': '\u2ac1',
+ 'subnE;': '\u2acb',
+ 'subne;': '\u228a',
+ 'subplus;': '\u2abf',
+ 'subrarr;': '\u2979',
+ 'Subset;': '\u22d0',
+ 'subset;': '\u2282',
+ 'subseteq;': '\u2286',
+ 'subseteqq;': '\u2ac5',
+ 'SubsetEqual;': '\u2286',
+ 'subsetneq;': '\u228a',
+ 'subsetneqq;': '\u2acb',
+ 'subsim;': '\u2ac7',
+ 'subsub;': '\u2ad5',
+ 'subsup;': '\u2ad3',
+ 'succ;': '\u227b',
+ 'succapprox;': '\u2ab8',
+ 'succcurlyeq;': '\u227d',
+ 'Succeeds;': '\u227b',
+ 'SucceedsEqual;': '\u2ab0',
+ 'SucceedsSlantEqual;': '\u227d',
+ 'SucceedsTilde;': '\u227f',
+ 'succeq;': '\u2ab0',
+ 'succnapprox;': '\u2aba',
+ 'succneqq;': '\u2ab6',
+ 'succnsim;': '\u22e9',
+ 'succsim;': '\u227f',
+ 'SuchThat;': '\u220b',
+ 'Sum;': '\u2211',
+ 'sum;': '\u2211',
+ 'sung;': '\u266a',
+ 'sup1': '\xb9',
+ 'sup1;': '\xb9',
+ 'sup2': '\xb2',
+ 'sup2;': '\xb2',
+ 'sup3': '\xb3',
+ 'sup3;': '\xb3',
+ 'Sup;': '\u22d1',
+ 'sup;': '\u2283',
+ 'supdot;': '\u2abe',
+ 'supdsub;': '\u2ad8',
+ 'supE;': '\u2ac6',
+ 'supe;': '\u2287',
+ 'supedot;': '\u2ac4',
+ 'Superset;': '\u2283',
+ 'SupersetEqual;': '\u2287',
+ 'suphsol;': '\u27c9',
+ 'suphsub;': '\u2ad7',
+ 'suplarr;': '\u297b',
+ 'supmult;': '\u2ac2',
+ 'supnE;': '\u2acc',
+ 'supne;': '\u228b',
+ 'supplus;': '\u2ac0',
+ 'Supset;': '\u22d1',
+ 'supset;': '\u2283',
+ 'supseteq;': '\u2287',
+ 'supseteqq;': '\u2ac6',
+ 'supsetneq;': '\u228b',
+ 'supsetneqq;': '\u2acc',
+ 'supsim;': '\u2ac8',
+ 'supsub;': '\u2ad4',
+ 'supsup;': '\u2ad6',
+ 'swarhk;': '\u2926',
+ 'swArr;': '\u21d9',
+ 'swarr;': '\u2199',
+ 'swarrow;': '\u2199',
+ 'swnwar;': '\u292a',
+ 'szlig': '\xdf',
+ 'szlig;': '\xdf',
+ 'Tab;': '\t',
+ 'target;': '\u2316',
+ 'Tau;': '\u03a4',
+ 'tau;': '\u03c4',
+ 'tbrk;': '\u23b4',
+ 'Tcaron;': '\u0164',
+ 'tcaron;': '\u0165',
+ 'Tcedil;': '\u0162',
+ 'tcedil;': '\u0163',
+ 'Tcy;': '\u0422',
+ 'tcy;': '\u0442',
+ 'tdot;': '\u20db',
+ 'telrec;': '\u2315',
+ 'Tfr;': '\U0001d517',
+ 'tfr;': '\U0001d531',
+ 'there4;': '\u2234',
+ 'Therefore;': '\u2234',
+ 'therefore;': '\u2234',
+ 'Theta;': '\u0398',
+ 'theta;': '\u03b8',
+ 'thetasym;': '\u03d1',
+ 'thetav;': '\u03d1',
+ 'thickapprox;': '\u2248',
+ 'thicksim;': '\u223c',
+ 'ThickSpace;': '\u205f\u200a',
+ 'thinsp;': '\u2009',
+ 'ThinSpace;': '\u2009',
+ 'thkap;': '\u2248',
+ 'thksim;': '\u223c',
+ 'THORN': '\xde',
+ 'thorn': '\xfe',
+ 'THORN;': '\xde',
+ 'thorn;': '\xfe',
+ 'Tilde;': '\u223c',
+ 'tilde;': '\u02dc',
+ 'TildeEqual;': '\u2243',
+ 'TildeFullEqual;': '\u2245',
+ 'TildeTilde;': '\u2248',
+ 'times': '\xd7',
+ 'times;': '\xd7',
+ 'timesb;': '\u22a0',
+ 'timesbar;': '\u2a31',
+ 'timesd;': '\u2a30',
+ 'tint;': '\u222d',
+ 'toea;': '\u2928',
+ 'top;': '\u22a4',
+ 'topbot;': '\u2336',
+ 'topcir;': '\u2af1',
+ 'Topf;': '\U0001d54b',
+ 'topf;': '\U0001d565',
+ 'topfork;': '\u2ada',
+ 'tosa;': '\u2929',
+ 'tprime;': '\u2034',
+ 'TRADE;': '\u2122',
+ 'trade;': '\u2122',
+ 'triangle;': '\u25b5',
+ 'triangledown;': '\u25bf',
+ 'triangleleft;': '\u25c3',
+ 'trianglelefteq;': '\u22b4',
+ 'triangleq;': '\u225c',
+ 'triangleright;': '\u25b9',
+ 'trianglerighteq;': '\u22b5',
+ 'tridot;': '\u25ec',
+ 'trie;': '\u225c',
+ 'triminus;': '\u2a3a',
+ 'TripleDot;': '\u20db',
+ 'triplus;': '\u2a39',
+ 'trisb;': '\u29cd',
+ 'tritime;': '\u2a3b',
+ 'trpezium;': '\u23e2',
+ 'Tscr;': '\U0001d4af',
+ 'tscr;': '\U0001d4c9',
+ 'TScy;': '\u0426',
+ 'tscy;': '\u0446',
+ 'TSHcy;': '\u040b',
+ 'tshcy;': '\u045b',
+ 'Tstrok;': '\u0166',
+ 'tstrok;': '\u0167',
+ 'twixt;': '\u226c',
+ 'twoheadleftarrow;': '\u219e',
+ 'twoheadrightarrow;': '\u21a0',
+ 'Uacute': '\xda',
+ 'uacute': '\xfa',
+ 'Uacute;': '\xda',
+ 'uacute;': '\xfa',
+ 'Uarr;': '\u219f',
+ 'uArr;': '\u21d1',
+ 'uarr;': '\u2191',
+ 'Uarrocir;': '\u2949',
+ 'Ubrcy;': '\u040e',
+ 'ubrcy;': '\u045e',
+ 'Ubreve;': '\u016c',
+ 'ubreve;': '\u016d',
+ 'Ucirc': '\xdb',
+ 'ucirc': '\xfb',
+ 'Ucirc;': '\xdb',
+ 'ucirc;': '\xfb',
+ 'Ucy;': '\u0423',
+ 'ucy;': '\u0443',
+ 'udarr;': '\u21c5',
+ 'Udblac;': '\u0170',
+ 'udblac;': '\u0171',
+ 'udhar;': '\u296e',
+ 'ufisht;': '\u297e',
+ 'Ufr;': '\U0001d518',
+ 'ufr;': '\U0001d532',
+ 'Ugrave': '\xd9',
+ 'ugrave': '\xf9',
+ 'Ugrave;': '\xd9',
+ 'ugrave;': '\xf9',
+ 'uHar;': '\u2963',
+ 'uharl;': '\u21bf',
+ 'uharr;': '\u21be',
+ 'uhblk;': '\u2580',
+ 'ulcorn;': '\u231c',
+ 'ulcorner;': '\u231c',
+ 'ulcrop;': '\u230f',
+ 'ultri;': '\u25f8',
+ 'Umacr;': '\u016a',
+ 'umacr;': '\u016b',
+ 'uml': '\xa8',
+ 'uml;': '\xa8',
+ 'UnderBar;': '_',
+ 'UnderBrace;': '\u23df',
+ 'UnderBracket;': '\u23b5',
+ 'UnderParenthesis;': '\u23dd',
+ 'Union;': '\u22c3',
+ 'UnionPlus;': '\u228e',
+ 'Uogon;': '\u0172',
+ 'uogon;': '\u0173',
+ 'Uopf;': '\U0001d54c',
+ 'uopf;': '\U0001d566',
+ 'UpArrow;': '\u2191',
+ 'Uparrow;': '\u21d1',
+ 'uparrow;': '\u2191',
+ 'UpArrowBar;': '\u2912',
+ 'UpArrowDownArrow;': '\u21c5',
+ 'UpDownArrow;': '\u2195',
+ 'Updownarrow;': '\u21d5',
+ 'updownarrow;': '\u2195',
+ 'UpEquilibrium;': '\u296e',
+ 'upharpoonleft;': '\u21bf',
+ 'upharpoonright;': '\u21be',
+ 'uplus;': '\u228e',
+ 'UpperLeftArrow;': '\u2196',
+ 'UpperRightArrow;': '\u2197',
+ 'Upsi;': '\u03d2',
+ 'upsi;': '\u03c5',
+ 'upsih;': '\u03d2',
+ 'Upsilon;': '\u03a5',
+ 'upsilon;': '\u03c5',
+ 'UpTee;': '\u22a5',
+ 'UpTeeArrow;': '\u21a5',
+ 'upuparrows;': '\u21c8',
+ 'urcorn;': '\u231d',
+ 'urcorner;': '\u231d',
+ 'urcrop;': '\u230e',
+ 'Uring;': '\u016e',
+ 'uring;': '\u016f',
+ 'urtri;': '\u25f9',
+ 'Uscr;': '\U0001d4b0',
+ 'uscr;': '\U0001d4ca',
+ 'utdot;': '\u22f0',
+ 'Utilde;': '\u0168',
+ 'utilde;': '\u0169',
+ 'utri;': '\u25b5',
+ 'utrif;': '\u25b4',
+ 'uuarr;': '\u21c8',
+ 'Uuml': '\xdc',
+ 'uuml': '\xfc',
+ 'Uuml;': '\xdc',
+ 'uuml;': '\xfc',
+ 'uwangle;': '\u29a7',
+ 'vangrt;': '\u299c',
+ 'varepsilon;': '\u03f5',
+ 'varkappa;': '\u03f0',
+ 'varnothing;': '\u2205',
+ 'varphi;': '\u03d5',
+ 'varpi;': '\u03d6',
+ 'varpropto;': '\u221d',
+ 'vArr;': '\u21d5',
+ 'varr;': '\u2195',
+ 'varrho;': '\u03f1',
+ 'varsigma;': '\u03c2',
+ 'varsubsetneq;': '\u228a\ufe00',
+ 'varsubsetneqq;': '\u2acb\ufe00',
+ 'varsupsetneq;': '\u228b\ufe00',
+ 'varsupsetneqq;': '\u2acc\ufe00',
+ 'vartheta;': '\u03d1',
+ 'vartriangleleft;': '\u22b2',
+ 'vartriangleright;': '\u22b3',
+ 'Vbar;': '\u2aeb',
+ 'vBar;': '\u2ae8',
+ 'vBarv;': '\u2ae9',
+ 'Vcy;': '\u0412',
+ 'vcy;': '\u0432',
+ 'VDash;': '\u22ab',
+ 'Vdash;': '\u22a9',
+ 'vDash;': '\u22a8',
+ 'vdash;': '\u22a2',
+ 'Vdashl;': '\u2ae6',
+ 'Vee;': '\u22c1',
+ 'vee;': '\u2228',
+ 'veebar;': '\u22bb',
+ 'veeeq;': '\u225a',
+ 'vellip;': '\u22ee',
+ 'Verbar;': '\u2016',
+ 'verbar;': '|',
+ 'Vert;': '\u2016',
+ 'vert;': '|',
+ 'VerticalBar;': '\u2223',
+ 'VerticalLine;': '|',
+ 'VerticalSeparator;': '\u2758',
+ 'VerticalTilde;': '\u2240',
+ 'VeryThinSpace;': '\u200a',
+ 'Vfr;': '\U0001d519',
+ 'vfr;': '\U0001d533',
+ 'vltri;': '\u22b2',
+ 'vnsub;': '\u2282\u20d2',
+ 'vnsup;': '\u2283\u20d2',
+ 'Vopf;': '\U0001d54d',
+ 'vopf;': '\U0001d567',
+ 'vprop;': '\u221d',
+ 'vrtri;': '\u22b3',
+ 'Vscr;': '\U0001d4b1',
+ 'vscr;': '\U0001d4cb',
+ 'vsubnE;': '\u2acb\ufe00',
+ 'vsubne;': '\u228a\ufe00',
+ 'vsupnE;': '\u2acc\ufe00',
+ 'vsupne;': '\u228b\ufe00',
+ 'Vvdash;': '\u22aa',
+ 'vzigzag;': '\u299a',
+ 'Wcirc;': '\u0174',
+ 'wcirc;': '\u0175',
+ 'wedbar;': '\u2a5f',
+ 'Wedge;': '\u22c0',
+ 'wedge;': '\u2227',
+ 'wedgeq;': '\u2259',
+ 'weierp;': '\u2118',
+ 'Wfr;': '\U0001d51a',
+ 'wfr;': '\U0001d534',
+ 'Wopf;': '\U0001d54e',
+ 'wopf;': '\U0001d568',
+ 'wp;': '\u2118',
+ 'wr;': '\u2240',
+ 'wreath;': '\u2240',
+ 'Wscr;': '\U0001d4b2',
+ 'wscr;': '\U0001d4cc',
+ 'xcap;': '\u22c2',
+ 'xcirc;': '\u25ef',
+ 'xcup;': '\u22c3',
+ 'xdtri;': '\u25bd',
+ 'Xfr;': '\U0001d51b',
+ 'xfr;': '\U0001d535',
+ 'xhArr;': '\u27fa',
+ 'xharr;': '\u27f7',
+ 'Xi;': '\u039e',
+ 'xi;': '\u03be',
+ 'xlArr;': '\u27f8',
+ 'xlarr;': '\u27f5',
+ 'xmap;': '\u27fc',
+ 'xnis;': '\u22fb',
+ 'xodot;': '\u2a00',
+ 'Xopf;': '\U0001d54f',
+ 'xopf;': '\U0001d569',
+ 'xoplus;': '\u2a01',
+ 'xotime;': '\u2a02',
+ 'xrArr;': '\u27f9',
+ 'xrarr;': '\u27f6',
+ 'Xscr;': '\U0001d4b3',
+ 'xscr;': '\U0001d4cd',
+ 'xsqcup;': '\u2a06',
+ 'xuplus;': '\u2a04',
+ 'xutri;': '\u25b3',
+ 'xvee;': '\u22c1',
+ 'xwedge;': '\u22c0',
+ 'Yacute': '\xdd',
+ 'yacute': '\xfd',
+ 'Yacute;': '\xdd',
+ 'yacute;': '\xfd',
+ 'YAcy;': '\u042f',
+ 'yacy;': '\u044f',
+ 'Ycirc;': '\u0176',
+ 'ycirc;': '\u0177',
+ 'Ycy;': '\u042b',
+ 'ycy;': '\u044b',
+ 'yen': '\xa5',
+ 'yen;': '\xa5',
+ 'Yfr;': '\U0001d51c',
+ 'yfr;': '\U0001d536',
+ 'YIcy;': '\u0407',
+ 'yicy;': '\u0457',
+ 'Yopf;': '\U0001d550',
+ 'yopf;': '\U0001d56a',
+ 'Yscr;': '\U0001d4b4',
+ 'yscr;': '\U0001d4ce',
+ 'YUcy;': '\u042e',
+ 'yucy;': '\u044e',
+ 'yuml': '\xff',
+ 'Yuml;': '\u0178',
+ 'yuml;': '\xff',
+ 'Zacute;': '\u0179',
+ 'zacute;': '\u017a',
+ 'Zcaron;': '\u017d',
+ 'zcaron;': '\u017e',
+ 'Zcy;': '\u0417',
+ 'zcy;': '\u0437',
+ 'Zdot;': '\u017b',
+ 'zdot;': '\u017c',
+ 'zeetrf;': '\u2128',
+ 'ZeroWidthSpace;': '\u200b',
+ 'Zeta;': '\u0396',
+ 'zeta;': '\u03b6',
+ 'Zfr;': '\u2128',
+ 'zfr;': '\U0001d537',
+ 'ZHcy;': '\u0416',
+ 'zhcy;': '\u0436',
+ 'zigrarr;': '\u21dd',
+ 'Zopf;': '\u2124',
+ 'zopf;': '\U0001d56b',
+ 'Zscr;': '\U0001d4b5',
+ 'zscr;': '\U0001d4cf',
+ 'zwj;': '\u200d',
+ 'zwnj;': '\u200c',
+}
- """Substitute XML or HTML entities for the corresponding characters."""
+
+class EntitySubstitution(object):
+ """The ability to substitute XML or HTML entities for certain characters."""
def _populate_class_variables():
- lookup = {}
- reverse_lookup = {}
- characters_for_re = []
+ """Initialize variables used by this class to manage the plethora of
+ HTML5 named entities.
- # &apos is an XHTML entity and an HTML 5, but not an HTML 4
- # entity. We don't want to use it, but we want to recognize it on the way in.
- #
- # TODO: Ideally we would be able to recognize all HTML 5 named
- # entities, but that's a little tricky.
- extra = [(39, 'apos')]
- for codepoint, name in list(codepoint2name.items()) + extra:
+ This function returns a 3-tuple containing two dictionaries
+ and a regular expression:
+
+ unicode_to_name - A mapping of Unicode strings like "⦨" to
+ entity names like "angmsdaa". When a single Unicode string has
+ multiple entity names, we try to choose the most commonly-used
+ name.
+
+ name_to_unicode: A mapping of entity names like "angmsdaa" to
+ Unicode strings like "⦨".
+
+ named_entity_re: A regular expression matching (almost) any
+ Unicode string that corresponds to an HTML5 named entity.
+ """
+ unicode_to_name = {}
+ name_to_unicode = {}
+
+ short_entities = set()
+ long_entities_by_first_character = defaultdict(set)
+
+ for name_with_semicolon, character in sorted(html5.items()):
+ # "It is intentional, for legacy compatibility, that many
+ # code points have multiple character reference names. For
+ # example, some appear both with and without the trailing
+ # semicolon, or with different capitalizations."
+ # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
+ #
+ # The parsers are in charge of handling (or not) character
+ # references with no trailing semicolon, so we remove the
+ # semicolon whenever it appears.
+ if name_with_semicolon.endswith(';'):
+ name = name_with_semicolon[:-1]
+ else:
+ name = name_with_semicolon
+
+ # When parsing HTML, we want to recognize any known named
+ # entity and convert it to a sequence of Unicode
+ # characters.
+ if name not in name_to_unicode:
+ name_to_unicode[name] = character
+
+ # When _generating_ HTML, we want to recognize special
+ # character sequences that _could_ be converted to named
+ # entities.
+ unicode_to_name[character] = name
+
+ # We also need to build a regular expression that lets us
+ # _find_ those characters in output strings so we can
+ # replace them.
+ #
+ # This is tricky, for two reasons.
+
+ if (len(character) == 1 and ord(character) < 128
+ and character not in '<>&'):
+ # First, it would be annoying to turn single ASCII
+ # characters like "|" into named entities like
+ # "&verbar;". The exceptions are <>&, which we _must_
+ # turn into named entities to produce valid HTML.
+ continue
+
+ if len(character) > 1 and all(ord(x) < 128 for x in character):
+ # We also do not want to turn _combinations_ of ASCII
+ # characters like 'fj' into named entities like '&fjlig;',
+ # though that's more debatable.
+ continue
+
+ # Second, some named entities have a Unicode value that's
+ # a subset of the Unicode value for some _other_ named
+ # entity. As an example, '\u2267' is &GreaterFullEqual;,
+ # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
+ # expression needs to match the first two characters of
+ # "\u2267\u0338foo", but only the first character of
+ # "\u2267foo".
+ #
+ # In this step, we build two sets of characters that
+ # _eventually_ need to go into the regular expression. But
+ # we won't know exactly what the regular expression needs
+ # to look like until we've gone through the entire list of
+ # named entities.
+ if len(character) == 1:
+ short_entities.add(character)
+ else:
+ long_entities_by_first_character[character[0]].add(character)
+
+ # Now that we've been through the entire list of entities, we
+ # can create a regular expression that matches any of them.
+ particles = set()
+ for short in short_entities:
+ long_versions = long_entities_by_first_character[short]
+ if not long_versions:
+ particles.add(short)
+ else:
+ ignore = "".join([x[1] for x in long_versions])
+ # This finds, e.g. \u2267 but only if it is _not_
+ # followed by \u0338.
+ particles.add("%s(?![%s])" % (short, ignore))
+
+ for long_entities in list(long_entities_by_first_character.values()):
+ for long_entity in long_entities:
+ particles.add(long_entity)
+
+ re_definition = "(%s)" % "|".join(particles)
+
+ # If an entity shows up in both html5 and codepoint2name, it's
+ # likely that HTML5 gives it several different names, such as
+ # 'rsquo' and 'rsquor'. When converting Unicode characters to
+ # named entities, the codepoint2name name should take
+ # precedence where possible, since that's the more easily
+ # recognizable one.
+ for codepoint, name in list(codepoint2name.items()):
character = chr(codepoint)
- if codepoint not in (34, 39):
- # There's no point in turning the quotation mark into
- # " or the single quote into ', unless it
- # happens within an attribute value, which is handled
- # elsewhere.
- characters_for_re.append(character)
- lookup[character] = name
- # But we do want to recognize those entities on the way in and
- # convert them to Unicode characters.
- reverse_lookup[name] = character
- re_definition = "[%s]" % "".join(characters_for_re)
- return lookup, reverse_lookup, re.compile(re_definition)
+ unicode_to_name[character] = name
+
+ return unicode_to_name, name_to_unicode, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
@@ -111,13 +2447,15 @@ class EntitySubstitution(object):
@classmethod
def _substitute_html_entity(cls, matchobj):
+ """Used with a regular expression to substitute the
+ appropriate HTML entity for a special character string."""
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
- appropriate XML entity for an XML special character."""
+ appropriate XML entity for a special character string."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@@ -212,6 +2550,8 @@ class EntitySubstitution(object):
containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "é" will make it more readable to some
people.
+
+ :param s: A Unicode string.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
@@ -223,23 +2563,65 @@ class EncodingDetector:
Order of precedence:
1. Encodings you specifically tell EncodingDetector to try first
- (the override_encodings argument to the constructor).
+ (the known_definite_encodings argument to the constructor).
- 2. An encoding declared within the bytestring itself, either in an
+ 2. An encoding determined by sniffing the document's byte-order mark.
+
+ 3. Encodings you specifically tell EncodingDetector to try if
+ byte-order mark sniffing fails (the user_encodings argument to the
+ constructor).
+
+ 4. An encoding declared within the bytestring itself, either in an
XML declaration (if the bytestring is to be interpreted as an XML
document), or in a tag (if the bytestring is to be
interpreted as an HTML document.)
- 3. An encoding detected through textual analysis by chardet,
+ 5. An encoding detected through textual analysis by chardet,
cchardet, or a similar external library.
4. UTF-8.
5. Windows-1252.
+
"""
- def __init__(self, markup, override_encodings=None, is_html=False,
- exclude_encodings=None):
- self.override_encodings = override_encodings or []
+ def __init__(self, markup, known_definite_encodings=None,
+ is_html=False, exclude_encodings=None,
+ user_encodings=None, override_encodings=None):
+ """Constructor.
+
+ :param markup: Some markup in an unknown encoding.
+
+ :param known_definite_encodings: When determining the encoding
+ of `markup`, these encodings will be tried first, in
+ order. In HTML terms, this corresponds to the "known
+ definite encoding" step defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+ :param user_encodings: These encodings will be tried after the
+ `known_definite_encodings` have been tried and failed, and
+ after an attempt to sniff the encoding by looking at a
+ byte order mark has failed. In HTML terms, this
+ corresponds to the step "user has explicitly instructed
+ the user agent to override the document's character
+ encoding", defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+ :param override_encodings: A deprecated alias for
+ known_definite_encodings. Any encodings here will be tried
+ immediately after the encodings in
+ known_definite_encodings.
+
+ :param is_html: If True, this markup is considered to be
+ HTML. Otherwise it's assumed to be XML.
+
+ :param exclude_encodings: These encodings will not be tried,
+ even if they otherwise would be.
+
+ """
+ self.known_definite_encodings = list(known_definite_encodings or [])
+ if override_encodings:
+ self.known_definite_encodings += override_encodings
+ self.user_encodings = user_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
@@ -250,6 +2632,12 @@ class EncodingDetector:
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
def _usable(self, encoding, tried):
+ """Should we even bother to try this encoding?
+
+ :param encoding: Name of an encoding.
+ :param tried: Encodings that have already been tried. This will be modified
+ as a side effect.
+ """
if encoding is not None:
encoding = encoding.lower()
if encoding in self.exclude_encodings:
@@ -261,9 +2649,14 @@ class EncodingDetector:
@property
def encodings(self):
- """Yield a number of encodings that might work for this markup."""
+ """Yield a number of encodings that might work for this markup.
+
+ :yield: A sequence of strings.
+ """
tried = set()
- for e in self.override_encodings:
+
+ # First, try the known definite encodings
+ for e in self.known_definite_encodings:
if self._usable(e, tried):
yield e
@@ -272,6 +2665,12 @@ class EncodingDetector:
if self._usable(self.sniffed_encoding, tried):
yield self.sniffed_encoding
+ # Sniffing the byte-order mark did nothing; try the user
+ # encodings.
+ for e in self.user_encodings:
+ if self._usable(e, tried):
+ yield e
+
# Look within the document for an XML or HTML encoding
# declaration.
if self.declared_encoding is None:
@@ -294,7 +2693,11 @@ class EncodingDetector:
@classmethod
def strip_byte_order_mark(cls, data):
- """If a byte-order mark is present, strip it and return the encoding it implies."""
+ """If a byte-order mark is present, strip it and return the encoding it implies.
+
+ :param data: Some markup.
+ :return: A 2-tuple (modified data, implied encoding)
+ """
encoding = None
if isinstance(data, str):
# Unicode data cannot have a byte-order mark.
@@ -326,6 +2729,13 @@ class EncodingDetector:
An HTML encoding is declared in a tag, hopefully near the
beginning of the document.
+
+ :param markup: Some markup.
+ :param is_html: If True, this markup is considered to be HTML. Otherwise
+ it's assumed to be XML.
+ :param search_entire_document: Since an encoding is supposed to be declared near the beginning
+ of the document, most of the time it's only necessary to search a few kilobytes of data.
+ Set this to True to force this method to search the entire document.
"""
if search_entire_document:
xml_endpos = html_endpos = len(markup)
@@ -371,15 +2781,53 @@ class UnicodeDammit:
"iso-8859-2",
]
- def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+ def __init__(self, markup, known_definite_encodings=[],
+ smart_quotes_to=None, is_html=False, exclude_encodings=[],
+ user_encodings=None, override_encodings=None
+ ):
+ """Constructor.
+
+ :param markup: A bytestring representing markup in an unknown encoding.
+
+ :param known_definite_encodings: When determining the encoding
+ of `markup`, these encodings will be tried first, in
+ order. In HTML terms, this corresponds to the "known
+ definite encoding" step defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding
+
+ :param user_encodings: These encodings will be tried after the
+ `known_definite_encodings` have been tried and failed, and
+ after an attempt to sniff the encoding by looking at a
+ byte order mark has failed. In HTML terms, this
+ corresponds to the step "user has explicitly instructed
+ the user agent to override the document's character
+ encoding", defined here:
+ https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+
+ :param override_encodings: A deprecated alias for
+ known_definite_encodings. Any encodings here will be tried
+ immediately after the encodings in
+ known_definite_encodings.
+
+ :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
+ to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
+ Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
+ will convert them to HTML entity references.
+ :param is_html: If True, this markup is considered to be HTML. Otherwise
+ it's assumed to be XML.
+ :param exclude_encodings: These encodings will not be considered, even
+ if the sniffing code thinks they might make sense.
+
+ """
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
self.log = logging.getLogger(__name__)
self.detector = EncodingDetector(
- markup, override_encodings, is_html, exclude_encodings)
+ markup, known_definite_encodings, is_html, exclude_encodings,
+ user_encodings, override_encodings
+ )
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, str) or markup == '':
@@ -439,6 +2887,10 @@ class UnicodeDammit:
return sub
def _convert_from(self, proposed, errors="strict"):
+ """Attempt to convert the markup to the proposed encoding.
+
+ :param proposed: The name of a character encoding.
+ """
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
@@ -453,30 +2905,40 @@ class UnicodeDammit:
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
- #print "Trying to convert document to %s (errors=%s)" % (
- # proposed, errors)
+ #print("Trying to convert document to %s (errors=%s)" % (
+ # proposed, errors))
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
- #print "That didn't work!"
- #print e
+ #print("That didn't work!")
+ #print(e)
return None
- #print "Correct encoding: %s" % proposed
+ #print("Correct encoding: %s" % proposed)
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
- '''Given a string and its encoding, decodes the string into Unicode.
- %encoding is a string recognized by encodings.aliases'''
+ """Given a string and its encoding, decodes the string into Unicode.
+
+ :param encoding: The name of an encoding.
+ """
return str(data, encoding, errors)
@property
def declared_html_encoding(self):
+ """If the markup is an HTML document, returns the encoding declared _within_
+ the document.
+ """
if not self.is_html:
return None
return self.detector.declared_encoding
def find_codec(self, charset):
+ """Convert the name of a character set to a codec name.
+
+ :param charset: The name of a character set.
+ :return: The name of a codec.
+ """
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
or (charset and self._codec(charset.replace("-", "")))
or (charset and self._codec(charset.replace("-", "_")))
@@ -815,12 +3277,16 @@ class UnicodeDammit:
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
- The input must be a bytestring. If you've already converted
- the document to Unicode, you're too late.
-
- The output is a bytestring in which `embedded_encoding`
- characters have been converted to their `main_encoding`
- equivalents.
+ :param in_bytes: A bytestring that you suspect contains
+ characters from multiple encodings. Note that this _must_
+ be a bytestring. If you've already converted the document
+ to Unicode, you're too late.
+ :param main_encoding: The primary encoding of `in_bytes`.
+ :param embedded_encoding: The encoding that was used to embed characters
+ in the main document.
+ :return: A bytestring in which `embedded_encoding`
+ characters have been converted to their `main_encoding`
+ equivalents.
"""
if embedded_encoding.replace('_', '-').lower() not in (
'windows-1252', 'windows_1252'):
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index a1ae23dc..500e92df 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -20,9 +20,13 @@ import sys
import cProfile
def diagnose(data):
- """Diagnostic suite for isolating common problems."""
- print("Diagnostic running on Beautiful Soup %s" % __version__)
- print("Python version %s" % sys.version)
+ """Diagnostic suite for isolating common problems.
+
+ :param data: A string containing markup that needs to be explained.
+ :return: None; diagnostics are printed to standard output.
+ """
+ print(("Diagnostic running on Beautiful Soup %s" % __version__))
+ print(("Python version %s" % sys.version))
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@@ -39,65 +43,76 @@ def diagnose(data):
basic_parsers.append("lxml-xml")
try:
from lxml import etree
- print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+ print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
except ImportError as e:
- print (
+ print(
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
try:
import html5lib
- print("Found html5lib version %s" % html5lib.__version__)
+ print(("Found html5lib version %s" % html5lib.__version__))
except ImportError as e:
- print (
+ print(
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
- print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+ print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
- print('"%s" looks like a filename. Reading data from the file.' % data)
+ print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
- print()
+ print("")
for parser in basic_parsers:
- print("Trying to parse your markup with %s" % parser)
+ print(("Trying to parse your markup with %s" % parser))
success = False
try:
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
- print("%s could not parse the markup." % parser)
+ print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
- print("Here's what %s did with the markup:" % parser)
- print(soup.prettify())
+ print(("Here's what %s did with the markup:" % parser))
+ print((soup.prettify()))
- print("-" * 80)
+ print(("-" * 80))
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
- Soup code is running.
+ Soup code is running. You can use this to determine whether
+ an lxml-specific problem is in Beautiful Soup's lxml tree builders
+ or in lxml itself.
+
+ :param data: Some markup.
+ :param html: If True, markup will be parsed with lxml's HTML parser.
+ if False, lxml's XML parser will be used.
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
- """Announces HTMLParser parse events, without doing anything else."""
+ """Subclass of HTMLParser that announces parse events, without doing
+ anything else.
+
+ You can use this to get a picture of how html.parser sees a given
+ document. The easiest way to do this is to call `htmlparser_trace`.
+ """
def _p(self, s):
print(s)
@@ -134,6 +149,8 @@ def htmlparser_trace(data):
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
+
+ :param data: Some markup.
"""
parser = AnnouncingParser()
parser.feed(data)
@@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
- print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+ print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
data = rdoc(num_elements)
- print("Generated a large invalid HTML document (%d bytes)." % len(data))
+ print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
b = time.time()
success = True
except Exception as e:
- print("%s could not parse the markup." % parser)
+ print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
- print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+ print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
- print("Raw lxml parsed the markup in %.2fs." % (b-a))
+ print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
- print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+ print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
def profile(num_elements=100000, parser="lxml"):
-
+ """Use Python's profiler on a randomly generated document."""
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
@@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
stats.sort_stats("cumulative")
stats.print_stats('_html5lib|bs4', 50)
+# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
diagnose(sys.stdin.read())
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index 69399e5c..82a986e4 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -43,17 +43,49 @@ def _alias(attr):
return alias
-class NamespacedAttribute(str):
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+ "idna",
+ "mbcs",
+ "oem",
+ "palmos",
+ "punycode",
+ "raw_unicode_escape",
+ "undefined",
+ "unicode_escape",
+ "raw-unicode-escape",
+ "unicode-escape",
+ "string-escape",
+ "string_escape",
+])
+
+class NamespacedAttribute(str):
+ """A namespaced string (e.g. 'xml:lang') that remembers the namespace
+ ('xml') and the name ('lang') that were used to create it.
+ """
+
def __new__(cls, prefix, name=None, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
# per https://www.w3.org/TR/xml-names/#defaulting
name = None
- if name is None:
+ if not name:
obj = str.__new__(cls, prefix)
- elif prefix is None:
+ elif not prefix:
# Not really namespaced.
obj = str.__new__(cls, name)
else:
@@ -79,6 +111,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ """When an HTML document is being encoded to a given encoding, the
+ value of a meta tag's 'charset' is the name of the encoding.
+ """
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
return encoding
@@ -104,19 +141,39 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
def rewrite(match):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
+ """Contains the navigational information for some part of the page:
+ that is, its current location in the parse tree.
+
+ NavigableString, Tag, etc. are all subclasses of PageElement.
+ """
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
- other elements."""
+ other elements.
+
+ :param parent: The parent of this element.
+
+ :param previous_element: The element parsed immediately before
+ this one.
+
+ :param next_element: The element parsed immediately after
+ this one.
+
+ :param previous_sibling: The most recently encountered element
+ on the same level of the parse tree as this one.
+
+ :param next_sibling: The next element to be encountered
+ on the same level of the parse tree as this one.
+ """
self.parent = parent
self.previous_element = previous_element
@@ -140,7 +197,11 @@ class PageElement(object):
self.previous_sibling.next_sibling = self
def format_string(self, s, formatter):
- """Format the given string using the given formatter."""
+ """Format the given string using the given formatter.
+
+ :param s: A string.
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
if formatter is None:
return s
if not isinstance(formatter, Formatter):
@@ -153,9 +214,10 @@ class PageElement(object):
if necessary.
:param formatter: Can be a Formatter object (used as-is), a
- function (used as the entity substitution hook for an
- XMLFormatter or HTMLFormatter), or a string (used to look up
- an XMLFormatter or HTMLFormatter in the appropriate registry.
+ function (used as the entity substitution hook for an
+ XMLFormatter or HTMLFormatter), or a string (used to look
+ up an XMLFormatter or HTMLFormatter in the appropriate
+ registry).
"""
if isinstance(formatter, Formatter):
return formatter
@@ -163,7 +225,7 @@ class PageElement(object):
c = XMLFormatter
else:
c = HTMLFormatter
- if callable(formatter):
+ if isinstance(formatter, Callable):
return c(entity_substitution=formatter)
return c.REGISTRY[formatter]
@@ -193,30 +255,82 @@ class PageElement(object):
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
- def replace_with(self, replace_with):
+ default = object()
+ def _all_strings(self, strip=False, types=default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This is implemented differently in Tag and NavigableString.
+ """
+ raise NotImplementedError()
+
+ @property
+ def stripped_strings(self):
+ """Yield all strings in this PageElement, stripping them first.
+
+ :yield: A sequence of stripped strings.
+ """
+ for string in self._all_strings(True):
+ yield string
+
+ def get_text(self, separator="", strip=False,
+ types=default):
+ """Get all child strings of this PageElement, concatenated using the
+ given separator.
+
+ :param separator: Strings will be concatenated using this separator.
+
+ :param strip: If True, strings will be stripped before being
+ concatenated.
+
+ :param types: A tuple of NavigableString subclasses. Any
+ strings of a subclass not found in this list will be
+ ignored. Although there are exceptions, the default
+ behavior in most cases is to consider only NavigableString
+ and CData objects. That means no comments, processing
+ instructions, etc.
+
+ :return: A string.
+ """
+ return separator.join([s for s in self._all_strings(
+ strip, types=types)])
+ getText = get_text
+ text = property(get_text)
+
+ def replace_with(self, *args):
+ """Replace this PageElement with one or more PageElements, keeping the
+ rest of the tree the same.
+
+ :param args: One or more PageElements.
+ :return: `self`, no longer part of the tree.
+ """
if self.parent is None:
raise ValueError(
"Cannot replace one element with another when the "
"element to be replaced is not part of a tree.")
- if replace_with is self:
+ if len(args) == 1 and args[0] is self:
return
- if replace_with is self.parent:
+ if any(x is self.parent for x in args):
raise ValueError("Cannot replace a Tag with its parent.")
old_parent = self.parent
my_index = self.parent.index(self)
- self.extract()
- old_parent.insert(my_index, replace_with)
+ self.extract(_self_index=my_index)
+ for idx, replace_with in enumerate(args, start=my_index):
+ old_parent.insert(idx, replace_with)
return self
replaceWith = replace_with # BS3
def unwrap(self):
+ """Replace this PageElement with its contents.
+
+ :return: `self`, no longer part of the tree.
+ """
my_parent = self.parent
if self.parent is None:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
my_index = self.parent.index(self)
- self.extract()
+ self.extract(_self_index=my_index)
for child in reversed(self.contents[:]):
my_parent.insert(my_index, child)
return self
@@ -224,14 +338,29 @@ class PageElement(object):
replaceWithChildren = unwrap # BS3
def wrap(self, wrap_inside):
+ """Wrap this PageElement inside another one.
+
+ :param wrap_inside: A PageElement.
+ :return: `wrap_inside`, occupying the position in the tree that used
+ to be occupied by `self`, and with `self` inside it.
+ """
me = self.replace_with(wrap_inside)
wrap_inside.append(me)
return wrap_inside
- def extract(self):
- """Destructively rips this element out of the tree."""
+ def extract(self, _self_index=None):
+ """Destructively rips this element out of the tree.
+
+ :param _self_index: The location of this element in its parent's
+ .contents, if known. Passing this in allows for a performance
+ optimization.
+
+ :return: `self`, no longer part of the tree.
+ """
if self.parent is not None:
- del self.parent.contents[self.parent.index(self)]
+ if _self_index is None:
+ _self_index = self.parent.index(self)
+ del self.parent.contents[_self_index]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
@@ -258,7 +387,12 @@ class PageElement(object):
return self
def _last_descendant(self, is_initialized=True, accept_self=True):
- "Finds the last element beneath this object to be parsed."
+ """Finds the last element beneath this object to be parsed.
+
+ :param is_initialized: Has `setup` been called on this PageElement
+ yet?
+ :param accept_self: Is `self` an acceptable answer to the question?
+ """
if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element
else:
@@ -272,6 +406,14 @@ class PageElement(object):
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
+ """Insert a new PageElement in the list of this PageElement's children.
+
+ This works the same way as `list.insert`.
+
+ :param position: The numeric position that should be occupied
+ in `self.children` by the new PageElement.
+ :param new_child: A PageElement.
+ """
if new_child is None:
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
@@ -346,19 +488,32 @@ class PageElement(object):
self.contents.insert(position, new_child)
def append(self, tag):
- """Appends the given tag to the contents of this tag."""
+ """Appends the given PageElement to the contents of this one.
+
+ :param tag: A PageElement.
+ """
self.insert(len(self.contents), tag)
def extend(self, tags):
- """Appends the given tags to the contents of this tag."""
+ """Appends the given PageElements to this one's contents.
+
+ :param tags: A list of PageElements.
+ """
+ if isinstance(tags, Tag):
+ # Calling self.append() on another tag's contents will change
+ # the list we're iterating over. Make a list that won't
+ # change.
+ tags = list(tags.contents)
for tag in tags:
self.append(tag)
def insert_before(self, *args):
"""Makes the given element(s) the immediate predecessor of this one.
- The elements will have the same parent, and the given elements
+ All the elements will have the same parent, and the given elements
will be immediately before this one.
+
+ :param args: One or more PageElements.
"""
parent = self.parent
if parent is None:
@@ -379,6 +534,8 @@ class PageElement(object):
The elements will have the same parent, and the given elements
will be immediately after this one.
+
+ :param args: One or more PageElements.
"""
# Do all error checking before modifying the tree.
parent = self.parent
@@ -399,70 +556,172 @@ class PageElement(object):
offset += 1
def find_next(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears after this Tag in the document."""
+ """Find the first PageElement that matches the given criteria and
+ appears later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
findNext = find_next # BS3
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- after this Tag in the document."""
+ """Find all PageElements that match the given criteria and appear
+ later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet containing PageElements.
+ """
return self._find_all(name, attrs, text, limit, self.next_elements,
**kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears after this Tag in the document."""
+ """Find the closest sibling to this PageElement that matches the
+ given criteria and appears later in the document.
+
+ All find_* methods take a common set of arguments. See the
+ online documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self._find_one(self.find_next_siblings, name, attrs, text,
**kwargs)
findNextSibling = find_next_sibling # BS3
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear after this Tag in the document."""
+ """Find all siblings of this PageElement that match the given criteria
+ and appear later in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
return self._find_all(name, attrs, text, limit,
self.next_siblings, **kwargs)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears before this Tag in the document."""
+ """Look backwards in the document from this PageElement and find the
+ first PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self._find_one(
self.find_all_previous, name, attrs, text, **kwargs)
findPrevious = find_previous # BS3
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- before this Tag in the document."""
+ """Look backwards in the document from this PageElement and find all
+ PageElements that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
return self._find_all(name, attrs, text, limit, self.previous_elements,
**kwargs)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears before this Tag in the document."""
+ """Returns the closest sibling to this PageElement that matches the
+ given criteria and appears earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self._find_one(self.find_previous_siblings, name, attrs, text,
**kwargs)
findPreviousSibling = find_previous_sibling # BS3
def find_previous_siblings(self, name=None, attrs={}, text=None,
limit=None, **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear before this Tag in the document."""
+ """Returns all siblings to this PageElement that match the
+ given criteria and appear earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
return self._find_all(name, attrs, text, limit,
self.previous_siblings, **kwargs)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
- """Returns the closest parent of this Tag that matches the given
- criteria."""
+ """Find the closest parent of this PageElement that matches the given
+ criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
@@ -473,9 +732,19 @@ class PageElement(object):
findParent = find_parent # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
- """Returns the parents of this Tag that match the given
- criteria."""
+ """Find all parents of this PageElement that match the given criteria.
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
findParents = find_parents # BS3
@@ -483,10 +752,20 @@ class PageElement(object):
@property
def next(self):
+ """The PageElement, if any, that was parsed just after this one.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self.next_element
@property
def previous(self):
+ """The PageElement, if any, that was parsed just before this one.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self.previous_element
#These methods do the real heavy lifting.
@@ -554,6 +833,10 @@ class PageElement(object):
#NavigableStrings and Tags.
@property
def next_elements(self):
+ """All PageElements that were parsed after this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_element
while i is not None:
yield i
@@ -561,6 +844,11 @@ class PageElement(object):
@property
def next_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ later.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_sibling
while i is not None:
yield i
@@ -568,6 +856,10 @@ class PageElement(object):
@property
def previous_elements(self):
+ """All PageElements that were parsed before this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_element
while i is not None:
yield i
@@ -575,6 +867,11 @@ class PageElement(object):
@property
def previous_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ earlier.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_sibling
while i is not None:
yield i
@@ -582,11 +879,23 @@ class PageElement(object):
@property
def parents(self):
+ """All PageElements that are parents of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
i = self.parent
while i is not None:
yield i
i = i.parent
+ @property
+ def decomposed(self):
+ """Check whether a PageElement has been decomposed.
+
+ :rtype: bool
+ """
+ return getattr(self, '_decomposed', False) or False
+
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -606,6 +915,11 @@ class PageElement(object):
class NavigableString(str, PageElement):
+ """A Python Unicode string that is part of a parse tree.
+
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a NavigableString for the string "penguin".
+ """
PREFIX = ''
SUFFIX = ''
@@ -651,35 +965,100 @@ class NavigableString(str, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
- """Run the string through the provided formatter."""
+ """Run the string through the provided formatter.
+
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@property
def name(self):
+ """Since a NavigableString is not a Tag, it has no .name.
+
+ This property is implemented so that code like this doesn't crash
+ when run on a mixture of Tag and NavigableString objects:
+ [x.name for x in tag.children]
+ """
return None
@name.setter
def name(self, name):
+ """Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")
+ def _all_strings(self, strip=False, types=PageElement.default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This makes it easy for NavigableString to implement methods
+ like get_text() as conveniences, creating a consistent
+ text-extraction API across all PageElements.
+
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. If this
+ NavigableString isn't one of those subclasses, the
+ sequence will be empty. By default, the subclasses
+ considered are NavigableString and CData objects. That
+ means no comments, processing instructions, etc.
+
+ :yield: A sequence that either contains this string, or is empty.
+
+ """
+ if types is self.default:
+ # This is kept in Tag because it's full of subclasses of
+ # this class, which aren't defined until later in the file.
+ types = Tag.DEFAULT_INTERESTING_STRING_TYPES
+
+ # Do nothing if the caller is looking for specific types of
+ # string, and we're of a different type.
+ my_type = type(self)
+ if types is not None:
+ if isinstance(types, type):
+ # Looking for a single type.
+ if my_type is not types:
+ return
+ elif my_type not in types:
+ # Looking for one of a list of types.
+ return
+
+ value = self
+ if strip:
+ value = value.strip()
+ if len(value) > 0:
+ yield value
+ strings = property(_all_strings)
+
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
- The string will be passed into the formatter (to trigger side effects),
- but the return value will be ignored.
+ This is an abstract class used for special kinds of strings such
+ as comments (the Comment class) and CDATA blocks (the CData
+ class).
"""
-
+
+ PREFIX = ''
+ SUFFIX = ''
+
def output_ready(self, formatter=None):
- """CData strings are passed into the formatter, purely
- for any side effects. The return value is ignored.
+ """Make this string ready for output by adding any subclass-specific
+ prefix or suffix.
+
+ :param formatter: A Formatter object, or a string naming one
+ of the standard formatters. The string will be passed into the
+ Formatter, but only to trigger any side effects: the return
+ value is ignored.
+
+ :return: The string, with any subclass-specific prefix and
+ suffix added on.
"""
if formatter is not None:
ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
-
+ """A CDATA block."""
 PREFIX = '<![CDATA['
 SUFFIX = ']]>'
@@ -695,20 +1074,32 @@ class XMLProcessingInstruction(ProcessingInstruction):
SUFFIX = '?>'
class Comment(PreformattedString):
-
+ """An HTML or XML comment."""
 PREFIX = '<!--'
 SUFFIX = '-->'
class Declaration(PreformattedString):
+ """An XML declaration."""
 PREFIX = '<?'
SUFFIX = '?>'
class Doctype(PreformattedString):
-
+ """A document type declaration."""
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
+ """Generate an appropriate document type declaration for a given
+ public ID and system ID.
+
+ :param name: The name of the document's root element, e.g. 'html'.
+ :param pub_id: The Formal Public Identifier for this document type,
+ e.g. '-//W3C//DTD XHTML 1.1//EN'
+ :param system_id: The system identifier for this document type,
+ e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+
+ :return: A Doctype.
+ """
value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
@@ -723,18 +1114,80 @@ class Doctype(PreformattedString):
SUFFIX = '>\n'
-class Tag(PageElement):
+class Stylesheet(NavigableString):
+ """A NavigableString representing a stylesheet (probably
+ CSS).
- """Represents a found HTML tag with its attributes and contents."""
+ Used to distinguish embedded stylesheets from textual content.
+ """
+ pass
+
+
+class Script(NavigableString):
+ """A NavigableString representing an executable script (probably
+ Javascript).
+
+ Used to distinguish executable code from textual content.
+ """
+ pass
+
+
+class TemplateString(NavigableString):
+ """A NavigableString representing a string found inside an HTML
+ template embedded in a larger document.
+
+ Used to distinguish such strings from the main body of the document.
+ """
+ pass
+
+
+class Tag(PageElement):
+ """Represents an HTML or XML tag that is part of a parse tree, along
+ with its attributes and contents.
+
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a Tag object representing the <b> tag.
+ """
def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None,
is_xml=None, sourceline=None, sourcepos=None,
can_be_empty_element=None, cdata_list_attributes=None,
- preserve_whitespace_tags=None
+ preserve_whitespace_tags=None,
+ interesting_string_types=None,
):
- "Basic constructor."
+ """Basic constructor.
+ :param parser: A BeautifulSoup object.
+ :param builder: A TreeBuilder.
+ :param name: The name of the tag.
+ :param namespace: The URI of this Tag's XML namespace, if any.
+ :param prefix: The prefix for this Tag's XML namespace, if any.
+ :param attrs: A dictionary of this Tag's attribute values.
+ :param parent: The PageElement to use as this Tag's parent.
+ :param previous: The PageElement that was parsed immediately before
+ this tag.
+ :param is_xml: If True, this is an XML tag. Otherwise, this is an
+ HTML tag.
+ :param sourceline: The line number where this tag was found in its
+ source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was found.
+ :param can_be_empty_element: If True, this tag should be
+ represented as <tag/>. If False, this tag should be represented
+ as <tag></tag>.
+ :param cdata_list_attributes: A list of attributes whose values should
+ be treated as CDATA if they ever show up on this tag.
+ :param preserve_whitespace_tags: A list of tag names whose contents
+ should have their whitespace preserved.
+ :param interesting_string_types: This is a NavigableString
+ subclass or a tuple of them. When iterating over this
+ Tag's strings in methods like Tag.strings or Tag.get_text,
+ these are the types of strings that are interesting enough
+ to be considered. The default is to consider
+ NavigableString and CData the only interesting string
+ subtypes.
+ """
if parser is None:
self.parser_class = None
else:
@@ -779,6 +1232,7 @@ class Tag(PageElement):
self.can_be_empty_element = can_be_empty_element
self.cdata_list_attributes = cdata_list_attributes
self.preserve_whitespace_tags = preserve_whitespace_tags
+ self.interesting_string_types = interesting_string_types
else:
# Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
@@ -799,6 +1253,13 @@ class Tag(PageElement):
# Keep track of the names that might cause this tag to be treated as a
# whitespace-preserved tag.
self.preserve_whitespace_tags = builder.preserve_whitespace_tags
+
+ if self.name in builder.string_containers:
+ # This sort of tag uses a special string container
+ # subclass for most of its strings. When we ask the
+ self.interesting_string_types = builder.string_containers[self.name]
+ else:
+ self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3
@@ -840,13 +1301,17 @@ class Tag(PageElement):
@property
def string(self):
- """Convenience property to get the single string within this tag.
+ """Convenience property to get the single string within this
+ PageElement.
- :Return: If this tag has a single string child, return value
- is that string. If this tag has no children, or more than one
- child, return value is None. If this tag has one child tag,
+ TODO It might make sense to have NavigableString.string return
+ itself.
+
+ :return: If this element has a single string child, return
+ value is that string. If this element has one child tag,
return value is the 'string' attribute of the child tag,
- recursively.
+ recursively. If this element is itself a string, has no
+ children, or has more than one child, return value is None.
"""
if len(self.contents) != 1:
return None
@@ -857,57 +1322,75 @@ class Tag(PageElement):
@string.setter
def string(self, string):
+ """Replace this PageElement's contents with `string`."""
self.clear()
self.append(string.__class__(string))
- def _all_strings(self, strip=False, types=(NavigableString, CData)):
+ DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
+ def _all_strings(self, strip=False, types=PageElement.default):
"""Yield all strings of certain classes, possibly stripping them.
- By default, yields only NavigableString and CData objects. So
- no comments, processing instructions, etc.
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. Any strings of
+ a subclass not found in this list will be ignored. By
+ default, the subclasses considered are the ones found in
+ self.interesting_string_types. If that's not specified,
+ only NavigableString and CData objects will be
+ considered. That means no comments, processing
+ instructions, etc.
+
+ :yield: A sequence of strings.
+
"""
+ if types is self.default:
+ types = self.interesting_string_types
+
for descendant in self.descendants:
- if (
- (types is None and not isinstance(descendant, NavigableString))
- or
- (types is not None and type(descendant) not in types)):
+ if (types is None and not isinstance(descendant, NavigableString)):
+ continue
+ descendant_type = type(descendant)
+ if isinstance(types, type):
+ if descendant_type is not types:
+ # We're not interested in strings of this type.
+ continue
+ elif types is not None and descendant_type not in types:
+ # We're not interested in strings of this type.
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
continue
yield descendant
-
strings = property(_all_strings)
- @property
- def stripped_strings(self):
- for string in self._all_strings(True):
- yield string
-
- def get_text(self, separator="", strip=False,
- types=(NavigableString, CData)):
- """
- Get all child strings, concatenated using the given separator.
- """
- return separator.join([s for s in self._all_strings(
- strip, types=types)])
- getText = get_text
- text = property(get_text)
-
def decompose(self):
- """Recursively destroys the contents of this tree."""
+ """Recursively destroys this PageElement and its children.
+
+ This element will be removed from the tree and wiped out; so
+ will everything beneath it.
+
+ The behavior of a decomposed PageElement is undefined and you
+ should never use one for anything, but if you need to _check_
+ whether an element has been decomposed, you can use the
+ `decomposed` property.
+ """
self.extract()
i = self
while i is not None:
- next = i.next_element
+ n = i.next_element
i.__dict__.clear()
i.contents = []
- i = next
-
+ i._decomposed = True
+ i = n
+
def clear(self, decompose=False):
- """
- Extract all children. If decompose is True, decompose instead.
+ """Wipe out all children of this PageElement by calling extract()
+ on them.
+
+ :param decompose: If this is True, decompose() (a more
+ destructive method) will be called instead of extract().
"""
if decompose:
for element in self.contents[:]:
@@ -920,7 +1403,8 @@ class Tag(PageElement):
element.extract()
def smooth(self):
- """Smooth out this element's children by consolidating consecutive strings.
+ """Smooth out this element's children by consolidating consecutive
+ strings.
This makes pretty-printed output look more natural following a
lot of operations that modified the tree.
@@ -957,9 +1441,12 @@ class Tag(PageElement):
a.replace_with(n)
def index(self, element):
- """
- Find the index of a child by identity, not value. Avoids issues with
- tag.contents.index(element) getting the index of equal elements.
+ """Find the index of a child by identity, not value.
+
+ Avoids issues with tag.contents.index(element) getting the
+ index of equal elements.
+
+ :param element: Look for this PageElement in `self.contents`.
"""
for i, child in enumerate(self.contents):
if child is element:
@@ -973,29 +1460,37 @@ class Tag(PageElement):
return self.attrs.get(key, default)
def get_attribute_list(self, key, default=None):
- """The same as get(), but always returns a list."""
+ """The same as get(), but always returns a list.
+
+ :param key: The attribute to look for.
+ :param default: Use this value if the attribute is not present
+ on this PageElement.
+ :return: A list of values, probably containing only a single
+ value.
+ """
value = self.get(key, default)
if not isinstance(value, list):
value = [value]
return value
def has_attr(self, key):
+ """Does this PageElement have an attribute with the given name?"""
return key in self.attrs
def __hash__(self):
return str(self).__hash__()
def __getitem__(self, key):
- """tag[key] returns the value of the 'key' attribute for the tag,
+ """tag[key] returns the value of the 'key' attribute for the Tag,
and throws an exception if it's not there."""
return self.attrs[key]
def __iter__(self):
- "Iterating over a tag iterates over its contents."
+ "Iterating over a Tag iterates over its contents."
return iter(self.contents)
def __len__(self):
- "The length of a tag is the length of its list of contents."
+ "The length of a Tag is the length of its list of contents."
return len(self.contents)
def __contains__(self, x):
@@ -1015,13 +1510,14 @@ class Tag(PageElement):
self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
- """Calling a tag like a function is the same as calling its
+ """Calling a Tag like a function is the same as calling its
find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
return self.find_all(*args, **kwargs)
def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
+ """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
+ #print("Getattr %s.%s" % (self.__class__, tag))
if len(tag) > 3 and tag.endswith('Tag'):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
@@ -1038,8 +1534,8 @@ class Tag(PageElement):
"'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
- """Returns true iff this tag has the same name, the same attributes,
- and the same contents (recursively) as the given tag."""
+ """Returns true iff this Tag has the same name, the same attributes,
+ and the same contents (recursively) as `other`."""
if self is other:
return True
if (not hasattr(other, 'name') or
@@ -1055,12 +1551,17 @@ class Tag(PageElement):
return True
def __ne__(self, other):
- """Returns true iff this tag is not identical to the other tag,
+ """Returns true iff this Tag is not identical to `other`,
as defined in __eq__."""
return not self == other
def __repr__(self, encoding="unicode-escape"):
- """Renders this tag as a string."""
+ """Renders this PageElement as a string.
+
+ :param encoding: The encoding to use (Python 2 only).
+ :return: Under Python 2, a bytestring; under Python 3,
+ a Unicode string.
+ """
if PY3K:
# "The return value must be a string object", i.e. Unicode
return self.decode()
@@ -1071,9 +1572,15 @@ class Tag(PageElement):
return self.encode(encoding)
def __unicode__(self):
+ """Renders this PageElement as a Unicode string."""
return self.decode()
def __str__(self):
+ """Renders this PageElement as a generic string.
+
+ :return: Under Python 2, a UTF-8 bytestring; under Python 3,
+ a Unicode string.
+ """
if PY3K:
return self.decode()
else:
@@ -1085,6 +1592,22 @@ class Tag(PageElement):
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
indent_level=None, formatter="minimal",
errors="xmlcharrefreplace"):
+ """Render a bytestring representation of this PageElement and its
+ contents.
+
+ :param encoding: The destination encoding.
+ :param indent_level: Each line of the rendering will be
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :param errors: An error handling strategy such as
+ 'xmlcharrefreplace'. This value is passed along into
+ encode() and its value should be one of the constants
+ defined by Python.
+ :return: A bytestring.
+
+ """
# Turn the data structure into Unicode, then encode the
# Unicode.
u = self.decode(indent_level, encoding, formatter)
@@ -1093,14 +1616,20 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Returns a Unicode representation of this tag and its contents.
+ """Render a Unicode representation of this PageElement and its
+ contents.
+ :param indent_level: Each line of the rendering will be
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a tag that mentions the document's
- encoding.
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a tag that mentions the document's
+ encoding.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
"""
# First off, turn a non-Formatter `formatter` into a Formatter
@@ -1186,7 +1715,11 @@ class Tag(PageElement):
return s
def _should_pretty_print(self, indent_level):
- """Should this tag be pretty-printed?"""
+ """Should this tag be pretty-printed?
+
+ Most of them should, but some (such as in HTML
+ documents) should not.
+ """
return (
indent_level is not None
and (
@@ -1196,6 +1729,15 @@ class Tag(PageElement):
)
def prettify(self, encoding=None, formatter="minimal"):
+ """Pretty-print this PageElement as a string.
+
+ :param encoding: The eventual encoding of the string. If this is None,
+ a Unicode string will be returned.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :return: A Unicode string (if encoding==None) or a bytestring
+ (otherwise).
+ """
if encoding is None:
return self.decode(True, formatter=formatter)
else:
@@ -1207,7 +1749,8 @@ class Tag(PageElement):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. decode_contents() is _not_
@@ -1249,23 +1792,26 @@ class Tag(PageElement):
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Renders the contents of this tag as a bytestring.
+ """Renders the contents of this PageElement as a bytestring.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many spaces. Used internally in
+ recursive calls while pretty-printing.
:param eventual_encoding: The bytestring will be in this encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
- """
+ :param formatter: A Formatter object, or a string naming one of
+ the standard Formatters.
+ :return: A bytestring.
+ """
contents = self.decode_contents(indent_level, encoding, formatter)
return contents.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
+ """Deprecated method for BS3 compatibility."""
if not prettyPrint:
indentLevel = None
return self.encode_contents(
@@ -1275,27 +1821,47 @@ class Tag(PageElement):
def find(self, name=None, attrs={}, recursive=True, text=None,
**kwargs):
- """Return only the first child of this Tag matching the given
- criteria."""
+ """Look in the children of this PageElement and find the first
+ PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
r = None
l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
if l:
r = l[0]
return r
- findChild = find
+ findChild = find #BS2
def find_all(self, name=None, attrs={}, recursive=True, text=None,
limit=None, **kwargs):
- """Extracts a list of Tag objects that match the given
- criteria. You can specify the name of the Tag and any
- attributes you want the Tag to have.
+ """Look in the children of this PageElement and find all
+ PageElements that match the given criteria.
- The value of a key-value pair in the 'attrs' map can be a
- string, a list of strings, a regular expression object, or a
- callable that takes a string and returns whether or not the
- string matches for some custom definition of 'matches'. The
- same is true of the tag name."""
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find_all() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
generator = self.descendants
if not recursive:
generator = self.children
@@ -1306,11 +1872,20 @@ class Tag(PageElement):
#Generator methods
@property
def children(self):
+ """Iterate over all direct children of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def descendants(self):
+ """Iterate over all children of this PageElement in a
+ breadth-first sequence.
+
+ :yield: A sequence of PageElements.
+ """
if not len(self.contents):
return
stopNode = self._last_descendant().next_element
@@ -1321,7 +1896,21 @@ class Tag(PageElement):
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
- """Perform a CSS selection operation on the current element."""
+ """Perform a CSS selection operation on the current element.
+
+ :param selector: A CSS selector.
+
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A Tag.
+ :rtype: bs4.element.Tag
+ """
value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
@@ -1335,14 +1924,17 @@ class Tag(PageElement):
:param selector: A string containing a CSS selector.
:param namespaces: A dictionary mapping namespace prefixes
- used in the CSS selector to namespace URIs. By default,
- Beautiful Soup will use the prefixes it encountered while
- parsing the document.
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
:param limit: After finding this number of results, stop looking.
- :param kwargs: Any extra arguments you'd like to pass in to
- soupsieve.select().
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A ResultSet of Tags.
+ :rtype: bs4.element.ResultSet
"""
if namespaces is None:
namespaces = self._namespaces
@@ -1354,19 +1946,27 @@ class Tag(PageElement):
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
- return soupsieve.select(selector, self, namespaces, limit, **kwargs)
+ results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
+
+ # We do this because it's more consistent and because
+ # ResultSet.__getattr__ has a helpful error message.
+ return ResultSet(None, results)
# Old names for backwards compatibility
def childGenerator(self):
+ """Deprecated generator."""
return self.children
def recursiveChildGenerator(self):
+ """Deprecated generator."""
return self.descendants
def has_key(self, key):
- """This was kind of misleading because has_key() (attributes)
- was different from __in__ (contents). has_key() is gone in
- Python 3, anyway."""
+ """Deprecated method. This was kind of misleading because has_key()
+ (attributes) was different from __in__ (contents).
+
+ has_key() is gone in Python 3, anyway.
+ """
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
key))
return self.has_attr(key)
@@ -1374,9 +1974,26 @@ class Tag(PageElement):
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
"""Encapsulates a number of ways of matching a markup element (tag or
- text)."""
+ string).
+
+ This is primarily used to underpin the find_* methods, but you can
+ create one yourself and pass it in as `parse_only` to the
+ `BeautifulSoup` constructor, to parse a subset of a large
+ document.
+ """
def __init__(self, name=None, attrs={}, text=None, **kwargs):
+ """Constructor.
+
+ The SoupStrainer constructor takes the same arguments passed
+ into the find_* methods. See the online documentation for
+ detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param text: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ """
self.name = self._normalize_search_value(name)
if not isinstance(attrs, dict):
# Treat a non-dict value for attrs as a search for the 'class'
@@ -1434,17 +2051,38 @@ class SoupStrainer(object):
return str(str(value))
def __str__(self):
+ """A human-readable representation of this SoupStrainer."""
if self.text:
return self.text
else:
return "%s|%s" % (self.name, self.attrs)
def search_tag(self, markup_name=None, markup_attrs={}):
+ """Check whether a Tag with the given name and attributes would
+ match this SoupStrainer.
+
+ Used prospectively to decide whether to even bother creating a Tag
+ object.
+
+ :param markup_name: A tag name as found in some markup.
+ :param markup_attrs: A dictionary of attributes as found in some markup.
+
+ :return: True if the prospective tag would match this SoupStrainer;
+ False otherwise.
+ """
found = None
markup = None
if isinstance(markup_name, Tag):
markup = markup_name
markup_attrs = markup
+
+ if isinstance(self.name, str):
+ # Optimization for a very common case where the user is
+ # searching for a tag with one specific name, and we're
+ # looking at a tag with a different name.
+ if markup and not markup.prefix and self.name != markup.name:
+ return False
+
call_function_with_tag_data = (
isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
@@ -1478,10 +2116,19 @@ class SoupStrainer(object):
if found and self.text and not self._matches(found.string, self.text):
found = None
return found
+
+ # For BS3 compatibility.
searchTag = search_tag
def search(self, markup):
- # print 'looking for %s in %s' % (self, markup)
+ """Find all items in `markup` that match this SoupStrainer.
+
+ Used by the core _find_all() method, which is ultimately
+ called by all find_* methods.
+
+ :param markup: A PageElement or a list of them.
+ """
+ # print('looking for %s in %s' % (self, markup))
found = None
# If given a list of items, scan it for a text element that
# matches.
@@ -1507,7 +2154,7 @@ class SoupStrainer(object):
return found
def _matches(self, markup, match_against, already_tried=None):
- # print u"Matching %s against %s" % (markup, match_against)
+ # print(u"Matching %s against %s" % (markup, match_against))
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
@@ -1593,10 +2240,16 @@ class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
def __init__(self, source, result=()):
+ """Constructor.
+
+ :param source: A SoupStrainer.
+ :param result: A list of PageElements.
+ """
super(ResultSet, self).__init__(result)
self.source = source
def __getattr__(self, key):
+ """Raise a helpful exception to explain a common code fix."""
raise AttributeError(
- "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+ "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
)
diff --git a/lib/bs4/formatter.py b/lib/bs4/formatter.py
index 7dbaa385..3bd9f859 100644
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
+
+ Formatters are passed in as the `formatter` argument to methods
+ like `PageElement.encode`. Most people won't need to think about
+ formatters, and most people who need to think about them can pass
+ in one of these predefined strings as `formatter` rather than
+ making a new Formatter object:
+
+ For HTML documents:
+ * 'html' - HTML entity substitution for generic HTML documents. (default)
+ * 'html5' - HTML entity substitution for HTML5 documents, as
+ well as some optimizations in the way tags are rendered.
+ * 'minimal' - Only make the substitutions necessary to guarantee
+ valid HTML.
+ * None - Do not perform any substitution. This will be faster
+ but may result in invalid markup.
+
+ For XML documents:
+ * 'html' - Entity substitution for XHTML documents.
+ * 'minimal' - Only make the substitutions necessary to guarantee
+ valid XML. (default)
+ * None - Do not perform any substitution. This will be faster
+ but may result in invalid markup.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
@@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
+ empty_attributes_are_booleans=False,
):
- """
+ """Constructor.
- :param void_element_close_prefix: By default, represent void
- elements as rather than
+ :param language: This should be Formatter.XML if you are formatting
+ XML markup and Formatter.HTML if you are formatting HTML markup.
+
+ :param entity_substitution: A function to call to replace special
+ characters with XML/HTML entities. For examples, see
+ bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
+ :param void_element_close_prefix: By default, void elements
+ are represented as (XML rules) rather than
+ (HTML rules). To get , pass in the empty string.
+ :param cdata_containing_tags: The list of tags that are defined
+ as containing CDATA in this dialect. For example, in HTML,
+ "
+ )
+ assert isinstance(soup.style.string, Stylesheet)
+ assert isinstance(soup.script.string, Script)
+
+ soup = self.soup(
+ ""
+ )
+ assert isinstance(soup.style.string, Stylesheet)
+ # The contents of the style tag resemble an HTML comment, but
+ # it's not treated as a comment.
+ self.assertEqual("", soup.style.string)
+ assert isinstance(soup.style.string, Stylesheet)
+
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
@@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
- self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
+ self.assertEqual(
+ soup.encode("utf8")[:len(doctype_str)],
+ doctype_str
+ )
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
- def _document_with_doctype(self, doctype_fragment):
+ def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
- doctype = '' % doctype_fragment
+ doctype = '' % (doctype_string, doctype_fragment)
markup = doctype + '\nfoo
'
soup = self.soup(markup)
- return doctype, soup
+ return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
+ def test_mixed_case_doctype(self):
+ # A lowercase or mixed-case doctype becomes a Doctype.
+ for doctype_fragment in ("doctype", "DocType"):
+ doctype_str, soup = self._document_with_doctype(
+ "html", doctype_fragment
+ )
+
+ # Make sure a Doctype object was created and that the DOCTYPE
+ # is uppercase.
+ doctype = soup.contents[0]
+ self.assertEqual(doctype.__class__, Doctype)
+ self.assertEqual(doctype, "html")
+ self.assertEqual(
+ soup.encode("utf8")[:len(doctype_str)],
+ b""
+ )
+
+ # Make sure that the doctype was correctly associated with the
+ # parse tree and that the rest of the document parsed.
+ self.assertEqual(soup.p.contents[0], 'foo')
+
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
@@ -532,7 +624,7 @@ Hello, world!
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
-
+
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("\nfoo
")
@@ -594,7 +686,7 @@ Hello, world!
markup = b''
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
-
+
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
@@ -779,11 +871,44 @@ Hello, world!
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
+ def test_python_specific_encodings_not_used_in_charset(self):
+ # You can encode an HTML document using a Python-specific
+ # encoding, but that encoding won't be mentioned _inside_ the
+ # resulting document. Instead, the document will appear to
+ # have no encoding.
+ for markup in [
+ b' '
+ b' '
+ ]:
+ soup = self.soup(markup)
+ for encoding in PYTHON_SPECIFIC_ENCODINGS:
+ if encoding in (
+ 'idna', 'mbcs', 'oem', 'undefined',
+ 'string_escape', 'string-escape'
+ ):
+ # For one reason or another, these will raise an
+ # exception if we actually try to use them, so don't
+ # bother.
+ continue
+ encoded = soup.encode(encoding)
+ assert b'meta charset=""' in encoded
+ assert encoding.encode("ascii") not in encoded
+
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup(" text ")
data.a['foo'] = 'bar'
self.assertEqual('text ', data.a.decode())
+ def test_closing_tag_with_no_opening_tag(self):
+ # Without BeautifulSoup.open_tag_counter, the tag will
+ # cause _popToTag to be called over and over again as we look
+ # for a tag that wasn't there. The result is that 'text2'
+ # will show up outside the body of the document.
+ soup = self.soup("")
+ self.assertEqual(
+ "", soup.body.decode()
+ )
+
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
@@ -791,7 +916,7 @@ Hello, world!
self.linkage_validator(soup)
-class XMLTreeBuilderSmokeTest(object):
+class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
@@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
+ def test_python_specific_encodings_not_used_in_xml_declaration(self):
+ # You can encode an XML document using a Python-specific
+ # encoding, but that encoding won't be mentioned _inside_ the
+ # resulting document.
+ markup = b"""\n """
+ soup = self.soup(markup)
+ for encoding in PYTHON_SPECIFIC_ENCODINGS:
+ if encoding in (
+ 'idna', 'mbcs', 'oem', 'undefined',
+ 'string_escape', 'string-escape'
+ ):
+ # For one reason or another, these will raise an
+ # exception if we actually try to use them, so don't
+ # bother.
+ continue
+ encoded = soup.encode(encoding)
+ assert b'' in encoded
+ assert encoding.encode("ascii") not in encoded
+
def test_processing_instruction(self):
markup = b"""\n"""
soup = self.soup(markup)
@@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
-
+
def test_nested_namespaces(self):
doc = b"""
diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py
index d7a0b298..f8902ad7 100644
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+ def test_special_string_containers(self):
+ # The html5lib tree builder doesn't support this standard feature,
+ # because there's no way of knowing, when a string is created,
+ # where in the tree it will eventually end up.
+ pass
+
+ def test_html5_attributes(self):
+ # The html5lib TreeBuilder can convert any entity named in
+ # the HTML5 spec to a sequence of Unicode characters, and
+ # convert those Unicode characters to a (potentially
+ # different) named entity on the way out.
+ #
+ # This is a copy of the same test from
+ # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+ # because the lxml HTML TreeBuilder _doesn't_ work this way.
+ for input_element, output_unicode, output_element in (
+ ("⇄", '\u21c4', b'⇄'),
+ ('⊧', '\u22a7', b'⊧'),
+ ('𝔑', '\U0001d511', b'𝔑'),
+ ('≧̸', '\u2267\u0338', b'≧̸'),
+ ('¬', '\xac', b'¬'),
+ ('⫬', '\u2aec', b'⫬'),
+ ('"', '"', b'"'),
+ ('∴', '\u2234', b'∴'),
+ ('∴', '\u2234', b'∴'),
+ ('∴', '\u2234', b'∴'),
+ ("fj", 'fj', b'fj'),
+ ("⊔", '\u2294', b'⊔'),
+ ("⊔︀", '\u2294\ufe00', b'⊔︀'),
+ ("'", "'", b"'"),
+ ("|", "|", b"|"),
+ ):
+ markup = '%s
' % input_element
+ div = self.soup(markup).div
+ without_element = div.encode()
+ expect = b"%s
" % output_unicode.encode("utf8")
+ self.assertEqual(without_element, expect)
+
+ with_element = div.encode(formatter="html")
+ expect = b"%s
" % output_element
+ self.assertEqual(with_element, expect)
diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py
index 7be64935..0d8161ef 100644
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@@ -3,6 +3,7 @@ trees."""
from pdb import set_trace
import pickle
+import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
+ def test_on_duplicate_attribute(self):
+ # The html.parser tree builder has a variety of ways of
+ # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+ # If you don't provide any particular value for
+ # on_duplicate_attribute, later values replace earlier values.
+ soup = self.soup(markup)
+ self.assertEqual("url3", soup.a['href'])
+ self.assertEqual(["cls"], soup.a['class'])
+ self.assertEqual("id", soup.a['id'])
+ # You can also get this behavior explicitly.
+ def assert_attribute(on_duplicate_attribute, expected):
+ soup = self.soup(
+ markup, on_duplicate_attribute=on_duplicate_attribute
+ )
+ self.assertEqual(expected, soup.a['href'])
+
+ # Verify that non-duplicate attributes are treated normally.
+ self.assertEqual(["cls"], soup.a['class'])
+ self.assertEqual("id", soup.a['id'])
+ assert_attribute(None, "url3")
+ assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+ # You can ignore subsequent values in favor of the first.
+ assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+ # And you can pass in a callable that does whatever you want.
+ def accumulate(attrs, key, value):
+ if not isinstance(attrs[key], list):
+ attrs[key] = [attrs[key]]
+ attrs[key].append(value)
+ assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+ def test_html5_attributes(self):
+ # The html.parser TreeBuilder can convert any entity named in
+ # the HTML5 spec to a sequence of Unicode characters, and
+ # convert those Unicode characters to a (potentially
+ # different) named entity on the way out.
+ for input_element, output_unicode, output_element in (
+ ("⇄", '\u21c4', b'⇄'),
+ ('⊧', '\u22a7', b'⊧'),
+ ('𝔑', '\U0001d511', b'𝔑'),
+ ('≧̸', '\u2267\u0338', b'≧̸'),
+ ('¬', '\xac', b'¬'),
+ ('⫬', '\u2aec', b'⫬'),
+ ('"', '"', b'"'),
+ ('∴', '\u2234', b'∴'),
+ ('∴', '\u2234', b'∴'),
+ ('∴', '\u2234', b'∴'),
+ ("fj", 'fj', b'fj'),
+ ("⊔", '\u2294', b'⊔'),
+ ("⊔︀", '\u2294\ufe00', b'⊔︀'),
+ ("'", "'", b"'"),
+ ("|", "|", b"|"),
+ ):
+            markup = '<div>%s</div>' % input_element
+ div = self.soup(markup).div
+ without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+ self.assertEqual(without_element, expect)
+
+ with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+ self.assertEqual(with_element, expect)
+
+
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
- parser.error("don't crash")
+ with warnings.catch_warnings(record=True) as warns:
+ parser.error("don't crash")
+ [warning] = warns
+ assert "don't crash" == str(warning.message)
+
diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py
index 3d0c75fa..71931ffe 100644
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"foobar
", "foobar
")
self.assertSoupEquals(
"foobar
", "foobar
")
-
+
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "", it's been (incorrectly) converted into
diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py
index 0e7dac11..4d00845d 100644
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@@ -3,6 +3,7 @@
from pdb import set_trace
import logging
+import os
import unittest
import sys
import tempfile
@@ -10,6 +11,8 @@ import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
+ GuessedAtParserWarning,
+ MarkupResemblesLocatorWarning,
)
from bs4.builder import (
TreeBuilder,
@@ -29,7 +32,6 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
- EncodingDetector,
)
from bs4.testing import (
default_builder,
@@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
+ self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
+
+ def test_alternate_string_containers(self):
+ # Test the ability to customize the string containers for
+ # different types of tags.
+ class PString(NavigableString):
+ pass
+
+ class BString(NavigableString):
+ pass
+
+ soup = self.soup(
+            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+ string_containers = {
+ 'b': BString,
+ 'p': PString,
+ }
+ )
+
+        # The string before the <p> tag is a regular NavigableString.
+ assert isinstance(soup.div.contents[0], NavigableString)
+        # The string inside the <p> tag, but not inside the <b> tag,
+ # is a PString.
+ assert isinstance(soup.p.contents[0], PString)
+
+        # Every string inside the <b> tag is a BString, even the one that
+        # was also inside an <i> tag.
+ for s in soup.b.strings:
+ assert isinstance(s, BString)
+
+ # Now that parsing was complete, the string_container_stack
+ # (where this information was kept) has been cleared out.
+ self.assertEqual([], soup.string_container_stack)
+
+
class TestWarnings(SoupTest):
- def _no_parser_specified(self, s, is_there=True):
- v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
- self.assertTrue(v)
+ def _assert_warning(self, warnings, cls):
+ for w in warnings:
+ if isinstance(w.message, cls):
+ return w
+        raise Exception("%s warning not found in %r" % (cls, warnings))
+
+ def _assert_no_parser_specified(self, w):
+ warning = self._assert_warning(w, GuessedAtParserWarning)
+ message = str(warning.message)
+ self.assertTrue(
+ message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
+ )
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(" ")
- msg = str(w[0].message)
- self._assert_no_parser_specified(msg)
+ soup = BeautifulSoup(" ")
+ self._assert_no_parser_specified(w)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(" ", "html")
- msg = str(w[0].message)
- self._assert_no_parser_specified(msg)
+ soup = BeautifulSoup(" ", "html")
+ self._assert_no_parser_specified(w)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(" ", "html.parser")
+ soup = BeautifulSoup(" ", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
self.assertRaises(
TypeError, self.soup, "", no_such_argument=True)
-class TestWarnings(SoupTest):
-
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
- msg = str(w[0].message)
- self.assertTrue("looks like a filename" in msg)
+ warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+ self.assertTrue("looks like a filename" in str(warning.message))
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
- self.assertEqual(0, len(w))
+ self.assertEqual([], w)
+ def test_directory_warning(self):
+ try:
+ filename = tempfile.mkdtemp()
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(filename)
+ warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+ self.assertTrue("looks like a directory" in str(warning.message))
+ finally:
+ os.rmdir(filename)
+
+ # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup(filename)
+ self.assertEqual([], w)
+
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
- # Be aware this isn't the only warning that can be raised during
- # execution..
- self.assertTrue(any("looks like a URL" in str(w.message)
- for w in warning_list))
+ warning = self._assert_warning(
+ warning_list, MarkupResemblesLocatorWarning
+ )
+ self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup("http://www.crummyunicode.com/")
- self.assertTrue(any("looks like a URL" in str(w.message)
- for w in warning_list))
+ warning = self._assert_warning(
+ warning_list, MarkupResemblesLocatorWarning
+ )
+ self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_bytes_and_space(self):
+ # Here the markup contains something besides a URL, so no warning
+ # is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
@@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(self.sub.substitute_html(dammit.markup),
"‘’foo“”")
+ def test_html5_entity(self):
+ # Some HTML5 entities correspond to single- or multi-character
+ # Unicode sequences.
+
+ for entity, u in (
+ # A few spot checks of our ability to recognize
+ # special character sequences and convert them
+ # to named entities.
+ ('⊧', '\u22a7'),
+ ('𝔑', '\U0001d511'),
+ ('≧̸', '\u2267\u0338'),
+ ('¬', '\xac'),
+ ('⫬', '\u2aec'),
+
+ # We _could_ convert | to &verbarr;, but we don't, because
+ # | is an ASCII character.
+            ('|', '|'),
+
+ # Similarly for the fj ligature, which we could convert to
+ # fj, but we don't.
+ ("fj", "fj"),
+
+ # We do convert _these_ ASCII characters to HTML entities,
+ # because that's required to generate valid HTML.
+ ('>', '>'),
+ ('<', '<'),
+ ('&', '&'),
+ ):
+ template = '3 %s 4'
+ raw = template % u
+ with_entities = template % entity
+ self.assertEqual(self.sub.substitute_html(raw), with_entities)
+
+ def test_html5_entity_with_variation_selector(self):
+ # Some HTML5 entities correspond either to a single-character
+ # Unicode sequence _or_ to the same character plus U+FE00,
+ # VARIATION SELECTOR 1. We can handle this.
+ data = "fjords \u2294 penguins"
+ markup = "fjords ⊔ penguins"
+ self.assertEqual(self.sub.substitute_html(data), markup)
+
+ data = "fjords \u2294\ufe00 penguins"
+ markup = "fjords ⊔︀ penguins"
+ self.assertEqual(self.sub.substitute_html(data), markup)
+
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
@@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
markup = ''
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
-class TestUnicodeDammit(unittest.TestCase):
- """Standalone tests of UnicodeDammit."""
- def test_unicode_input(self):
- markup = "I'm already Unicode! \N{SNOWMAN}"
- dammit = UnicodeDammit(markup)
- self.assertEqual(dammit.unicode_markup, markup)
-
- def test_smart_quotes_to_unicode(self):
- markup = b"\x91\x92\x93\x94 "
- dammit = UnicodeDammit(markup)
- self.assertEqual(
- dammit.unicode_markup, "\u2018\u2019\u201c\u201d ")
-
- def test_smart_quotes_to_xml_entities(self):
- markup = b"\x91\x92\x93\x94 "
- dammit = UnicodeDammit(markup, smart_quotes_to="xml")
- self.assertEqual(
- dammit.unicode_markup, "‘’“” ")
-
- def test_smart_quotes_to_html_entities(self):
- markup = b"\x91\x92\x93\x94 "
- dammit = UnicodeDammit(markup, smart_quotes_to="html")
- self.assertEqual(
- dammit.unicode_markup, "‘’“” ")
-
- def test_smart_quotes_to_ascii(self):
- markup = b"\x91\x92\x93\x94 "
- dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
- self.assertEqual(
- dammit.unicode_markup, """''"" """)
-
- def test_detect_utf8(self):
- utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
- dammit = UnicodeDammit(utf8)
- self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
- self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
-
-
- def test_convert_hebrew(self):
- hebrew = b"\xed\xe5\xec\xf9"
- dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
- self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
- self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
-
- def test_dont_see_smart_quotes_where_there_are_none(self):
- utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
- dammit = UnicodeDammit(utf_8)
- self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
- self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
-
- def test_ignore_inappropriate_codecs(self):
- utf8_data = "Räksmörgås".encode("utf-8")
- dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
- self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
- def test_ignore_invalid_codecs(self):
- utf8_data = "Räksmörgås".encode("utf-8")
- for bad_encoding in ['.utf8', '...', 'utF---16.!']:
- dammit = UnicodeDammit(utf8_data, [bad_encoding])
- self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
- def test_exclude_encodings(self):
- # This is UTF-8.
- utf8_data = "Räksmörgås".encode("utf-8")
-
- # But if we exclude UTF-8 from consideration, the guess is
- # Windows-1252.
- dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
- self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
-
- # And if we exclude that, there is no valid guess at all.
- dammit = UnicodeDammit(
- utf8_data, exclude_encodings=["utf-8", "windows-1252"])
- self.assertEqual(dammit.original_encoding, None)
-
- def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
- detected = EncodingDetector(
- b'')
- encodings = list(detected.encodings)
- assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
-
- def test_detect_html5_style_meta_tag(self):
-
- for data in (
- b' ',
- b" ",
- b" ",
- b" "):
- dammit = UnicodeDammit(data, is_html=True)
- self.assertEqual(
- "euc-jp", dammit.original_encoding)
-
- def test_last_ditch_entity_replacement(self):
- # This is a UTF-8 document that contains bytestrings
- # completely incompatible with UTF-8 (ie. encoded with some other
- # encoding).
- #
- # Since there is no consistent encoding for the document,
- # Unicode, Dammit will eventually encode the document as UTF-8
- # and encode the incompatible characters as REPLACEMENT
- # CHARACTER.
- #
- # If chardet is installed, it will detect that the document
- # can be converted into ISO-8859-1 without errors. This happens
- # to be the wrong encoding, but it is a consistent encoding, so the
- # code we're testing here won't run.
- #
- # So we temporarily disable chardet if it's present.
- doc = b"""\357\273\277
-\330\250\330\252\330\261
-\310\322\321\220\312\321\355\344 """
- chardet = bs4.dammit.chardet_dammit
- logging.disable(logging.WARNING)
- try:
- def noop(str):
- return None
- bs4.dammit.chardet_dammit = noop
- dammit = UnicodeDammit(doc)
- self.assertEqual(True, dammit.contains_replacement_characters)
- self.assertTrue("\ufffd" in dammit.unicode_markup)
-
- soup = BeautifulSoup(doc, "html.parser")
- self.assertTrue(soup.contains_replacement_characters)
- finally:
- logging.disable(logging.NOTSET)
- bs4.dammit.chardet_dammit = chardet
-
- def test_byte_order_mark_removed(self):
- # A document written in UTF-16LE will have its byte order marker stripped.
- data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
- dammit = UnicodeDammit(data)
- self.assertEqual(" áé ", dammit.unicode_markup)
- self.assertEqual("utf-16le", dammit.original_encoding)
-
- def test_detwingle(self):
- # Here's a UTF8 document.
- utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
-
- # Here's a Windows-1252 document.
- windows_1252 = (
- "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
- "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
-
- # Through some unholy alchemy, they've been stuck together.
- doc = utf8 + windows_1252 + utf8
-
- # The document can't be turned into UTF-8:
- self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
-
- # Unicode, Dammit thinks the whole document is Windows-1252,
- # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
-
- # But if we run it through fix_embedded_windows_1252, it's fixed:
-
- fixed = UnicodeDammit.detwingle(doc)
- self.assertEqual(
- "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
-
- def test_detwingle_ignores_multibyte_characters(self):
- # Each of these characters has a UTF-8 representation ending
- # in \x93. \x93 is a smart quote if interpreted as
- # Windows-1252. But our code knows to skip over multibyte
- # UTF-8 characters, so they'll survive the process unscathed.
- for tricky_unicode_char in (
- "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
- "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
- "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
- ):
- input = tricky_unicode_char.encode("utf8")
- self.assertTrue(input.endswith(b'\x93'))
- output = UnicodeDammit.detwingle(input)
- self.assertEqual(output, input)
-
- def test_find_declared_encoding(self):
- # Test our ability to find a declared encoding inside an
- # XML or HTML document.
- #
- # Even if the document comes in as Unicode, it may be
- # interesting to know what encoding was claimed
- # originally.
-
- html_unicode = ' '
- html_bytes = html_unicode.encode("ascii")
-
- xml_unicode= ''
- xml_bytes = xml_unicode.encode("ascii")
-
- m = EncodingDetector.find_declared_encoding
- self.assertEqual(None, m(html_unicode, is_html=False))
- self.assertEqual("utf-8", m(html_unicode, is_html=True))
- self.assertEqual("utf-8", m(html_bytes, is_html=True))
-
- self.assertEqual("iso-8859-1", m(xml_unicode))
- self.assertEqual("iso-8859-1", m(xml_bytes))
-
- # Normally, only the first few kilobytes of a document are checked for
- # an encoding.
- spacer = b' ' * 5000
- self.assertEqual(None, m(spacer + html_bytes))
- self.assertEqual(None, m(spacer + xml_bytes))
-
- # But you can tell find_declared_encoding to search an entire
- # HTML document.
- self.assertEqual(
- "utf-8",
- m(spacer + html_bytes, is_html=True, search_entire_document=True)
- )
-
- # The XML encoding declaration has to be the very first thing
- # in the document. We'll allow whitespace before the document
- # starts, but nothing else.
- self.assertEqual(
- "iso-8859-1",
- m(xml_bytes, search_entire_document=True)
- )
- self.assertEqual(
- None, m(b'a' + xml_bytes, search_entire_document=True)
- )
-
class TestNamedspacedAttribute(SoupTest):
def test_name_may_be_none_or_missing(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
+ a = NamespacedAttribute("xmlns", "")
+ self.assertEqual(a, "xmlns")
+
a = NamespacedAttribute("xmlns")
self.assertEqual(a, "xmlns")
+ def test_namespace_may_be_none_or_missing(self):
+ a = NamespacedAttribute(None, "tag")
+ self.assertEqual(a, "tag")
+
+ a = NamespacedAttribute("", "tag")
+ self.assertEqual(a, "tag")
+
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py
index e69afdf9..59b51d0b 100644
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@@ -27,13 +27,17 @@ from bs4.element import (
Doctype,
Formatter,
NavigableString,
+ Script,
SoupStrainer,
+ Stylesheet,
Tag,
+ TemplateString,
)
from bs4.testing import (
SoupTest,
skipIf,
)
+from soupsieve import SelectorSyntaxError
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
soup.a.extend(l)
self.assertEqual(" ", soup.decode())
+ def test_extend_with_another_tags_contents(self):
+        data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a></div><div id="d2"></div></body>'
+ soup = self.soup(data)
+ d1 = soup.find('div', id='d1')
+ d2 = soup.find('div', id='d2')
+ d2.extend(d1)
+        self.assertEqual('<div id="d1"></div>', d1.decode())
+        self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a></div>', d2.decode())
+
def test_move_tag_to_beginning_of_parent(self):
data = " "
soup = self.soup(data)
@@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
self.assertEqual(no.next_element, "no")
self.assertEqual(no.next_sibling, " business")
+ def test_replace_with_errors(self):
+ # Can't replace a tag that's not part of a tree.
+ a_tag = Tag(name="a")
+ self.assertRaises(ValueError, a_tag.replace_with, "won't work")
+
+ # Can't replace a tag with its parent.
+        a_tag = self.soup("<a><b></b></a>").a
+ self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
+
+ # Or with a list that includes its parent.
+ self.assertRaises(ValueError, a_tag.b.replace_with,
+ "string1", a_tag, "string2")
+
+ def test_replace_with_multiple(self):
+        data = "<a><b></b><c></c></a>"
+ soup = self.soup(data)
+ d_tag = soup.new_tag("d")
+ d_tag.string = "Text In D Tag"
+ e_tag = soup.new_tag("e")
+ f_tag = soup.new_tag("f")
+ a_string = "Random Text"
+ soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
+ self.assertEqual(
+            "<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
+ soup.decode()
+ )
+ assert soup.b.next_element == d_tag
+ assert d_tag.string.next_element==e_tag
+ assert e_tag.next_element.string == a_string
+ assert e_tag.next_element.next_element == f_tag
+
def test_replace_first_child(self):
data = " "
soup = self.soup(data)
@@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
a.clear(decompose=True)
self.assertEqual(0, len(em.contents))
+
+ def test_decompose(self):
+ # Test PageElement.decompose() and PageElement.decomposed
+        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
+ p1, p2 = soup.find_all('p')
+ a = p1.a
+ text = p1.em.string
+ for i in [p1, p2, a, text]:
+ self.assertEqual(False, i.decomposed)
+
+ # This sets p1 and everything beneath it to decomposed.
+ p1.decompose()
+ for i in [p1, a, text]:
+ self.assertEqual(True, i.decomposed)
+ # p2 is unaffected.
+ self.assertEqual(False, p2.decomposed)
+
def test_string_set(self):
"""Tag.string = 'string'"""
soup = self.soup("
")
@@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
- def test_get_text_ignores_comments(self):
+ def test_get_text_ignores_special_string_containers(self):
soup = self.soup("foobar")
self.assertEqual(soup.get_text(), "foobar")
@@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
- def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(soup.get_text(), "foobar")
+
+ def test_all_strings_ignores_special_string_containers(self):
soup = self.soup("foobar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+ self.assertEqual(['foo', 'bar'], list(soup.strings))
+
+ def test_string_methods_inside_special_string_container_tags(self):
+ # Strings inside tags like
")
+
+ self.assertEqual(style.div.get_text(), "a")
+ self.assertEqual(list(style.div.strings), ["a"])
+ self.assertEqual(style.div.style.get_text(), "Some CSS")
+ self.assertEqual(list(style.div.style.strings),
+ ['Some CSS'])
+
+ # The comment is not picked up here. That's because it was
+ # parsed into a Comment object, which is not considered
+ # interesting by template.strings.
+ self.assertEqual(template.div.get_text(), "a")
+ self.assertEqual(list(template.div.strings), ["a"])
+ self.assertEqual(template.div.template.get_text(), "Templated text.")
+ self.assertEqual(list(template.div.template.strings),
+ ["Templated ", "text", "."])
+
+ # The comment is included here, because it didn't get parsed
+ # into a Comment object--it's part of the Script string.
+ self.assertEqual(script.div.get_text(), "a")
+ self.assertEqual(list(script.div.strings), ["a"])
+ self.assertEqual(script.div.script.get_text(),
+ "Some text")
+ self.assertEqual(list(script.div.script.strings),
+ ['Some text'])
+
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
@@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
else:
self.assertEqual(b'\\u2603 ', repr(soup))
-class TestFormatter(SoupTest):
-
- def test_sort_attributes(self):
- # Test the ability to override Formatter.attributes() to,
- # e.g., disable the normal sorting of attributes.
- class UnsortedFormatter(Formatter):
- def attributes(self, tag):
- self.called_with = tag
- for k, v in sorted(tag.attrs.items()):
- if k == 'ignore':
- continue
- yield k,v
-
- soup = self.soup('
')
- formatter = UnsortedFormatter()
- decoded = soup.decode(formatter=formatter)
-
- # attributes() was called on the tag. It filtered out one
- # attribute and sorted the other two.
- self.assertEqual(formatter.called_with, soup.p)
- self.assertEqual('
', decoded)
-
-
-class TestNavigableStringSubclasses(SoupTest):
-
- def test_cdata(self):
- # None of the current builders turn CDATA sections into CData
- # objects, but you can create them manually.
- soup = self.soup("")
- cdata = CData("foo")
- soup.insert(1, cdata)
- self.assertEqual(str(soup), "")
- self.assertEqual(soup.find(text="foo"), "foo")
- self.assertEqual(soup.contents[0], "foo")
-
- def test_cdata_is_never_formatted(self):
- """Text inside a CData object is passed into the formatter.
-
- But the return value is ignored.
- """
-
- self.count = 0
- def increment(*args):
- self.count += 1
- return "BITTER FAILURE"
-
- soup = self.soup("")
- cdata = CData("<><><>")
- soup.insert(1, cdata)
- self.assertEqual(
- b"<><>]]>", soup.encode(formatter=increment))
- self.assertEqual(1, self.count)
-
- def test_doctype_ends_in_newline(self):
- # Unlike other NavigableString subclasses, a DOCTYPE always ends
- # in a newline.
- doctype = Doctype("foo")
- soup = self.soup("")
- soup.insert(1, doctype)
- self.assertEqual(soup.encode(), b"\n")
-
- def test_declaration(self):
- d = Declaration("foo")
- self.assertEqual("", d.output_ready())
-
+
class TestSoupSelector(TreeTest):
HTML = """
@@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
- self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
+ self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
- SyntaxError, self.soup.select, "a:nth-of-type(a)")
+ SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
@@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
- self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
+ self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
- self.assertRaises(SyntaxError, self.soup.select, ',x, y')
- self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
+ self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
+ self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])