Update bs4 to 4.8.1 (with 2to3)

JonnyWong16 2019-11-23 18:54:24 -08:00
parent 23c4e5b09d
commit f28e741ad7
19 changed files with 5487 additions and 792 deletions

View file

@@ -5,26 +5,30 @@ http://www.crummy.com/software/BeautifulSoup/
 Beautiful Soup uses a pluggable XML or HTML parser to parse a
 (possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
 and/or html5lib is installed.
 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.2"
-__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
+__version__ = "4.8.1"
+__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 __all__ = ['BeautifulSoup']
 import os
 import re
+import sys
+import traceback
 import warnings
 from .builder import builder_registry, ParserRejectedMarkup
@@ -45,7 +49,7 @@ from .element import (
 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
 class BeautifulSoup(Tag):
     """
@@ -59,7 +63,7 @@ class BeautifulSoup(Tag):
       handle_starttag(name, attrs) # See note about return value
       handle_endtag(name)
       handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
+      endData(containerClass) # Ends the current data node
     No matter how complicated the underlying parser is, you should be
     able to build a tree using 'start tag' events, 'end tag' events,
@@ -69,21 +73,70 @@ class BeautifulSoup(Tag):
     like HTML's <br> tag), call handle_starttag and then
     handle_endtag.
     """
-    ROOT_TAG_NAME = u'[document]'
+    ROOT_TAG_NAME = '[document]'
     # If the end-user gives no indication which tree builder they
     # want, look for one with these features.
     DEFAULT_BUILDER_FEATURES = ['html', 'fast']
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
     def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
+                 parse_only=None, from_encoding=None, exclude_encodings=None,
+                 element_classes=None, **kwargs):
+        """Constructor.
+        :param markup: A string or a file-like object representing
+         markup to be parsed.
+        :param features: Desirable features of the parser to be used. This
+         may be the name of a specific parser ("lxml", "lxml-xml",
+         "html.parser", or "html5lib") or it may be the type of markup
+         to be used ("html", "html5", "xml"). It's recommended that you
+         name a specific parser, so that Beautiful Soup gives you the
+         same results across platforms and virtual environments.
+        :param builder: A TreeBuilder subclass to instantiate (or
+         instance to use) instead of looking one up based on
+         `features`. You only need to use this if you've implemented a
+         custom TreeBuilder.
+        :param parse_only: A SoupStrainer. Only parts of the document
+         matching the SoupStrainer will be considered. This is useful
+         when parsing part of a document that would otherwise be too
+         large to fit into memory.
+        :param from_encoding: A string indicating the encoding of the
+         document to be parsed. Pass this in if Beautiful Soup is
+         guessing wrongly about the document's encoding.
+        :param exclude_encodings: A list of strings indicating
+         encodings known to be wrong. Pass this in if you don't know
+         the document's encoding but you know Beautiful Soup's guess is
+         wrong.
+        :param element_classes: A dictionary mapping BeautifulSoup
+         classes like Tag and NavigableString to other classes you'd
+         like to be instantiated instead as the parse tree is
+         built. This is useful for using subclasses to modify the
+         default behavior of Tag or NavigableString.
+        :param kwargs: For backwards compatibility purposes, the
+         constructor accepts certain keyword arguments used in
+         Beautiful Soup 3. None of these arguments do anything in
+         Beautiful Soup 4; they will result in a warning and then be ignored.
+         Apart from this, any keyword arguments passed into the BeautifulSoup
+         constructor are propagated to the TreeBuilder constructor. This
+         makes it possible to configure a TreeBuilder beyond saying
+         which one to use.
+        """
         if 'convertEntities' in kwargs:
+            del kwargs['convertEntities']
             warnings.warn(
                 "BS4 does not respect the convertEntities argument to the "
                 "BeautifulSoup constructor. Entities are always converted "
@@ -114,9 +167,9 @@ class BeautifulSoup(Tag):
             del kwargs['isHTML']
             warnings.warn(
                 "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
+                "BeautifulSoup constructor. Suggest you use "
+                "features='lxml' for HTML and features='lxml-xml' for "
+                "XML.")
         def deprecated_argument(old_name, new_name):
             if old_name in kwargs:
@@ -134,13 +187,24 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")
-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-        if builder is None:
-            if isinstance(features, basestring):
+        if from_encoding and isinstance(markup, str):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+        self.element_classes = element_classes or dict()
+        # We need this information to track whether or not the builder
+        # was specified well enough that we can omit the 'you need to
+        # specify a parser' warning.
+        original_builder = builder
+        original_features = features
+        if isinstance(builder, type):
+            # A builder class was passed in; it needs to be instantiated.
+            builder_class = builder
+            builder = None
+        elif builder is None:
+            if isinstance(features, str):
                 features = [features]
             if features is None or len(features) == 0:
                 features = self.DEFAULT_BUILDER_FEATURES
@@ -150,21 +214,73 @@ class BeautifulSoup(Tag):
                     "Couldn't find a tree builder with the features you "
                     "requested: %s. Do you need to install a parser library?"
                     % ",".join(features))
-            builder = builder_class()
+        # At this point either we have a TreeBuilder instance in
+        # builder, or we have a builder_class that we can instantiate
+        # with the remaining **kwargs.
+        if builder is None:
+            builder = builder_class(**kwargs)
+            if not original_builder and not (
+                original_features == builder.NAME or
+                original_features in builder.ALTERNATE_NAMES
+            ):
+                if builder.is_xml:
+                    markup_type = "XML"
+                else:
+                    markup_type = "HTML"
+                # This code adapted from warnings.py so that we get the same line
+                # of code as our warnings.warn() call gets, even if the answer is wrong
+                # (as it may be in a multithreading situation).
+                caller = None
+                try:
+                    caller = sys._getframe(1)
+                except ValueError:
+                    pass
+                if caller:
+                    globals = caller.f_globals
+                    line_number = caller.f_lineno
+                else:
+                    globals = sys.__dict__
+                    line_number = 1
+                filename = globals.get('__file__')
+                if filename:
+                    fnl = filename.lower()
+                    if fnl.endswith((".pyc", ".pyo")):
+                        filename = filename[:-1]
+                if filename:
+                    # If there is no filename at all, the user is most likely in a REPL,
+                    # and the warning is not necessary.
+                    values = dict(
+                        filename=filename,
+                        line_number=line_number,
+                        parser=builder.NAME,
+                        markup_type=markup_type
+                    )
+                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+        else:
+            if kwargs:
+                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
         self.builder = builder
         self.is_xml = builder.is_xml
-        self.builder.soup = self
+        self.known_xml = self.is_xml
+        self._namespaces = dict()
         self.parse_only = parse_only
+        self.builder.initialize_soup(self)
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        elif len(markup) <= 256:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, str) and not '<' in markup)
+        ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if (isinstance(markup, unicode)
+            if (isinstance(markup, str)
                 and not os.path.supports_unicode_filenames):
                 possible_filename = markup.encode("utf8")
             else:
@@ -172,37 +288,93 @@ class BeautifulSoup(Tag):
                 is_file = False
                 try:
                     is_file = os.path.exists(possible_filename)
-                except Exception, e:
+                except Exception as e:
                     # This is almost certainly a problem involving
                     # characters not valid in filenames on this
                     # system. Just let it go.
                     pass
                 if is_file:
+                    if isinstance(markup, str):
+                        markup = markup.encode("utf8")
                     warnings.warn(
-                        '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-            if markup[:5] == "http:" or markup[:6] == "https:":
-                # TODO: This is ugly but I couldn't get it to work in
-                # Python 3 otherwise.
-                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
-                    warnings.warn(
-                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                        '"%s" looks like a filename, not markup. You should'
+                        ' probably open this file and pass the filehandle into'
+                        ' Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)
+        rejections = []
+        success = False
         for (self.markup, self.original_encoding, self.declared_html_encoding,
              self.contains_replacement_characters) in (
-                 self.builder.prepare_markup(markup, from_encoding)):
+                 self.builder.prepare_markup(
+                     markup, from_encoding, exclude_encodings=exclude_encodings)):
             self.reset()
             try:
                 self._feed()
+                success = True
                 break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                 pass
+        if not success:
+            other_exceptions = [str(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
         # Clear out the markup and remove the builder's circular
         # reference to this object.
         self.markup = None
         self.builder.soup = None
+    def __copy__(self):
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
+    def __getstate__(self):
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            d['builder'] = None
+        return d
+    @staticmethod
+    def _check_markup_is_url(markup):
+        """
+        Check if markup looks like it's actually a url and raise a warning
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, str):
+            space = ' '
+            cant_start_with = ("http:", "https:")
+        else:
+            return
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
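The new __copy__ and __getstate__ hooks make whole soups copyable, and picklable whenever the tree builder is picklable (html.parser's is; see the picklable flag introduced further down). A quick sketch of what they enable:

    import copy
    import pickle
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    clone = copy.copy(soup)      # re-parses a UTF-8 encoding of the tree
    assert clone.original_encoding == soup.original_encoding

    restored = pickle.loads(pickle.dumps(soup))  # builder is picklable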
@@ -223,15 +395,21 @@ class BeautifulSoup(Tag):
         self.preserve_whitespace_tag_stack = []
         self.pushTag(self)
-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+                sourceline=None, sourcepos=None, **kwattrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+        kwattrs.update(attrs)
+        return self.element_classes.get(Tag, Tag)(
+            None, self.builder, name, namespace, nsprefix, kwattrs,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
-    def new_string(self, s, subclass=NavigableString):
+    def new_string(self, s, subclass=None):
         """Create a new NavigableString associated with this soup."""
-        navigable = subclass(s)
-        navigable.setup()
-        return navigable
+        subclass = subclass or self.element_classes.get(
+            NavigableString, NavigableString
+        )
+        return subclass(s)
     def insert_before(self, successor):
         raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
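new_tag() now takes an attrs dict alongside keyword arguments, which matters for attribute names that aren't valid Python identifiers (data-*, for instance), and both factory methods honor element_classes. A short sketch with invented values:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div></div>", "html.parser")
    tag = soup.new_tag("a", attrs={"data-id": "1"}, href="http://example.com/")
    tag.string = soup.new_string("link text")
    soup.div.append(tag)
    print(soup.div)
    # <div><a data-id="1" href="http://example.com/">link text</a></div>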
@@ -250,16 +428,26 @@ class BeautifulSoup(Tag):
     def pushTag(self, tag):
         #print "Push", tag.name
-        if self.currentTag:
+        if self.currentTag is not None:
             self.currentTag.contents.append(tag)
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]
         if tag.name in self.builder.preserve_whitespace_tags:
             self.preserve_whitespace_tag_stack.append(tag)
-    def endData(self, containerClass=NavigableString):
+    def endData(self, containerClass=None):
+        # Default container is NavigableString.
+        containerClass = containerClass or NavigableString
+        # The user may want us to instantiate some alias for the
+        # container class.
+        containerClass = self.element_classes.get(
+            containerClass, containerClass
+        )
         if self.current_data:
-            current_data = u''.join(self.current_data)
+            current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
             # nothing but ASCII spaces, replace it with a single space
             # or newline.
@@ -289,15 +477,72 @@ class BeautifulSoup(Tag):
     def object_was_parsed(self, o, parent=None, most_recent_element=None):
         """Add an object to the parse tree."""
-        parent = parent or self.currentTag
-        most_recent_element = most_recent_element or self._most_recent_element
-        o.setup(parent, most_recent_element)
+        if parent is None:
+            parent = self.currentTag
         if most_recent_element is not None:
-            most_recent_element.next_element = o
+            previous_element = most_recent_element
+        else:
+            previous_element = self._most_recent_element
+        next_element = previous_sibling = next_sibling = None
+        if isinstance(o, Tag):
+            next_element = o.next_element
+            next_sibling = o.next_sibling
+            previous_sibling = o.previous_sibling
+            if previous_element is None:
+                previous_element = o.previous_element
+        fix = parent.next_element is not None
+        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
         self._most_recent_element = o
         parent.contents.append(o)
+        # Check if we are inserting into an already parsed node.
+        if fix:
+            self._linkage_fixer(parent)
+    def _linkage_fixer(self, el):
+        """Make sure linkage of this fragment is sound."""
+        first = el.contents[0]
+        child = el.contents[-1]
+        descendant = child
+        if child is first and el.parent is not None:
+            # Parent should be linked to first child
+            el.next_element = child
+            # We are no longer linked to whatever this element is
+            prev_el = child.previous_element
+            if prev_el is not None and prev_el is not el:
+                prev_el.next_element = None
+        # First child should be linked to the parent, and no previous siblings.
+        child.previous_element = el
+        child.previous_sibling = None
+        # We have no sibling as we've been appended as the last.
+        child.next_sibling = None
+        # This index is a tag, dig deeper for a "last descendant"
+        if isinstance(child, Tag) and child.contents:
+            descendant = child._last_descendant(False)
+        # As the final step, link last descendant. It should be linked
+        # to the parent's next sibling (if found), else walk up the chain
+        # and find a parent with a sibling. It should have no next sibling.
+        descendant.next_element = None
+        descendant.next_sibling = None
+        target = el
+        while True:
+            if target is None:
+                break
+            elif target.next_sibling is not None:
+                descendant.next_element = target.next_sibling
+                target.next_sibling.previous_element = child
+                break
+            target = target.parent
     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
         instance of the given tag. If inclusivePop is false, pops the tag
@@ -321,11 +566,12 @@ class BeautifulSoup(Tag):
         return most_recently_popped
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+                        sourcepos=None):
         """Push a start tag on to the stack.
         If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
+        SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """
@@ -338,11 +584,14 @@ class BeautifulSoup(Tag):
              or not self.parse_only.search_tag(name, attrs))):
             return None
-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self._most_recent_element)
+        tag = self.element_classes.get(Tag, Tag)(
+            self, self.builder, name, namespace, nsprefix, attrs,
+            self.currentTag, self._most_recent_element,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
         if tag is None:
             return tag
-        if self._most_recent_element:
+        if self._most_recent_element is not None:
             self._most_recent_element.next_element = tag
         self._most_recent_element = tag
         self.pushTag(tag)
@@ -367,9 +616,9 @@ class BeautifulSoup(Tag):
             encoding_part = ''
             if eventual_encoding != None:
                 encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
         else:
-            prefix = u''
+            prefix = ''
         if not pretty_print:
             indent_level = None
         else:
@@ -403,4 +652,4 @@ class FeatureNotFound(ValueError):
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
+    print(soup.prettify())

View file

@@ -1,10 +1,13 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
-    whitespace_re
+    nonwhitespace_re
     )
 __all__ = [
@@ -80,21 +83,70 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
     """Turn a document into a Beautiful Soup object tree."""
+    NAME = "[Unknown tree builder]"
+    ALTERNATE_NAMES = []
     features = []
     is_xml = False
-    preserve_whitespace_tags = set()
+    picklable = False
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+    USE_DEFAULT = object()
-    def __init__(self):
+    # Most parsers don't keep track of line numbers.
+    TRACKS_LINE_NUMBERS = False
+    def __init__(self, multi_valued_attributes=USE_DEFAULT,
+                 preserve_whitespace_tags=USE_DEFAULT,
+                 store_line_numbers=USE_DEFAULT):
+        """Constructor.
+        :param multi_valued_attributes: If this is set to None, the
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this do a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.
+        :param preserve_whitespace_tags: A list of tags to treat
+         the way <pre> tags are treated in HTML. Tags in this list
+         will have
+        :param store_line_numbers: If the parser keeps track of the
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
+        """
         self.soup = None
+        if multi_valued_attributes is self.USE_DEFAULT:
+            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = multi_valued_attributes
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+        if store_line_numbers == self.USE_DEFAULT:
+            store_line_numbers = self.TRACKS_LINE_NUMBERS
+        self.store_line_numbers = store_line_numbers
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup
     def reset(self):
         pass
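Because leftover keyword arguments to the BeautifulSoup constructor are now forwarded to the TreeBuilder constructor, these options can be set per parse. A sketch of the two main knobs, under the behavior documented above (markup invented):

    from bs4 import BeautifulSoup

    # Turn off multi-valued attributes: 'class' stays a single string.
    soup = BeautifulSoup('<p class="a b">x</p>', "html.parser",
                         multi_valued_attributes=None)
    print(soup.p["class"])  # 'a b'

    # Treat <p> the way <pre> is treated: keep its whitespace verbatim.
    soup = BeautifulSoup("<p>  spaced  </p>", "html.parser",
                         preserve_whitespace_tags=["p"])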
@@ -118,13 +170,13 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
     def feed(self, markup):
         raise NotImplementedError()
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        return markup, None, None, False
+                       document_declared_encoding=None, exclude_encodings=None):
+        yield markup, None, None, False
     def test_fragment_to_document(self, fragment):
         """Wrap an HTML fragment to make it look like a document.
@@ -153,14 +205,14 @@ class TreeBuilder(object):
             universal = self.cdata_list_attributes.get('*', [])
             tag_specific = self.cdata_list_attributes.get(
                 tag_name.lower(), None)
-            for attr in attrs.keys():
+            for attr in list(attrs.keys()):
                 if attr in universal or (tag_specific and attr in tag_specific):
                     # We have a "class"-type attribute whose string
                     # value is a whitespace-separated list of
                     # values. Split it into a list.
                     value = attrs[attr]
-                    if isinstance(value, basestring):
-                        values = whitespace_re.split(value)
+                    if isinstance(value, str):
+                        values = nonwhitespace_re.findall(value)
                     else:
                         # html5lib sometimes calls setAttributes twice
                         # for the same tag when rearranging the parse
@@ -224,10 +276,20 @@ class HTMLTreeBuilder(TreeBuilder):
     Such as which tags are empty-element tags.
     """
-    preserve_whitespace_tags = set(['pre', 'textarea'])
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                              'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+    ])
+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
     # class="foo bar" means that the 'class' attribute has two values,
@@ -235,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
    # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
         "*" : ['class', 'accesskey', 'dropzone'],
         "a" : ['rel', 'rev'],
         "link" : ['rel', 'rev'],
@@ -252,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
         }
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
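The renamed DEFAULT_CDATA_LIST_ATTRIBUTES table still drives the familiar splitting of class-like attributes, but nonwhitespace_re.findall() can no longer produce the empty strings that whitespace_re.split() yielded on leading or trailing whitespace. For example:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p class=" a  b " id="one two">x</p>', "html.parser")
    print(soup.p["class"])  # ['a', 'b'] -- no empty-string entries
    print(soup.p["id"])     # 'one two' -- 'id' is not multi-valued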
@@ -299,8 +363,15 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 class ParserRejectedMarkup(Exception):
-    pass
+    def __init__(self, message_or_exception):
+        """Explain why the parser rejected the given markup, either
+        with a textual explanation or another exception.
+        """
+        if isinstance(message_or_exception, Exception):
+            e = message_or_exception
+            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
+        super(ParserRejectedMarkup, self).__init__(message_or_exception)
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only

View file

@@ -1,17 +1,27 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 __all__ = [
     'HTML5TreeBuilder',
     ]
 import warnings
+import re
 from bs4.builder import (
     PERMISSIVE,
     HTML,
     HTML_5,
     HTMLTreeBuilder,
     )
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+    NamespacedAttribute,
+    nonwhitespace_re,
+)
 import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+)
 from bs4.element import (
     Comment,
     Doctype,
@@ -19,14 +29,36 @@ from bs4.element import (
     Tag,
     )
+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError as e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+    NAME = "html5lib"
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
+    # html5lib can tell us which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
-    def prepare_markup(self, markup, user_specified_encoding):
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
         yield (markup, None, None, False)
     # These methods are defined by Beautiful Soup.
@@ -34,32 +66,63 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if self.soup.parse_only is not None:
             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+        self.underlying_builder.parser = parser
+        extra_kwargs = dict()
+        if not isinstance(markup, str):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
         # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, str):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
+        self.underlying_builder.parser = None
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup,
+            store_line_numbers=self.store_line_numbers
+        )
         return self.underlying_builder
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+    def __init__(self, namespaceHTMLElements, soup=None,
+                 store_line_numbers=True, **kwargs):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            # TODO: Why is the parser 'html.parser' here? To avoid an
+            # infinite loop?
+            self.soup = BeautifulSoup(
+                "", "html.parser", store_line_numbers=store_line_numbers,
+                **kwargs
+            )
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+        # This will be set later to an html5lib.html5parser.HTMLParser
+        # object, which we can use to track the current line number.
+        self.parser = None
+        self.store_line_numbers = store_line_numbers
     def documentClass(self):
         self.soup.reset()
         return Element(self.soup, self.soup, None)
@@ -73,14 +136,26 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         self.soup.object_was_parsed(doctype)
     def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
+        kwargs = {}
+        if self.parser and self.store_line_numbers:
+            # This represents the point immediately after the end of the
+            # tag. We don't know when the tag started, but we do know
+            # where it ended -- the character just before this one.
+            sourceline, sourcepos = self.parser.tokenizer.stream.position()
+            kwargs['sourceline'] = sourceline
+            kwargs['sourcepos'] = sourcepos-1
+        tag = self.soup.new_tag(name, namespace, **kwargs)
         return Element(tag, self.soup, namespace)
     def commentClass(self, data):
         return TextNode(Comment(data), self.soup)
     def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        # TODO: Why is the parser 'html.parser' here? To avoid an
+        # infinite loop?
+        self.soup = BeautifulSoup("", "html.parser")
         self.soup.name = "[document_fragment]"
         return Element(self.soup, self.soup, None)
@@ -92,7 +167,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         return self.soup
     def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in list(element.attrs.items()):
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+        return "\n".join(rv)
 class AttrList(object):
     def __init__(self, element):
@@ -101,7 +226,16 @@ class AttrList(object):
     def __iter__(self):
         return list(self.attrs.items()).__iter__()
     def __setitem__(self, name, value):
-        "set attr", name, value
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = self.element.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            # A node that is being cloned may have already undergone
+            # this procedure.
+            if not isinstance(value, list):
+                value = nonwhitespace_re.findall(value)
         self.element[name] = value
     def items(self):
         return list(self.attrs.items())
@@ -115,16 +249,16 @@ class AttrList(object):
         return name in list(self.attrs.keys())
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
     def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
         self.element = element
         self.soup = soup
         self.namespace = namespace
     def appendChild(self, node):
         string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
             # Some other piece of code decided to pass in a string
             # instead of creating a TextElement object to contain the
             # string.
@@ -136,13 +270,15 @@ class Element(html5lib.treebuilders._base.Node):
             child = node
         elif node.element.__class__ == NavigableString:
             string_child = child = node.element
+            node.parent = self
         else:
             child = node.element
+            node.parent = self
-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
             node.element.extract()
-        if (string_child and self.element.contents
+        if (string_child is not None and self.element.contents
             and self.element.contents[-1].__class__ == NavigableString):
             # We are appending a string onto another string.
             # TODO This has O(n^2) performance, for input like
@@ -152,7 +288,7 @@ class Element(html5lib.treebuilders._base.Node):
             old_element.replace_with(new_element)
             self.soup._most_recent_element = new_element
         else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                 # Create a brand new NavigableString from this string.
                 child = self.soup.new_string(node)
@@ -161,6 +297,12 @@ class Element(html5lib.treebuilders._base.Node):
             # immediately after the parent, if it has no children.)
             if self.element.contents:
                 most_recent_element = self.element._last_descendant(False)
+            elif self.element.next_element is not None:
+                # Something from further ahead in the parse tree is
+                # being inserted into this earlier element. This is
+                # very annoying because it means an expensive search
+                # for the last element in the tree.
+                most_recent_element = self.soup._last_descendant()
             else:
                 most_recent_element = self.element
@@ -169,9 +311,12 @@ class Element(html5lib.treebuilders._base.Node):
             most_recent_element=most_recent_element)
     def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
         return AttrList(self.element)
     def setAttributes(self, attributes):
         if attributes is not None and len(attributes) > 0:
             converted_attributes = []
@@ -183,7 +328,7 @@ class Element(html5lib.treebuilders._base.Node):
             self.soup.builder._replace_cdata_list_attribute_values(
                 self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                 self.element[name] = value
             # The attributes may contain variables that need substitution.
@@ -195,11 +340,11 @@ class Element(html5lib.treebuilders._base.Node):
     attributes = property(getAttributes, setAttributes)
     def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
         if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
-            self.insertBefore(data, insertBefore)
+            self.insertBefore(text, insertBefore)
         else:
-            self.appendChild(data)
+            self.appendChild(text)
     def insertBefore(self, node, refNode):
         index = self.element.index(refNode.element)
@@ -218,6 +363,10 @@ class Element(html5lib.treebuilders._base.Node):
     def reparentChildren(self, new_parent):
         """Move all of this tag's children into another tag."""
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
         element = self.element
         new_parent_element = new_parent.element
         # Determine what this tag's next_element will be once all the children
@@ -236,18 +385,35 @@ class Element(html5lib.treebuilders._base.Node):
         new_parents_last_descendant_next_element = new_parent_element.next_element
         to_append = element.contents
-        append_after = new_parent.element.contents
         if len(to_append) > 0:
             # Set the first child's previous_element and previous_sibling
             # to elements within the new parent
             first_child = to_append[0]
-            first_child.previous_element = new_parents_last_descendant
+            if new_parents_last_descendant is not None:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
             first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant is not None:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child is not None:
+                new_parents_last_child.next_sibling = first_child
-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
-            last_child.next_sibling = None
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element is not None:
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None
         for child in to_append:
             child.parent = new_parent_element
@@ -257,6 +423,10 @@ class Element(html5lib.treebuilders._base.Node):
         element.contents = []
         element.next_element = final_next_element
+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
     def cloneNode(self):
         tag = self.soup.new_tag(self.element.name, self.namespace)
         node = Element(tag, self.soup, self.namespace)
@@ -268,7 +438,7 @@ class Element(html5lib.treebuilders._base.Node):
         return self.element.contents
     def getNameTuple(self):
-        if self.namespace is None:
+        if self.namespace == None:
             return namespaces["html"], self.name
         else:
             return self.namespace, self.name
@@ -277,7 +447,7 @@ class Element(html5lib.treebuilders._base.Node):
 class TextNode(Element):
     def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
         self.element = element
         self.soup = soup
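With the treebuilder_base shim above, the builder imports cleanly against both pre- and post-0.99999999 html5lib releases; usage is unchanged (assuming html5lib is installed):

    from bs4 import BeautifulSoup

    # html5lib repairs markup the way a browser would.
    soup = BeautifulSoup("<p>one<p>two", "html5lib")
    print(soup.body)  # <body><p>one</p><p>two</p></body>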

View file

@@ -1,13 +1,23 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 __all__ = [
     'HTMLParserTreeBuilder',
     ]
-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from html.parser import HTMLParser
+try:
+    from html.parser import HTMLParseError
+except ImportError as e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
 import sys
 import warnings
@@ -19,10 +29,10 @@ import warnings
 # At the end of this file, we monkeypatch HTMLParser so that
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
-    major > 3
-    or (major == 3 and minor > 2)
-    or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
 from bs4.element import (
     CData,
@@ -43,7 +53,42 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+    def __init__(self, *args, **kwargs):
+        HTMLParser.__init__(self, *args, **kwargs)
+        # Keep a list of empty-element tags that were encountered
+        # without an explicit closing tag. If we encounter a closing tag
+        # of this type, we'll associate it with one of those entries.
+        #
+        # This isn't a stack because we don't care about the
+        # order. It's a list of closing tags we've already handled and
+        # will ignore, assuming they ever show up.
+        self.already_closed_empty_element = []
+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+        In Python 2, HTMLParser implements error() as raising an exception.
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+    def handle_startendtag(self, name, attrs):
+        # This is only called when the markup looks like
+        # <tag/>.
+        # is_startend() tells handle_starttag not to close the tag
+        # just because its name matches a known empty-element tag. We
+        # know that this is an empty-element tag and we want to call
+        # handle_endtag ourselves.
+        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+        self.handle_endtag(name)
+    def handle_starttag(self, name, attrs, handle_empty_element=True):
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -53,17 +98,46 @@ class BeautifulSoupHTMLParser(HTMLParser):
                 value = ''
             attr_dict[key] = value
             attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
+        #print "START", name
+        sourceline, sourcepos = self.getpos()
+        tag = self.soup.handle_starttag(
+            name, None, None, attr_dict, sourceline=sourceline,
+            sourcepos=sourcepos
+        )
+        if tag and tag.is_empty_element and handle_empty_element:
+            # Unlike other parsers, html.parser doesn't send separate end tag
+            # events for empty-element tags. (It's handled in
+            # handle_startendtag, but only if the original markup looked like
+            # <tag/>.)
+            #
+            # So we need to call handle_endtag() ourselves. Since we
+            # know the start event is identical to the end event, we
+            # don't want handle_endtag() to cross off any previous end
+            # events for tags of this name.
+            self.handle_endtag(name, check_already_closed=False)
-    def handle_endtag(self, name):
-        self.soup.handle_endtag(name)
+            # But we might encounter an explicit closing tag for this tag
+            # later on. If so, we want to ignore it.
+            self.already_closed_empty_element.append(name)
+    def handle_endtag(self, name, check_already_closed=True):
+        #print "END", name
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            # print "ALREADY CLOSED", name
+            self.already_closed_empty_element.remove(name)
+        else:
+            self.soup.handle_endtag(name)
     def handle_data(self, data):
         self.soup.handle_data(data)
     def handle_charref(self, name):
         # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed.
+        # it's fixed in all supported versions.
+        # http://bugs.python.org/issue13633
         if name.startswith('x'):
             real_name = int(name.lstrip('x'), 16)
         elif name.startswith('X'):
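The net effect of the bookkeeping above is that html.parser now produces one balanced element no matter how an empty-element tag is written. A quick check:

    from bs4 import BeautifulSoup

    for markup in ("<br>", "<br/>", "<br></br>"):
        soup = BeautifulSoup(markup, "html.parser")
        print(soup.br.is_empty_element, str(soup.br))  # True <br/>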
@ -71,11 +145,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else: else:
real_name = int(name) real_name = int(name)
try: data = None
data = unichr(real_name) if real_name < 256:
except (ValueError, OverflowError), e: # HTML numeric entities are supposed to reference Unicode
data = u"\N{REPLACEMENT CHARACTER}" # code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError as e:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data) self.handle_data(data)
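
A short sketch of the compensation described above: &#147; is a Windows-1252 code point, not a valid Unicode reference, and it now decodes to the intended curly quote (test_entities_in_foreign_document_encoding below exercises the same path).

from bs4 import BeautifulSoup

# &#147; and &#148; are Windows-1252 references for the curly quotes.
soup = BeautifulSoup("<p>&#147;Hello&#148;</p>", "html.parser")
print(soup.p.string)  # “Hello”
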
def handle_entityref(self, name): def handle_entityref(self, name):
@@ -83,7 +172,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None: if character is not None:
data = character data = character
else: else:
data = "&%s;" % name # If this were XML, it would be ambiguous whether "&foo"
# was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data) self.handle_data(data)
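
A sketch of that fallback: a known entity is converted, while a bare "&T" that merely resembles one is kept as literal text.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>&bull; AT&T</p>", "html.parser")
print(soup.p.string)  # • AT&T
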
def handle_comment(self, data): def handle_comment(self, data):
@@ -113,14 +207,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data): def handle_pi(self, data):
self.soup.endData() self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data) self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction) self.soup.endData(ProcessingInstruction)
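
Sketch of the net effect for html.parser: with the question-mark stripping removed, a processing instruction round-trips unchanged (compare test_processing_instruction in the test helpers below).

from bs4 import BeautifulSoup

soup = BeautifulSoup("<?PITarget PIContent?>", "html.parser")
print(soup.decode())  # <?PITarget PIContent?>
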
@@ -128,26 +214,38 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder): class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False is_xml = False
features = [HTML, STRICT, HTMLPARSER] picklable = True
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs): # The html.parser knows which line number and position in the
if CONSTRUCTOR_TAKES_STRICT: # original file is the source of an element.
kwargs['strict'] = False TRACKS_LINE_NUMBERS = True
self.parser_args = (args, kwargs)
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
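
A sketch of the line-number tracking advertised by TRACKS_LINE_NUMBERS, assuming the html.parser builder: each tag remembers where HTMLParser.getpos() said it started.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<html>\n<body>\n<p>hi</p>\n</body>\n</html>", "html.parser")
print(soup.p.sourceline, soup.p.sourcepos)  # 3 0
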
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None): document_declared_encoding=None, exclude_encodings=None):
""" """
:return: A 4-tuple (markup, original encoding, encoding :return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER). replaced with REPLACEMENT CHARACTER).
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
yield (markup, None, None, False) yield (markup, None, None, False)
return return
try_encodings = [user_specified_encoding, document_declared_encoding] try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True) dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
yield (dammit.markup, dammit.original_encoding, yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding, dammit.declared_html_encoding,
dammit.contains_replacement_characters) dammit.contains_replacement_characters)
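
Sketch of the new exclude_encodings plumbing: ruling a guess out forces the detector to move on to the next candidate. The exact encoding reported depends on which chardet library, if any, is installed.

from bs4 import BeautifulSoup

soup = BeautifulSoup(b"<p>Sacr\xe9 bleu!</p>", "html.parser",
                     exclude_encodings=["iso-8859-7"])
print(soup.original_encoding)  # e.g. windows-1252, never iso-8859-7
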
@@ -158,10 +256,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup parser.soup = self.soup
try: try:
parser.feed(markup) parser.feed(markup)
except HTMLParseError, e: parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning( warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e raise e
parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a # 3.2.3 code. This ensures they don't treat markup like <p></p> as a


@@ -1,13 +1,26 @@
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = [ __all__ = [
'LXMLTreeBuilderForXML', 'LXMLTreeBuilderForXML',
'LXMLTreeBuilder', 'LXMLTreeBuilder',
] ]
try:
from collections.abc import Callable # Python 3.6
except ImportError as e:
from collections import Callable
from io import BytesIO from io import BytesIO
from StringIO import StringIO from io import StringIO
import collections
from lxml import etree from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.element import (
Comment,
Doctype,
NamespacedAttribute,
ProcessingInstruction,
XMLProcessingInstruction,
)
from bs4.builder import ( from bs4.builder import (
FAST, FAST,
HTML, HTML,
@@ -20,19 +33,55 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml' LXML = 'lxml'
def _invert(d):
"Invert a dictionary."
return dict((v,k) for k, v in list(d.items()))
class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True is_xml = True
processing_instruction_class = XMLProcessingInstruction
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
# Well, it's permissive by XML parser standards. # Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE] features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512 CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace # This namespace mapping is specified in the XML Namespace
# standard. # standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
# NOTE: If we parsed Element objects and looked at .sourceline,
# we'd be able to see the line numbers from the original document.
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
def _register_namespaces(self, mapping):
"""Let the BeautifulSoup object know about namespaces encountered
while parsing the document.
This might be useful later on when creating CSS selectors.
"""
for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
# Let the BeautifulSoup object know about a new namespace.
# If there are multiple namespaces defined with the same
# prefix, the first one in the document takes precedence.
self.soup._namespaces[key] = value
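
A sketch of the CSS-selector payoff mentioned in the docstring, assuming the soupsieve-backed select() consults soup._namespaces: a prefix seen at parse time can be used directly in a selector.

from bs4 import BeautifulSoup

doc = '<root xmlns:ns="http://example.com/ns"><ns:item>x</ns:item></root>'
soup = BeautifulSoup(doc, "lxml-xml")
print(soup.select("ns|item"))  # [<ns:item>x</ns:item>]
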
def default_parser(self, encoding): def default_parser(self, encoding):
# This can either return a parser object or a class, which # This can either return a parser object or a class, which
@@ -46,12 +95,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser. # Use the default parser.
parser = self.default_parser(encoding) parser = self.default_parser(encoding)
if isinstance(parser, collections.Callable): if isinstance(parser, Callable):
# Instantiate the parser with default arguments # Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding) parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser return parser
def __init__(self, parser=None, empty_element_tags=None): def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a # TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new # callable, since that means there's no way to create new
# parsers for different encodings. # parsers for different encodings.
@@ -59,8 +108,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None: if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags) self.empty_element_tags = set(empty_element_tags)
self.soup = None self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py. # name. Copied from lxml's src/lxml/sax.py.
@@ -70,6 +120,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return (None, tag) return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None): document_declared_encoding=None):
""" """
:yield: A series of 4-tuples. :yield: A series of 4-tuples.
@@ -78,31 +129,37 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document. Each 4-tuple represents a strategy for parsing the document.
""" """
if isinstance(markup, unicode):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
# Instead of using UnicodeDammit to convert the bytestring to # Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to # Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse # iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn. # the document as each one in turn.
is_html = not self.is_xml is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
else:
self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
try_encodings = [user_specified_encoding, document_declared_encoding] try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(markup, try_encodings, is_html) detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
for encoding in detector.encodings: for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False) yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup): def feed(self, markup):
if isinstance(markup, bytes): if isinstance(markup, bytes):
markup = BytesIO(markup) markup = BytesIO(markup)
elif isinstance(markup, unicode): elif isinstance(markup, str):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
@@ -117,30 +174,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0: if len(data) != 0:
self.parser.feed(data) self.parser.feed(data)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(e)
def close(self): def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}): def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs) attrs = dict(attrs)
nsprefix = None nsprefix = None
# Invert each namespace map as it comes in. # Invert each namespace map as it comes in.
if len(self.nsmaps) > 1: if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but # There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a # non-default namespaces are in play, so we need a
# separate tag stack to know when they end. # separate tag stack to know when they end.
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap) # First, let the BeautifulSoup object know about it.
self._register_namespaces(nsmap)
# Then, add it to our running list of inverted namespace
# mappings.
self.nsmaps.append(_invert(nsmap))
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later. # tag, so we can recreate it later.
attrs = attrs.copy() attrs = attrs.copy()
for prefix, namespace in nsmap.items(): for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute( attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
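
Sketch of the attribute round-tripping described above: the xmlns declaration survives as an attribute on the tag that declared it.

from bs4 import BeautifulSoup

doc = '<root xmlns:a="http://example.com/a"><a:b/></root>'
soup = BeautifulSoup(doc, "lxml-xml")
print(soup.root.attrs)  # {'xmlns:a': 'http://example.com/a'}
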
@@ -149,7 +212,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and # from lxml with namespaces attached to their names, and
# turn them into NamespacedAttribute objects. # turn them into NamespacedAttribute objects.
new_attrs = {} new_attrs = {}
for attr, value in attrs.items(): for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr) namespace, attr = self._getNsTag(attr)
if namespace is None: if namespace is None:
new_attrs[attr] = value new_attrs[attr] = value
@@ -189,7 +252,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.pop() self.nsmaps.pop()
def pi(self, target, data): def pi(self, target, data):
pass self.soup.endData()
self.soup.handle_data(target + ' ' + data)
self.soup.endData(self.processing_instruction_class)
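
Sketch of the new pi() handler with the lxml-xml builder: processing instructions are now kept in the tree instead of silently dropped.

from bs4 import BeautifulSoup

doc = "<?xml-stylesheet type='text/xsl' href='s.xsl'?><root/>"
soup = BeautifulSoup(doc, "lxml-xml")
print(soup.decode())  # the <?xml-stylesheet ...?> node is preserved
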
def data(self, content): def data(self, content):
self.soup.handle_data(content) self.soup.handle_data(content)
@@ -207,13 +272,17 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE] NAME = LXML
ALTERNATE_NAMES = ["lxml-html"]
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False is_xml = False
processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding): def default_parser(self, encoding):
return etree.HTMLParser return etree.HTMLParser
@@ -224,10 +293,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding) self.parser = self.parser_for(encoding)
self.parser.feed(markup) self.parser.feed(markup)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment return '<html><body>%s</body></html>' % fragment

lib/bs4/check_block.py Normal file

@@ -0,0 +1,4 @@
import requests
data = requests.get("https://www.crummy.com/").content
from bs4 import _s
data = [x for x in _s(data).block_text()]


@@ -3,12 +3,14 @@
This library converts a bytestream to Unicode through any means This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and XML, but it does not rewrite the Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job. XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import codecs import codecs
from htmlentitydefs import codepoint2name from html.entities import codepoint2name
import re import re
import logging import logging
import string import string
@@ -20,6 +22,8 @@ try:
# PyPI package: cchardet # PyPI package: cchardet
import cchardet import cchardet
def chardet_dammit(s): def chardet_dammit(s):
if isinstance(s, str):
return None
return cchardet.detect(s)['encoding'] return cchardet.detect(s)['encoding']
except ImportError: except ImportError:
try: try:
@@ -28,6 +32,8 @@ except ImportError:
# PyPI package: chardet # PyPI package: chardet
import chardet import chardet
def chardet_dammit(s): def chardet_dammit(s):
if isinstance(s, str):
return None
return chardet.detect(s)['encoding'] return chardet.detect(s)['encoding']
#import chardet.constants #import chardet.constants
#chardet.constants._debug = 1 #chardet.constants._debug = 1
@@ -42,10 +48,19 @@ try:
except ImportError: except ImportError:
pass pass
xml_encoding_re = re.compile( # Build bytestring and Unicode versions of regular expressions for finding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) # a declared encoding inside an XML or HTML document.
html_meta_re = re.compile( xml_encoding = '^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
'html' : re.compile(html_meta.encode("ascii"), re.I),
'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
'html' : re.compile(html_meta, re.I),
'xml' : re.compile(xml_encoding, re.I)
}
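
Sketch of the two regex families in action via EncodingDetector.find_declared_encoding(), which now accepts either bytes or str markup:

from bs4.dammit import EncodingDetector

print(EncodingDetector.find_declared_encoding(
    b'<?xml version="1.0" encoding="ISO-8859-1"?><root/>'))  # iso-8859-1
print(EncodingDetector.find_declared_encoding(
    '<meta charset="utf-8">', is_html=True))                 # utf-8
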
class EntitySubstitution(object): class EntitySubstitution(object):
@@ -55,15 +70,24 @@ class EntitySubstitution(object):
lookup = {} lookup = {}
reverse_lookup = {} reverse_lookup = {}
characters_for_re = [] characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint) # &apos; is an XHTML and HTML 5 entity, but not an HTML 4
if codepoint != 34: # entity. We don't want to use it, but we want to recognize it on the way in.
#
# TODO: Ideally we would be able to recognize all HTML 5 named
# entities, but that's a little tricky.
extra = [(39, 'apos')]
for codepoint, name in list(codepoint2name.items()) + extra:
character = chr(codepoint)
if codepoint not in (34, 39):
# There's no point in turning the quotation mark into # There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which # &quot; or the single quote into &apos;, unless it
# is handled elsewhere. # happens within an attribute value, which is handled
# elsewhere.
characters_for_re.append(character) characters_for_re.append(character)
lookup[character] = name lookup[character] = name
# But we do want to turn &quot; into the quotation mark. # But we do want to recognize those entities on the way in and
# convert them to Unicode characters.
reverse_lookup[name] = character reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re) re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition) return lookup, reverse_lookup, re.compile(re_definition)
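
Sketch of the asymmetry set up here: &apos; is recognized on the way in, while quotation marks and apostrophes are left alone on the way out unless an attribute value requires otherwise.

from bs4 import BeautifulSoup
from bs4.dammit import EntitySubstitution

print(BeautifulSoup("<p>Bob&apos;s Bar</p>", "html.parser").p.string)
# Bob's Bar
print(EntitySubstitution.substitute_html("café “quoted”"))
# caf&eacute; &ldquo;quoted&rdquo;
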
@@ -79,7 +103,7 @@ class EntitySubstitution(object):
} }
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")") ")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])") AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@@ -212,8 +236,11 @@ class EncodingDetector:
5. Windows-1252. 5. Windows-1252.
""" """
def __init__(self, markup, override_encodings=None, is_html=False): def __init__(self, markup, override_encodings=None, is_html=False,
exclude_encodings=None):
self.override_encodings = override_encodings or [] self.override_encodings = override_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None self.chardet_encoding = None
self.is_html = is_html self.is_html = is_html
self.declared_encoding = None self.declared_encoding = None
@@ -224,6 +251,8 @@ class EncodingDetector:
def _usable(self, encoding, tried): def _usable(self, encoding, tried):
if encoding is not None: if encoding is not None:
encoding = encoding.lower() encoding = encoding.lower()
if encoding in self.exclude_encodings:
return False
if encoding not in tried: if encoding not in tried:
tried.add(encoding) tried.add(encoding)
return True return True
@@ -266,6 +295,9 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data): def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies.""" """If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None encoding = None
if isinstance(data, str):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'): and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be' encoding = 'utf-16be'
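
Sketch of the classmethod this hunk belongs to: a byte-order mark is stripped from the data and treated as decisive evidence of the encoding.

from bs4.dammit import EncodingDetector

data, encoding = EncodingDetector.strip_byte_order_mark(b"\xef\xbb\xbfhi")
print(data, encoding)  # b'hi' utf-8
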
@@ -300,14 +332,22 @@ class EncodingDetector:
xml_endpos = 1024 xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05)) html_endpos = max(2048, int(len(markup) * 0.05))
if isinstance(markup, bytes):
res = encoding_res[bytes]
else:
res = encoding_res[str]
xml_re = res['xml']
html_re = res['html']
declared_encoding = None declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html: if not declared_encoding_match and is_html:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) declared_encoding_match = html_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None: if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode( declared_encoding = declared_encoding_match.groups()[0]
'ascii')
if declared_encoding: if declared_encoding:
if isinstance(declared_encoding, bytes):
declared_encoding = declared_encoding.decode('ascii', 'replace')
return declared_encoding.lower() return declared_encoding.lower()
return None return None
@@ -331,18 +371,19 @@ class UnicodeDammit:
] ]
def __init__(self, markup, override_encodings=[], def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False): smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to self.smart_quotes_to = smart_quotes_to
self.tried_encodings = [] self.tried_encodings = []
self.contains_replacement_characters = False self.contains_replacement_characters = False
self.is_html = is_html self.is_html = is_html
self.log = logging.getLogger(__name__)
self.detector = EncodingDetector(markup, override_encodings, is_html) self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with. # Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '': if isinstance(markup, str) or markup == '':
self.markup = markup self.markup = markup
self.unicode_markup = unicode(markup) self.unicode_markup = str(markup)
self.original_encoding = None self.original_encoding = None
return return
@@ -365,9 +406,10 @@ class UnicodeDammit:
if encoding != "ascii": if encoding != "ascii":
u = self._convert_from(encoding, "replace") u = self._convert_from(encoding, "replace")
if u is not None: if u is not None:
logging.warning( self.log.warning(
"Some characters could not be decoded, and were " "Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER.") "replaced with REPLACEMENT CHARACTER."
)
self.contains_replacement_characters = True self.contains_replacement_characters = True
break break
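
End-to-end sketch of UnicodeDammit after the port, with an override encoding so the result is deterministic:

from bs4.dammit import UnicodeDammit

dammit = UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"])
print(dammit.unicode_markup)     # Sacré bleu!
print(dammit.original_encoding)  # latin-1
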
@@ -425,7 +467,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"): def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode. '''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases''' %encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors) return str(data, encoding, errors)
@property @property
def declared_html_encoding(self): def declared_html_encoding(self):
@@ -723,7 +765,7 @@ class UnicodeDammit:
0xde : b'\xc3\x9e', # Þ 0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß 0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à 0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xa1', # á 0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â 0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã 0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä 0xe4 : b'\xc3\xa4', # ä


@@ -1,7 +1,11 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import cProfile import cProfile
from StringIO import StringIO from io import StringIO
from HTMLParser import HTMLParser from html.parser import HTMLParser
import bs4 import bs4
from bs4 import BeautifulSoup, __version__ from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry from bs4.builder import builder_registry
@@ -17,8 +21,8 @@ import cProfile
def diagnose(data): def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__ print("Diagnostic running on Beautiful Soup %s" % __version__)
print "Python version %s" % sys.version print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@@ -27,44 +31,60 @@ def diagnose(data):
break break
else: else:
basic_parsers.remove(name) basic_parsers.remove(name)
print ( print((
"I noticed that %s is not installed. Installing it may help." % "I noticed that %s is not installed. Installing it may help." %
name) name))
if 'lxml' in basic_parsers: if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"]) basic_parsers.append("lxml-xml")
from lxml import etree try:
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) from lxml import etree
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
except ImportError as e:
print (
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers: if 'html5lib' in basic_parsers:
import html5lib try:
print "Found html5lib version %s" % html5lib.__version__ import html5lib
print("Found html5lib version %s" % html5lib.__version__)
except ImportError as e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'): if hasattr(data, 'read'):
data = data.read() data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return return
print else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()
for parser in basic_parsers: for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser print("Trying to parse your markup with %s" % parser)
success = False success = False
try: try:
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, features=parser)
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "Here's what %s did with the markup:" % parser print("Here's what %s did with the markup:" % parser)
print soup.prettify() print(soup.prettify())
print "-" * 80 print("-" * 80)
def lxml_trace(data, html=True, **kwargs): def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing. """Print out the lxml events that occur during parsing.
@@ -74,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text)) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Announces HTMLParser parse events, without doing anything else."""
@@ -135,7 +155,7 @@ def rword(length=5):
def rsentence(length=4): def rsentence(length=4):
"Generate a random sentence-like string." "Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length)) return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000): def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document.""" """Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
@@ -156,10 +176,10 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements) data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data) print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
try: try:
@@ -167,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
b = time.time() b = time.time()
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree from lxml import etree
a = time.time() a = time.time()
etree.HTML(data) etree.HTML(data)
b = time.time() b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a) print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib import html5lib
parser = html5lib.HTMLParser() parser = html5lib.HTMLParser()
a = time.time() a = time.time()
parser.parse(data) parser.parse(data)
b = time.time() b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a) print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"): def profile(num_elements=100000, parser="lxml"):

File diff suppressed because it is too large

lib/bs4/formatter.py Normal file

@@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string.
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
HTML_FORMATTERS = {}
HTML = 'html'
XML = 'xml'
HTML_DEFAULTS = dict(
cdata_containing_tags=set(["script", "style"]),
)
def _default(self, language, value, kwarg):
if value is not None:
return value
if language == self.XML:
return set()
return self.HTML_DEFAULTS[kwarg]
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
):
"""
:param void_element_close_prefix: By default, represent void
elements as <tag/> rather than <tag>
"""
self.language = language
self.entity_substitution = entity_substitution
self.void_element_close_prefix = void_element_close_prefix
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution."""
if not self.entity_substitution:
return ns
from .element import NavigableString
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in self.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return self.entity_substitution(ns)
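
Sketch of cdata_containing_tags at work: with the default HTML formatter, text inside <script> is emitted untouched rather than entity-substituted.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<script>if (x < 2) alert('&');</script>", "html.parser")
print(soup.decode())  # <script>if (x < 2) alert('&');</script>
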
def attribute_value(self, value):
"""Process the value of an attribute."""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
return sorted(tag.attrs.items())
class HTMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix = None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = XMLFormatter(
entity_substitution=None
)
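
Sketch of how the registries are consulted: the output methods accept a formatter name and look it up here. The html5 formatter drops the "/" from void elements.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<br>", "html.parser")
print(soup.decode(formatter="html"))   # <br/>
print(soup.decode(formatter="html5"))  # <br>
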


@@ -1,5 +1,10 @@
# encoding: utf-8
"""Helper classes for tests.""" """Helper classes for tests."""
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
import pickle
import copy import copy
import functools import functools
import unittest import unittest
@@ -11,29 +16,66 @@ from bs4.element import (
ContentMetaAttributeValue, ContentMetaAttributeValue,
Doctype, Doctype,
SoupStrainer, SoupStrainer,
Tag
) )
from bs4.builder import HTMLParserTreeBuilder from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
class SoupTest(unittest.TestCase): class SoupTest(unittest.TestCase):
@property @property
def default_builder(self): def default_builder(self):
return default_builder() return default_builder
def soup(self, markup, **kwargs): def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup.""" """Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder) builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs) return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup): def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document. """Turn an HTML fragment into a document.
The details depend on the builder. The details depend on the builder.
""" """
return self.default_builder.test_fragment_to_document(markup) return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None): def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder builder = self.default_builder
@@ -43,6 +85,131 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
"""
earlier = None
for e in element.descendants:
if earlier:
self.assertEqual(e, earlier.next_element)
self.assertEqual(earlier, e.previous_element)
earlier = e
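
Sketch of the invariant assertConnectedness checks: the descendants of an element form a doubly linked list in document order.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<a><b>foo</b></a>", "html.parser")
tag_b = soup.a.next_element
print(tag_b.name, repr(tag_b.next_element))  # b 'foo'
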
def linkage_validator(self, el, _recursive_call=False):
"""Ensure proper linkage throughout the document."""
descendant = None
# Document element should have no previous element or previous sibling.
# It also shouldn't have a next sibling.
if el.parent is None:
assert el.previous_element is None,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_element, None
)
assert el.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
el, el.previous_sibling, None
)
assert el.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_sibling, None
)
idx = 0
child = None
last_child = None
last_idx = len(el.contents) - 1
for child in el.contents:
descendant = None
# Parent should link next element to their first child
# That child should have no previous sibling
if idx == 0:
if el.parent is not None:
assert el.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
el, el.next_element, child
)
assert child.previous_element is el,\
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
child, child.previous_element, el
)
assert child.previous_sibling is None,\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
child, child.previous_sibling, None
)
# If not the first child, previous index should link as sibling to this index
# Previous element should match the last index or the last bubbled up descendant
else:
assert child.previous_sibling is el.contents[idx - 1],\
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
child, child.previous_sibling, el.contents[idx - 1]
)
assert el.contents[idx - 1].next_sibling is child,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
)
if last_child is not None:
assert child.previous_element is last_child,\
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
child, child.previous_element, last_child, child.parent.contents
)
assert last_child.next_element is child,\
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
last_child, last_child.next_element, child
)
if isinstance(child, Tag) and child.contents:
descendant = self.linkage_validator(child, True)
# A bubbled up descendant should have no next siblings
assert descendant.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
descendant, descendant.next_sibling, None
)
# Mark last child as either the bubbled up descendant or the current child
if descendant is not None:
last_child = descendant
else:
last_child = child
# If this is the last child, there are no next siblings
if idx == last_idx:
assert child.next_sibling is None,\
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_sibling, None
)
idx += 1
child = descendant if descendant is not None else child
if child is None:
child = el
if not _recursive_call and child is not None:
target = el
while True:
if target is None:
assert child.next_element is None, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, None
)
break
elif target.next_sibling is not None:
assert child.next_element is target.next_sibling, \
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
child, child.next_element, target.next_sibling
)
break
target = target.parent
# We are done, so nothing to return
return None
else:
# Return the child to the recursive caller
return child
class HTMLTreeBuilderSmokeTest(object): class HTMLTreeBuilderSmokeTest(object):
@@ -54,6 +221,27 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation. markup in these tests, there's not much room for interpretation.
""" """
def test_empty_element_tags(self):
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
are handled correctly.
"""
for name in [
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
'spacer', 'frame'
]:
soup = self.soup("")
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def assertDoctypeHandled(self, doctype_fragment): def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly.""" """Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment) doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +302,27 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""), soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b"")) markup.replace(b"\n", b""))
def test_namespaced_html(self):
"""When a namespaced XML document is parsed as HTML it should
be treated as HTML with weird tag names.
"""
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
soup = self.soup(markup)
self.assertEqual(2, len(soup.find_all("ns1:foo")))
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
markup = """<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_deepcopy(self): def test_deepcopy(self):
"""Make sure you can copy the tree builder. """Make sure you can copy the tree builder.
@@ -155,6 +364,23 @@ class HTMLTreeBuilderSmokeTest(object):
def test_nested_formatting_elements(self): def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>") self.assertSoupEquals("<em><em></em></em>")
def test_double_head(self):
html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
soup = self.soup(html)
self.assertEqual("text/javascript", soup.find('script')['type'])
def test_comment(self): def test_comment(self):
# Comments are represented as Comment objects. # Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>" markup = "<p>foo<!--foobar-->baz</p>"
@@ -171,9 +397,22 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertEqual(comment, baz.previous_element) self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self): def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags.""" """Whitespace must be preserved in <pre> and <textarea> tags,
self.assertSoupEquals("<pre> </pre>") even if that would mean not prettifying the markup.
self.assertSoupEquals("<textarea> woo </textarea>") """
pre_markup = "<pre> </pre>"
textarea_markup = "<textarea> woo\nwoo </textarea>"
self.assertSoupEquals(pre_markup)
self.assertSoupEquals(textarea_markup)
soup = self.soup(pre_markup)
self.assertEqual(soup.pre.prettify(), pre_markup)
soup = self.soup(textarea_markup)
self.assertEqual(soup.textarea.prettify(), textarea_markup)
soup = self.soup("<textarea></textarea>")
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
def test_nested_inline_elements(self): def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely.""" """Inline elements can be nested indefinitely."""
@@ -213,6 +452,18 @@ class HTMLTreeBuilderSmokeTest(object):
"<tbody><tr><td>Bar</td></tr></tbody>" "<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>") "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_multivalued_attribute_with_whitespace(self):
# Whitespace separating the values of a multi-valued attribute
# should be ignored.
markup = '<div class=" foo bar "></a>'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.div['class'])
# If you search by the literal name of the class it's like the whitespace
# wasn't there.
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
def test_deeply_nested_multivalued_attribute(self): def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times # html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with # as it rearranges the tree. This has caused problems with
@@ -221,18 +472,52 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class']) self.assertEqual(["css"], soup.div.div['class'])
def test_multivalued_attribute_on_html(self):
# html5lib uses a different API to set the attributes of the
# <html> tag. This has caused problems with multivalued
# attributes.
markup = '<html class="a b"></html>'
soup = self.soup(markup)
self.assertEqual(["a", "b"], soup.html['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self): def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_strings_resembling_character_entity_references(self):
# "&T" and "&p" look like incomplete character entities, but they are
# not.
self.assertSoupEquals(
"<p>&bull; AT&T is in the s&p 500</p>",
"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
)
def test_apos_entity(self):
self.assertSoupEquals(
"<p>Bob&apos;s Bar</p>",
"<p>Bob's Bar</p>",
)
def test_entities_in_foreign_document_encoding(self):
# &#147; and &#148; are invalid numeric entities referencing
# Windows-1252 characters. &#45; references a character common
# to Windows-1252 and Unicode, and &#9731; references a
# character only found in Unicode.
#
# All of these entities should be converted to Unicode
# characters.
markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
soup = self.soup(markup)
self.assertEqual("“Hello” -☃", soup.p.string)
def test_entities_in_attributes_converted_to_unicode(self): def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -243,16 +528,52 @@ class HTMLTreeBuilderSmokeTest(object):
'<p>I said "good day!"</p>') '<p>I said "good day!"</p>')
def test_out_of_range_entity(self): def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}" expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self): def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder." "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name) self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name) self.assertEqual("p", soup.p.name)
self.assertConnectedness(soup)
def test_empty_element_tags(self):
"""Verify consistent handling of empty-element tags,
no matter how they come in through the markup.
"""
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
soup = self.soup(content)
self.assertNotEqual(None, soup.html.body)
self.assertConnectedness(soup)
def test_multiple_copies_of_a_tag(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
soup = self.soup(content)
self.assertConnectedness(soup.article)
def test_basic_namespaces(self): def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the """Parsers don't need to *understand* namespaces, but at the
@@ -285,9 +606,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And # A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the # it contains characters that can't be represented in the
# encoding found in the declaration! The horror! # encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self): def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers.""" """Parsers should be able to work with SoupStrainers."""
@@ -327,7 +648,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters # Both XML and HTML entities are converted to Unicode characters
# during parsing. # during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected) self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self): def test_smart_quotes_converted_on_the_way_in(self):
@ -337,15 +658,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote) soup = self.soup(quote)
self.assertEqual( self.assertEqual(
soup.p.string, soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self): def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>") soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self): def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8") expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text) soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected) self.assertEqual(soup.p.encode("utf-8"), expected)
@ -354,7 +675,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document. # easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use # That's because we're going to encode it into ISO-Latin-1, and use
# that to test. # that to test.
@ -399,7 +720,9 @@ class HTMLTreeBuilderSmokeTest(object):
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup( soup = self.soup(
hebrew_document, from_encoding="iso8859-8") hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8') # Some tree builders call it iso8859-8, others call it iso-8859-9.
# That's not a difference we really care about.
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
self.assertEqual( self.assertEqual(
soup.encode('utf-8'), soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8")) hebrew_document.decode("iso8859-8").encode("utf-8"))
@ -461,13 +784,39 @@ class HTMLTreeBuilderSmokeTest(object):
data.a['foo'] = 'bar' data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode()) self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
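For orientation, the round-trip this test performs as a standalone sketch (illustrative, not part of the commit; assumes bs4 4.8.x with the html.parser builder):
import pickle
from bs4 import BeautifulSoup

# Build a tree, pickle it with protocol 2, and verify the copy is identical.
tree = BeautifulSoup("<a><b>foo</a>", "html.parser")
loaded = pickle.loads(pickle.dumps(tree, 2))
assert loaded.__class__ is BeautifulSoup
assert loaded.decode() == tree.decode()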
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_xml_declaration(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
@@ -480,12 +829,23 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
soup = self.soup(doc)
self.assertEqual(doc, soup.encode())
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
</script>
"""
-soup = BeautifulSoup(doc, "xml")
+soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
@@ -493,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
-markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
-self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
-unicode(soup.rss), markup)
+str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@@ -532,17 +892,57 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.p), markup)
+self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.foo), markup)
+self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.foo), markup)
+self.assertEqual(str(soup.foo), markup)
def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
xmlns:ns1="http://example.com/ns1"
xmlns:ns2="http://example.com/ns2">
<ns1:tag>foo</ns1:tag>
<ns1:tag>bar</ns1:tag>
<ns2:tag key="value">baz</ns2:tag>
</Document>
"""
soup = self.soup(doc)
# There are three <tag> tags.
self.assertEqual(3, len(soup.find_all('tag')))
# But two of them are ns1:tag and one of them is ns2:tag.
self.assertEqual(2, len(soup.find_all('ns1:tag')))
self.assertEqual(1, len(soup.find_all('ns2:tag')))
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
def test_copy_tag_preserves_namespace(self):
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""
soup = self.soup(xml)
tag = soup.document
duplicate = copy.copy(tag)
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""

lib/bs4/tests/__init__.py Normal file
@@ -0,0 +1 @@
"The beautifulsoup tests."

lib/bs4/tests/test_builder_registry.py Normal file
@@ -0,0 +1,147 @@
"""Tests of the builder registry."""
import unittest
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
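The helper above relies on Python's three-argument type() to mint classes on the fly; a standalone sketch of the same trick (illustrative only):
# type(name, bases, namespace) creates a class at runtime, so each call
# fabricates a throwaway builder that merely advertises a 'features' list.
Builder_foo = type('Builder_foo', (object,), {'features': ['foo']})
assert Builder_foo.features == ['foo']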
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
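Taken together, these tests pin down the registry contract; a compressed standalone sketch (illustrative, not part of the commit):
from bs4.builder import TreeBuilderRegistry

registry = TreeBuilderRegistry()

class FakeBuilder(object):
    # A stand-in builder; only the 'features' attribute matters for lookup.
    features = ['fast', 'html']

registry.register(FakeBuilder)
assert registry.lookup('fast', 'html') is FakeBuilder  # all requested features match
assert registry.lookup('xml') is None                  # no registered builder has 'xml'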

lib/bs4/tests/test_docs.py Normal file
@@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

lib/bs4/tests/test_html5lib.py Normal file
@@ -0,0 +1,184 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())
def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup)
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup)
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self):
"""Verify that we keep the two whitespace nodes in this
document distinct when reparenting the adjacent <tbody> tags.
"""
markup = '<table> <tbody><tbody><ims></tbody> </table>'
soup = self.soup(markup)
space1, space2 = soup.find_all(string=' ')
tbody1, tbody2 = soup.find_all('tbody')
assert space1.next_element is tbody1
assert tbody2.next_element is space2
def test_reparented_markup_containing_children(self):
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
soup = self.soup(markup)
noscript = soup.noscript
self.assertEqual("target", noscript.next_element)
target = soup.find(string='target')
# The 'aftermath' string was duplicated; we want the second one.
final_aftermath = soup.find_all(string='aftermath')[-1]
# The <noscript> tag was moved beneath a copy of the <a> tag,
# but the 'target' string within is still connected to the
# (second) 'aftermath' string.
self.assertEqual(final_aftermath, target.next_element)
self.assertEqual(target, final_aftermath.previous_element)
def test_processing_instruction(self):
"""Processing instructions become comments."""
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
assert str(soup).startswith("<!--?PITarget PIContent?-->")
def test_cloned_multivalue_node(self):
markup = b"""<a class="my_class"><p></a>"""
soup = self.soup(markup)
a1, a2 = soup.find_all('a')
self.assertEqual(a1, a2)
assert a1 is not a2
def test_foster_parenting(self):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
def test_extraction(self):
"""
Test that extraction does not destroy the tree.
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
"""
markup = """
<html><head></head>
<style>
</style><script></script><body><p>hello</p></body></html>
"""
soup = self.soup(markup)
[s.extract() for s in soup('script')]
[s.extract() for s in soup('style')]
self.assertEqual(len(soup.find_all("p")), 1)
def test_empty_comment(self):
"""
Test that empty comment does not break structure.
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
"""
markup = """
<html>
<body>
<form>
<!----><input type="text">
</form>
</body>
</html>
"""
soup = self.soup(markup)
inputs = []
for form in soup.find_all('form'):
inputs.extend(form.find_all('input'))
self.assertEqual(len(inputs), 1)
def test_tracking_line_numbers(self):
# The html5lib TreeBuilder keeps track of line number and
# position of each element.
markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
soup = self.soup(markup)
self.assertEqual(2, soup.p.sourceline)
self.assertEqual(5, soup.p.sourcepos)
self.assertEqual("sourceline", soup.p.find('sourceline').name)
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)

lib/bs4/tests/test_htmlparser.py Normal file
@@ -0,0 +1,61 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")
def test_empty_element(self):
# This verifies that any buffered data present when the parser
# finishes working is handled.
self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
def test_tracking_line_numbers(self):
# The html.parser TreeBuilder keeps track of line number and
# position of each element.
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
soup = self.soup(markup)
self.assertEqual(2, soup.p.sourceline)
self.assertEqual(3, soup.p.sourcepos)
self.assertEqual("sourceline", soup.p.find('sourceline').name)
# You can deactivate this behavior.
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")

lib/bs4/tests/test_lxml.py Normal file
@@ -0,0 +1,115 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
# a string like u'\x93'
pass
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual("<b/>", str(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from
# the original markup. Even if you ask for line numbers, we
# don't have 'em.
#
# This means that if you have a tag like <sourceline> or
# <sourcepos>, attribute access will find it rather than
# giving you a numeric answer.
soup = self.soup(
"\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
store_line_numbers=True
)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
# We should not track un-prefixed namespaces as we can only hold one
# and it will be recognized as the default namespace by soupsieve,
# which may be confusing in some situations. When no namespace is provided
# for a selector, the default namespace (if defined) is assumed.
soup = self.soup(
'<?xml version="1.1"?>\n'
'<root>'
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag>'
'</root>'
)
self.assertEqual(
soup._namespaces,
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
)

lib/bs4/tests/test_soup.py Normal file
@@ -0,0 +1,682 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
from pdb import set_trace
import logging
import unittest
import sys
import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.builder import (
TreeBuilder,
ParserRejectedMarkup,
)
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
Tag,
NavigableString,
)
import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
EncodingDetector,
)
from bs4.testing import (
default_builder,
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
data = "<h1>éé</h1>"
soup = self.soup(data)
self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self):
data = "<h1>foo\0bar</h1>"
soup = self.soup(data)
self.assertEqual("foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
utf8_data = "Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
def test_custom_builder_class(self):
# Verify that you can pass in a custom Builder class and
# it'll be instantiated with the appropriate keyword arguments.
class Mock(object):
def __init__(self, **kwargs):
self.called_with = kwargs
self.is_xml = True
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
def initialize_soup(self, soup):
pass
def feed(self, markup):
self.fed = markup
def reset(self):
pass
def ignore(self, ignore):
pass
set_up_substitutions = can_be_empty_element = ignore
def prepare_markup(self, *args, **kwargs):
yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
kwargs = dict(
var="value",
# This is a deprecated BS3-era keyword argument, which
# will be stripped out.
convertEntities=True,
)
with warnings.catch_warnings(record=True):
soup = BeautifulSoup('', builder=Mock, **kwargs)
assert isinstance(soup.builder, Mock)
self.assertEqual(dict(var="value"), soup.builder.called_with)
self.assertEqual("prepared markup", soup.builder.fed)
# You can also instantiate the TreeBuilder yourself. In this
# case, that specific object is used and any keyword arguments
# to the BeautifulSoup constructor are ignored.
builder = Mock(**kwargs)
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup(
'', builder=builder, ignored_value=True,
)
msg = str(w[0].message)
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
self.assertEqual(builder, soup.builder)
self.assertEqual(kwargs, builder.called_with)
def test_parser_markup_rejection(self):
# If markup is completely rejected by the parser, an
# explanatory ParserRejectedMarkup exception is raised.
class Mock(TreeBuilder):
def feed(self, *args, **kwargs):
raise ParserRejectedMarkup("Nope.")
def prepare_markup(self, markup, *args, **kwargs):
# We're going to try two different ways of preparing this markup,
# but feed() will reject both of them.
yield markup, None, None, False
yield markup, None, None, False
import re
self.assertRaisesRegex(
ParserRejectedMarkup,
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
BeautifulSoup, '', builder=Mock,
)
def test_cdata_list_attributes(self):
# Most attribute values are represented as scalars, but the
# HTML standard says that some attributes, like 'class' have
# space-separated lists as values.
markup = '<a id=" an id " class=" a class "></a>'
soup = self.soup(markup)
# Note that the spaces are stripped for 'class' but not for 'id'.
a = soup.a
self.assertEqual(" an id ", a['id'])
self.assertEqual(["a", "class"], a['class'])
# TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
self.assertEqual(" a class ", soup.a['class'])
# Here are two ways of saying that `id` is a multi-valued
# attribute in this context, but 'class' is not.
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
a = soup.a
self.assertEqual(["an", "id"], a['id'])
self.assertEqual(" a class ", a['class'])
def test_replacement_classes(self):
# Test the ability to pass in replacements for element classes
# which will be used when building the tree.
class TagPlus(Tag):
pass
class StringPlus(NavigableString):
pass
class CommentPlus(Comment):
pass
soup = self.soup(
"<a><b>foo</b>bar</a><!--whee-->",
element_classes = {
Tag: TagPlus,
NavigableString: StringPlus,
Comment: CommentPlus,
}
)
# The tree was built with TagPlus, StringPlus, and CommentPlus objects,
# rather than Tag, String, and Comment objects.
assert all(
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
class TestWarnings(SoupTest):
def _assert_no_parser_specified(self, s, is_there=True):
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
self.assertTrue(v)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
msg = str(w[0].message)
self.assertTrue("looks like a filename" in msg)
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual(0, len(w))
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
# Be aware this isn't the only warning that can be raised during
# execution.
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup("http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_bytes_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup("http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&amp;Aacute;T&amp;T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
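The class under test is usable directly; a few standalone calls mirroring the assertions above (illustrative only):
from bs4.dammit import EntitySubstitution

assert EntitySubstitution.substitute_html("foo\u2200bar") == "foo&forall;bar"
assert EntitySubstitution.substitute_xml("AT&T") == "AT&amp;T"
# With make_quoted_attribute=True the value is also wrapped in quotes.
assert EntitySubstitution.substitute_xml("Welcome", True) == '"Welcome"'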
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set to 'utf-8', a superset of ASCII.
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
# Disable chardet, which will realize that the ASCII is ASCII.
bs4.dammit.chardet_dammit = noop
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through fix_embedded_windows_1252, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
def test_find_declared_encoding(self):
# Test our ability to find a declared encoding inside an
# XML or HTML document.
#
# Even if the document comes in as Unicode, it may be
# interesting to know what encoding was claimed
# originally.
html_unicode = '<html><head><meta charset="utf-8"></head></html>'
html_bytes = html_unicode.encode("ascii")
xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
xml_bytes = xml_unicode.encode("ascii")
m = EncodingDetector.find_declared_encoding
self.assertEqual(None, m(html_unicode, is_html=False))
self.assertEqual("utf-8", m(html_unicode, is_html=True))
self.assertEqual("utf-8", m(html_bytes, is_html=True))
self.assertEqual("iso-8859-1", m(xml_unicode))
self.assertEqual("iso-8859-1", m(xml_bytes))
# Normally, only the first few kilobytes of a document are checked for
# an encoding.
spacer = b' ' * 5000
self.assertEqual(None, m(spacer + html_bytes))
self.assertEqual(None, m(spacer + xml_bytes))
# But you can tell find_declared_encoding to search an entire
# HTML document.
self.assertEqual(
"utf-8",
m(spacer + html_bytes, is_html=True, search_entire_document=True)
)
# The XML encoding declaration has to be the very first thing
# in the document. We'll allow whitespace before the document
# starts, but nothing else.
self.assertEqual(
"iso-8859-1",
m(xml_bytes, search_entire_document=True)
)
self.assertEqual(
None, m(b'a' + xml_bytes, search_entire_document=True)
)
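A standalone sketch of the classmethod being exercised (illustrative, mirroring the assertions above):
from bs4.dammit import EncodingDetector

m = EncodingDetector.find_declared_encoding
assert m(b'<html><head><meta charset="utf-8"></head></html>', is_html=True) == "utf-8"
assert m(b'<?xml version="1.0" encoding="ISO-8859-1" ?>') == "iso-8859-1"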
class TestNamedspacedAttribute(SoupTest):
def test_name_may_be_none_or_missing(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
a = NamespacedAttribute("xmlns")
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

lib/bs4/tests/test_tree.py Normal file
(2,254-line diff suppressed because it is too large)