mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 21:21:15 -07:00
Update bs4 to 4.8.1 (with 2to3)
This commit is contained in:
parent
23c4e5b09d
commit
f28e741ad7
19 changed files with 5487 additions and 792 deletions
|
@ -5,26 +5,30 @@ http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||||
provides provides methods and Pythonic idioms that make it easy to
|
provides methods and Pythonic idioms that make it easy to navigate,
|
||||||
navigate, search, and modify the parse tree.
|
search, and modify the parse tree.
|
||||||
|
|
||||||
Beautiful Soup works with Python 2.6 and up. It works better if lxml
|
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||||
and/or html5lib is installed.
|
and/or html5lib is installed.
|
||||||
|
|
||||||
For more than you ever wanted to know about Beautiful Soup, see the
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
documentation:
|
documentation:
|
||||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.3.2"
|
__version__ = "4.8.1"
|
||||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .builder import builder_registry, ParserRejectedMarkup
|
from .builder import builder_registry, ParserRejectedMarkup
|
||||||
|
@ -45,7 +49,7 @@ from .element import (
|
||||||
|
|
||||||
# The very first thing we do is give a useful error if someone is
|
# The very first thing we do is give a useful error if someone is
|
||||||
# running this code under Python 3 without converting it.
|
# running this code under Python 3 without converting it.
|
||||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
class BeautifulSoup(Tag):
|
class BeautifulSoup(Tag):
|
||||||
"""
|
"""
|
||||||
|
@ -59,7 +63,7 @@ class BeautifulSoup(Tag):
|
||||||
handle_starttag(name, attrs) # See note about return value
|
handle_starttag(name, attrs) # See note about return value
|
||||||
handle_endtag(name)
|
handle_endtag(name)
|
||||||
handle_data(data) # Appends to the current data node
|
handle_data(data) # Appends to the current data node
|
||||||
endData(containerClass=NavigableString) # Ends the current data node
|
endData(containerClass) # Ends the current data node
|
||||||
|
|
||||||
No matter how complicated the underlying parser is, you should be
|
No matter how complicated the underlying parser is, you should be
|
||||||
able to build a tree using 'start tag' events, 'end tag' events,
|
able to build a tree using 'start tag' events, 'end tag' events,
|
||||||
|
@ -69,7 +73,7 @@ class BeautifulSoup(Tag):
|
||||||
like HTML's <br> tag), call handle_starttag and then
|
like HTML's <br> tag), call handle_starttag and then
|
||||||
handle_endtag.
|
handle_endtag.
|
||||||
"""
|
"""
|
||||||
ROOT_TAG_NAME = u'[document]'
|
ROOT_TAG_NAME = '[document]'
|
||||||
|
|
||||||
# If the end-user gives no indication which tree builder they
|
# If the end-user gives no indication which tree builder they
|
||||||
# want, look for one with these features.
|
# want, look for one with these features.
|
||||||
|
@ -77,13 +81,62 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, **kwargs):
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
element_classes=None, **kwargs):
|
||||||
provided markup (which can be a string or a file-like object)
|
"""Constructor.
|
||||||
is fed into the underlying parser."""
|
|
||||||
|
:param markup: A string or a file-like object representing
|
||||||
|
markup to be parsed.
|
||||||
|
|
||||||
|
:param features: Desirable features of the parser to be used. This
|
||||||
|
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||||
|
"html.parser", or "html5lib") or it may be the type of markup
|
||||||
|
to be used ("html", "html5", "xml"). It's recommended that you
|
||||||
|
name a specific parser, so that Beautiful Soup gives you the
|
||||||
|
same results across platforms and virtual environments.
|
||||||
|
|
||||||
|
:param builder: A TreeBuilder subclass to instantiate (or
|
||||||
|
instance to use) instead of looking one up based on
|
||||||
|
`features`. You only need to use this if you've implemented a
|
||||||
|
custom TreeBuilder.
|
||||||
|
|
||||||
|
:param parse_only: A SoupStrainer. Only parts of the document
|
||||||
|
matching the SoupStrainer will be considered. This is useful
|
||||||
|
when parsing part of a document that would otherwise be too
|
||||||
|
large to fit into memory.
|
||||||
|
|
||||||
|
:param from_encoding: A string indicating the encoding of the
|
||||||
|
document to be parsed. Pass this in if Beautiful Soup is
|
||||||
|
guessing wrongly about the document's encoding.
|
||||||
|
|
||||||
|
:param exclude_encodings: A list of strings indicating
|
||||||
|
encodings known to be wrong. Pass this in if you don't know
|
||||||
|
the document's encoding but you know Beautiful Soup's guess is
|
||||||
|
wrong.
|
||||||
|
|
||||||
|
:param element_classes: A dictionary mapping BeautifulSoup
|
||||||
|
classes like Tag and NavigableString to other classes you'd
|
||||||
|
like to be instantiated instead as the parse tree is
|
||||||
|
built. This is useful for using subclasses to modify the
|
||||||
|
default behavior of Tag or NavigableString.
|
||||||
|
|
||||||
|
:param kwargs: For backwards compatibility purposes, the
|
||||||
|
constructor accepts certain keyword arguments used in
|
||||||
|
Beautiful Soup 3. None of these arguments do anything in
|
||||||
|
Beautiful Soup 4; they will result in a warning and then be ignored.
|
||||||
|
|
||||||
|
Apart from this, any keyword arguments passed into the BeautifulSoup
|
||||||
|
constructor are propagated to the TreeBuilder constructor. This
|
||||||
|
makes it possible to configure a TreeBuilder beyond saying
|
||||||
|
which one to use.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if 'convertEntities' in kwargs:
|
if 'convertEntities' in kwargs:
|
||||||
|
del kwargs['convertEntities']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the convertEntities argument to the "
|
"BS4 does not respect the convertEntities argument to the "
|
||||||
"BeautifulSoup constructor. Entities are always converted "
|
"BeautifulSoup constructor. Entities are always converted "
|
||||||
|
@ -114,9 +167,9 @@ class BeautifulSoup(Tag):
|
||||||
del kwargs['isHTML']
|
del kwargs['isHTML']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the isHTML argument to the "
|
"BS4 does not respect the isHTML argument to the "
|
||||||
"BeautifulSoup constructor. You can pass in features='html' "
|
"BeautifulSoup constructor. Suggest you use "
|
||||||
"or features='xml' to get a builder capable of handling "
|
"features='lxml' for HTML and features='lxml-xml' for "
|
||||||
"one or the other.")
|
"XML.")
|
||||||
|
|
||||||
def deprecated_argument(old_name, new_name):
|
def deprecated_argument(old_name, new_name):
|
||||||
if old_name in kwargs:
|
if old_name in kwargs:
|
||||||
|
@ -134,13 +187,24 @@ class BeautifulSoup(Tag):
|
||||||
from_encoding = from_encoding or deprecated_argument(
|
from_encoding = from_encoding or deprecated_argument(
|
||||||
"fromEncoding", "from_encoding")
|
"fromEncoding", "from_encoding")
|
||||||
|
|
||||||
if len(kwargs) > 0:
|
if from_encoding and isinstance(markup, str):
|
||||||
arg = kwargs.keys().pop()
|
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||||
raise TypeError(
|
from_encoding = None
|
||||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
|
||||||
|
|
||||||
if builder is None:
|
self.element_classes = element_classes or dict()
|
||||||
if isinstance(features, basestring):
|
|
||||||
|
# We need this information to track whether or not the builder
|
||||||
|
# was specified well enough that we can omit the 'you need to
|
||||||
|
# specify a parser' warning.
|
||||||
|
original_builder = builder
|
||||||
|
original_features = features
|
||||||
|
|
||||||
|
if isinstance(builder, type):
|
||||||
|
# A builder class was passed in; it needs to be instantiated.
|
||||||
|
builder_class = builder
|
||||||
|
builder = None
|
||||||
|
elif builder is None:
|
||||||
|
if isinstance(features, str):
|
||||||
features = [features]
|
features = [features]
|
||||||
if features is None or len(features) == 0:
|
if features is None or len(features) == 0:
|
||||||
features = self.DEFAULT_BUILDER_FEATURES
|
features = self.DEFAULT_BUILDER_FEATURES
|
||||||
|
@ -150,21 +214,73 @@ class BeautifulSoup(Tag):
|
||||||
"Couldn't find a tree builder with the features you "
|
"Couldn't find a tree builder with the features you "
|
||||||
"requested: %s. Do you need to install a parser library?"
|
"requested: %s. Do you need to install a parser library?"
|
||||||
% ",".join(features))
|
% ",".join(features))
|
||||||
builder = builder_class()
|
|
||||||
|
# At this point either we have a TreeBuilder instance in
|
||||||
|
# builder, or we have a builder_class that we can instantiate
|
||||||
|
# with the remaining **kwargs.
|
||||||
|
if builder is None:
|
||||||
|
builder = builder_class(**kwargs)
|
||||||
|
if not original_builder and not (
|
||||||
|
original_features == builder.NAME or
|
||||||
|
original_features in builder.ALTERNATE_NAMES
|
||||||
|
):
|
||||||
|
if builder.is_xml:
|
||||||
|
markup_type = "XML"
|
||||||
|
else:
|
||||||
|
markup_type = "HTML"
|
||||||
|
|
||||||
|
# This code adapted from warnings.py so that we get the same line
|
||||||
|
# of code as our warnings.warn() call gets, even if the answer is wrong
|
||||||
|
# (as it may be in a multithreading situation).
|
||||||
|
caller = None
|
||||||
|
try:
|
||||||
|
caller = sys._getframe(1)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
if caller:
|
||||||
|
globals = caller.f_globals
|
||||||
|
line_number = caller.f_lineno
|
||||||
|
else:
|
||||||
|
globals = sys.__dict__
|
||||||
|
line_number= 1
|
||||||
|
filename = globals.get('__file__')
|
||||||
|
if filename:
|
||||||
|
fnl = filename.lower()
|
||||||
|
if fnl.endswith((".pyc", ".pyo")):
|
||||||
|
filename = filename[:-1]
|
||||||
|
if filename:
|
||||||
|
# If there is no filename at all, the user is most likely in a REPL,
|
||||||
|
# and the warning is not necessary.
|
||||||
|
values = dict(
|
||||||
|
filename=filename,
|
||||||
|
line_number=line_number,
|
||||||
|
parser=builder.NAME,
|
||||||
|
markup_type=markup_type
|
||||||
|
)
|
||||||
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||||
|
else:
|
||||||
|
if kwargs:
|
||||||
|
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
self.builder.soup = self
|
self.known_xml = self.is_xml
|
||||||
|
self._namespaces = dict()
|
||||||
self.parse_only = parse_only
|
self.parse_only = parse_only
|
||||||
|
|
||||||
|
self.builder.initialize_soup(self)
|
||||||
|
|
||||||
if hasattr(markup, 'read'): # It's a file-type object.
|
if hasattr(markup, 'read'): # It's a file-type object.
|
||||||
markup = markup.read()
|
markup = markup.read()
|
||||||
elif len(markup) <= 256:
|
elif len(markup) <= 256 and (
|
||||||
|
(isinstance(markup, bytes) and not b'<' in markup)
|
||||||
|
or (isinstance(markup, str) and not '<' in markup)
|
||||||
|
):
|
||||||
# Print out warnings for a couple beginner problems
|
# Print out warnings for a couple beginner problems
|
||||||
# involving passing non-markup to Beautiful Soup.
|
# involving passing non-markup to Beautiful Soup.
|
||||||
# Beautiful Soup will still parse the input as markup,
|
# Beautiful Soup will still parse the input as markup,
|
||||||
# just in case that's what the user really wants.
|
# just in case that's what the user really wants.
|
||||||
if (isinstance(markup, unicode)
|
if (isinstance(markup, str)
|
||||||
and not os.path.supports_unicode_filenames):
|
and not os.path.supports_unicode_filenames):
|
||||||
possible_filename = markup.encode("utf8")
|
possible_filename = markup.encode("utf8")
|
||||||
else:
|
else:
|
||||||
|
@ -172,37 +288,93 @@ class BeautifulSoup(Tag):
|
||||||
is_file = False
|
is_file = False
|
||||||
try:
|
try:
|
||||||
is_file = os.path.exists(possible_filename)
|
is_file = os.path.exists(possible_filename)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# This is almost certainly a problem involving
|
# This is almost certainly a problem involving
|
||||||
# characters not valid in filenames on this
|
# characters not valid in filenames on this
|
||||||
# system. Just let it go.
|
# system. Just let it go.
|
||||||
pass
|
pass
|
||||||
if is_file:
|
if is_file:
|
||||||
|
if isinstance(markup, str):
|
||||||
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
'"%s" looks like a filename, not markup. You should'
|
||||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
' probably open this file and pass the filehandle into'
|
||||||
# TODO: This is ugly but I couldn't get it to work in
|
' Beautiful Soup.' % markup)
|
||||||
# Python 3 otherwise.
|
self._check_markup_is_url(markup)
|
||||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
|
||||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
|
||||||
warnings.warn(
|
|
||||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
|
||||||
|
|
||||||
|
rejections = []
|
||||||
|
success = False
|
||||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
self.contains_replacement_characters) in (
|
self.contains_replacement_characters) in (
|
||||||
self.builder.prepare_markup(markup, from_encoding)):
|
self.builder.prepare_markup(
|
||||||
|
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||||
self.reset()
|
self.reset()
|
||||||
try:
|
try:
|
||||||
self._feed()
|
self._feed()
|
||||||
|
success = True
|
||||||
break
|
break
|
||||||
except ParserRejectedMarkup:
|
except ParserRejectedMarkup as e:
|
||||||
|
rejections.append(e)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
other_exceptions = [str(e) for e in rejections]
|
||||||
|
raise ParserRejectedMarkup(
|
||||||
|
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
|
||||||
|
)
|
||||||
|
|
||||||
# Clear out the markup and remove the builder's circular
|
# Clear out the markup and remove the builder's circular
|
||||||
# reference to this object.
|
# reference to this object.
|
||||||
self.markup = None
|
self.markup = None
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
copy = type(self)(
|
||||||
|
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Although we encoded the tree to UTF-8, that may not have
|
||||||
|
# been the encoding of the original markup. Set the copy's
|
||||||
|
# .original_encoding to reflect the original object's
|
||||||
|
# .original_encoding.
|
||||||
|
copy.original_encoding = self.original_encoding
|
||||||
|
return copy
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# Frequently a tree builder can't be pickled.
|
||||||
|
d = dict(self.__dict__)
|
||||||
|
if 'builder' in d and not self.builder.picklable:
|
||||||
|
d['builder'] = None
|
||||||
|
return d
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _check_markup_is_url(markup):
|
||||||
|
"""
|
||||||
|
Check if markup looks like it's actually a url and raise a warning
|
||||||
|
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||||
|
"""
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
space = b' '
|
||||||
|
cant_start_with = (b"http:", b"https:")
|
||||||
|
elif isinstance(markup, str):
|
||||||
|
space = ' '
|
||||||
|
cant_start_with = ("http:", "https:")
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||||
|
if not space in markup:
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
decoded_markup = markup.decode('utf-8', 'replace')
|
||||||
|
else:
|
||||||
|
decoded_markup = markup
|
||||||
|
warnings.warn(
|
||||||
|
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||||
|
' HTTP client. You should probably use an HTTP client like'
|
||||||
|
' requests to get the document behind the URL, and feed'
|
||||||
|
' that document to Beautiful Soup.' % decoded_markup
|
||||||
|
)
|
||||||
|
|
||||||
def _feed(self):
|
def _feed(self):
|
||||||
# Convert the document to Unicode.
|
# Convert the document to Unicode.
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
|
@ -223,15 +395,21 @@ class BeautifulSoup(Tag):
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||||
|
sourceline=None, sourcepos=None, **kwattrs):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new tag associated with this soup."""
|
||||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
kwattrs.update(attrs)
|
||||||
|
return self.element_classes.get(Tag, Tag)(
|
||||||
|
None, self.builder, name, namespace, nsprefix, kwattrs,
|
||||||
|
sourceline=sourceline, sourcepos=sourcepos
|
||||||
|
)
|
||||||
|
|
||||||
def new_string(self, s, subclass=NavigableString):
|
def new_string(self, s, subclass=None):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
navigable = subclass(s)
|
subclass = subclass or self.element_classes.get(
|
||||||
navigable.setup()
|
NavigableString, NavigableString
|
||||||
return navigable
|
)
|
||||||
|
return subclass(s)
|
||||||
|
|
||||||
def insert_before(self, successor):
|
def insert_before(self, successor):
|
||||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||||
|
@ -250,16 +428,26 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def pushTag(self, tag):
|
def pushTag(self, tag):
|
||||||
#print "Push", tag.name
|
#print "Push", tag.name
|
||||||
if self.currentTag:
|
if self.currentTag is not None:
|
||||||
self.currentTag.contents.append(tag)
|
self.currentTag.contents.append(tag)
|
||||||
self.tagStack.append(tag)
|
self.tagStack.append(tag)
|
||||||
self.currentTag = self.tagStack[-1]
|
self.currentTag = self.tagStack[-1]
|
||||||
if tag.name in self.builder.preserve_whitespace_tags:
|
if tag.name in self.builder.preserve_whitespace_tags:
|
||||||
self.preserve_whitespace_tag_stack.append(tag)
|
self.preserve_whitespace_tag_stack.append(tag)
|
||||||
|
|
||||||
def endData(self, containerClass=NavigableString):
|
def endData(self, containerClass=None):
|
||||||
|
|
||||||
|
# Default container is NavigableString.
|
||||||
|
containerClass = containerClass or NavigableString
|
||||||
|
|
||||||
|
# The user may want us to instantiate some alias for the
|
||||||
|
# container class.
|
||||||
|
containerClass = self.element_classes.get(
|
||||||
|
containerClass, containerClass
|
||||||
|
)
|
||||||
|
|
||||||
if self.current_data:
|
if self.current_data:
|
||||||
current_data = u''.join(self.current_data)
|
current_data = ''.join(self.current_data)
|
||||||
# If whitespace is not preserved, and this string contains
|
# If whitespace is not preserved, and this string contains
|
||||||
# nothing but ASCII spaces, replace it with a single space
|
# nothing but ASCII spaces, replace it with a single space
|
||||||
# or newline.
|
# or newline.
|
||||||
|
@ -289,15 +477,72 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Add an object to the parse tree."""
|
||||||
parent = parent or self.currentTag
|
if parent is None:
|
||||||
most_recent_element = most_recent_element or self._most_recent_element
|
parent = self.currentTag
|
||||||
o.setup(parent, most_recent_element)
|
|
||||||
|
|
||||||
if most_recent_element is not None:
|
if most_recent_element is not None:
|
||||||
most_recent_element.next_element = o
|
previous_element = most_recent_element
|
||||||
|
else:
|
||||||
|
previous_element = self._most_recent_element
|
||||||
|
|
||||||
|
next_element = previous_sibling = next_sibling = None
|
||||||
|
if isinstance(o, Tag):
|
||||||
|
next_element = o.next_element
|
||||||
|
next_sibling = o.next_sibling
|
||||||
|
previous_sibling = o.previous_sibling
|
||||||
|
if previous_element is None:
|
||||||
|
previous_element = o.previous_element
|
||||||
|
|
||||||
|
fix = parent.next_element is not None
|
||||||
|
|
||||||
|
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||||
|
|
||||||
self._most_recent_element = o
|
self._most_recent_element = o
|
||||||
parent.contents.append(o)
|
parent.contents.append(o)
|
||||||
|
|
||||||
|
# Check if we are inserting into an already parsed node.
|
||||||
|
if fix:
|
||||||
|
self._linkage_fixer(parent)
|
||||||
|
|
||||||
|
def _linkage_fixer(self, el):
|
||||||
|
"""Make sure linkage of this fragment is sound."""
|
||||||
|
|
||||||
|
first = el.contents[0]
|
||||||
|
child = el.contents[-1]
|
||||||
|
descendant = child
|
||||||
|
|
||||||
|
if child is first and el.parent is not None:
|
||||||
|
# Parent should be linked to first child
|
||||||
|
el.next_element = child
|
||||||
|
# We are no longer linked to whatever this element is
|
||||||
|
prev_el = child.previous_element
|
||||||
|
if prev_el is not None and prev_el is not el:
|
||||||
|
prev_el.next_element = None
|
||||||
|
# First child should be linked to the parent, and no previous siblings.
|
||||||
|
child.previous_element = el
|
||||||
|
child.previous_sibling = None
|
||||||
|
|
||||||
|
# We have no sibling as we've been appended as the last.
|
||||||
|
child.next_sibling = None
|
||||||
|
|
||||||
|
# This index is a tag, dig deeper for a "last descendant"
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = child._last_descendant(False)
|
||||||
|
|
||||||
|
# As the final step, link last descendant. It should be linked
|
||||||
|
# to the parent's next sibling (if found), else walk up the chain
|
||||||
|
# and find a parent with a sibling. It should have no next sibling.
|
||||||
|
descendant.next_element = None
|
||||||
|
descendant.next_sibling = None
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
descendant.next_element = target.next_sibling
|
||||||
|
target.next_sibling.previous_element = child
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
instance of the given tag. If inclusivePop is false, pops the tag
|
instance of the given tag. If inclusivePop is false, pops the tag
|
||||||
|
@ -321,11 +566,12 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
return most_recently_popped
|
return most_recently_popped
|
||||||
|
|
||||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
||||||
|
sourcepos=None):
|
||||||
"""Push a start tag on to the stack.
|
"""Push a start tag on to the stack.
|
||||||
|
|
||||||
If this method returns None, the tag was rejected by the
|
If this method returns None, the tag was rejected by the
|
||||||
SoupStrainer. You should proceed as if the tag had not occured
|
SoupStrainer. You should proceed as if the tag had not occurred
|
||||||
in the document. For instance, if this was a self-closing tag,
|
in the document. For instance, if this was a self-closing tag,
|
||||||
don't call handle_endtag.
|
don't call handle_endtag.
|
||||||
"""
|
"""
|
||||||
|
@ -338,11 +584,14 @@ class BeautifulSoup(Tag):
|
||||||
or not self.parse_only.search_tag(name, attrs))):
|
or not self.parse_only.search_tag(name, attrs))):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
tag = self.element_classes.get(Tag, Tag)(
|
||||||
self.currentTag, self._most_recent_element)
|
self, self.builder, name, namespace, nsprefix, attrs,
|
||||||
|
self.currentTag, self._most_recent_element,
|
||||||
|
sourceline=sourceline, sourcepos=sourcepos
|
||||||
|
)
|
||||||
if tag is None:
|
if tag is None:
|
||||||
return tag
|
return tag
|
||||||
if self._most_recent_element:
|
if self._most_recent_element is not None:
|
||||||
self._most_recent_element.next_element = tag
|
self._most_recent_element.next_element = tag
|
||||||
self._most_recent_element = tag
|
self._most_recent_element = tag
|
||||||
self.pushTag(tag)
|
self.pushTag(tag)
|
||||||
|
@ -367,9 +616,9 @@ class BeautifulSoup(Tag):
|
||||||
encoding_part = ''
|
encoding_part = ''
|
||||||
if eventual_encoding != None:
|
if eventual_encoding != None:
|
||||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
prefix = ''
|
||||||
if not pretty_print:
|
if not pretty_print:
|
||||||
indent_level = None
|
indent_level = None
|
||||||
else:
|
else:
|
||||||
|
@ -403,4 +652,4 @@ class FeatureNotFound(ValueError):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
soup = BeautifulSoup(sys.stdin)
|
soup = BeautifulSoup(sys.stdin)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
|
@ -1,10 +1,13 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
import sys
|
import sys
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
CharsetMetaAttributeValue,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
whitespace_re
|
nonwhitespace_re
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
@ -80,20 +83,69 @@ builder_registry = TreeBuilderRegistry()
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Turn a document into a Beautiful Soup object tree."""
|
"""Turn a document into a Beautiful Soup object tree."""
|
||||||
|
|
||||||
|
NAME = "[Unknown tree builder]"
|
||||||
|
ALTERNATE_NAMES = []
|
||||||
features = []
|
features = []
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
preserve_whitespace_tags = set()
|
picklable = False
|
||||||
empty_element_tags = None # A tag will be considered an empty-element
|
empty_element_tags = None # A tag will be considered an empty-element
|
||||||
# tag when and only when it has no contents.
|
# tag when and only when it has no contents.
|
||||||
|
|
||||||
# A value for these tag/attribute combinations is a space- or
|
# A value for these tag/attribute combinations is a space- or
|
||||||
# comma-separated list of CDATA, rather than a single CDATA.
|
# comma-separated list of CDATA, rather than a single CDATA.
|
||||||
cdata_list_attributes = {}
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||||
|
|
||||||
def __init__(self):
|
USE_DEFAULT = object()
|
||||||
|
|
||||||
|
# Most parsers don't keep track of line numbers.
|
||||||
|
TRACKS_LINE_NUMBERS = False
|
||||||
|
|
||||||
|
def __init__(self, multi_valued_attributes=USE_DEFAULT,
|
||||||
|
preserve_whitespace_tags=USE_DEFAULT,
|
||||||
|
store_line_numbers=USE_DEFAULT):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
:param multi_valued_attributes: If this is set to None, the
|
||||||
|
TreeBuilder will not turn any values for attributes like
|
||||||
|
'class' into lists. Setting this do a dictionary will
|
||||||
|
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
for an example.
|
||||||
|
|
||||||
|
Internally, these are called "CDATA list attributes", but that
|
||||||
|
probably doesn't make sense to an end-user, so the argument name
|
||||||
|
is `multi_valued_attributes`.
|
||||||
|
|
||||||
|
:param preserve_whitespace_tags: A list of tags to treat
|
||||||
|
the way <pre> tags are treated in HTML. Tags in this list
|
||||||
|
will have
|
||||||
|
|
||||||
|
:param store_line_numbers: If the parser keeps track of the
|
||||||
|
line numbers and positions of the original markup, that
|
||||||
|
information will, by default, be stored in each corresponding
|
||||||
|
`Tag` object. You can turn this off by passing
|
||||||
|
store_line_numbers=False. If the parser you're using doesn't
|
||||||
|
keep track of this information, then setting store_line_numbers=True
|
||||||
|
will do nothing.
|
||||||
|
"""
|
||||||
self.soup = None
|
self.soup = None
|
||||||
|
if multi_valued_attributes is self.USE_DEFAULT:
|
||||||
|
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
self.cdata_list_attributes = multi_valued_attributes
|
||||||
|
if preserve_whitespace_tags is self.USE_DEFAULT:
|
||||||
|
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
|
||||||
|
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||||
|
if store_line_numbers == self.USE_DEFAULT:
|
||||||
|
store_line_numbers = self.TRACKS_LINE_NUMBERS
|
||||||
|
self.store_line_numbers = store_line_numbers
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""The BeautifulSoup object has been initialized and is now
|
||||||
|
being associated with the TreeBuilder.
|
||||||
|
"""
|
||||||
|
self.soup = soup
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
pass
|
pass
|
||||||
|
@ -123,8 +175,8 @@ class TreeBuilder(object):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
return markup, None, None, False
|
yield markup, None, None, False
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""Wrap an HTML fragment to make it look like a document.
|
"""Wrap an HTML fragment to make it look like a document.
|
||||||
|
@ -153,14 +205,14 @@ class TreeBuilder(object):
|
||||||
universal = self.cdata_list_attributes.get('*', [])
|
universal = self.cdata_list_attributes.get('*', [])
|
||||||
tag_specific = self.cdata_list_attributes.get(
|
tag_specific = self.cdata_list_attributes.get(
|
||||||
tag_name.lower(), None)
|
tag_name.lower(), None)
|
||||||
for attr in attrs.keys():
|
for attr in list(attrs.keys()):
|
||||||
if attr in universal or (tag_specific and attr in tag_specific):
|
if attr in universal or (tag_specific and attr in tag_specific):
|
||||||
# We have a "class"-type attribute whose string
|
# We have a "class"-type attribute whose string
|
||||||
# value is a whitespace-separated list of
|
# value is a whitespace-separated list of
|
||||||
# values. Split it into a list.
|
# values. Split it into a list.
|
||||||
value = attrs[attr]
|
value = attrs[attr]
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
values = whitespace_re.split(value)
|
values = nonwhitespace_re.findall(value)
|
||||||
else:
|
else:
|
||||||
# html5lib sometimes calls setAttributes twice
|
# html5lib sometimes calls setAttributes twice
|
||||||
# for the same tag when rearranging the parse
|
# for the same tag when rearranging the parse
|
||||||
|
@ -224,9 +276,19 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
Such as which tags are empty-element tags.
|
Such as which tags are empty-element tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = set(['pre', 'textarea'])
|
empty_element_tags = set([
|
||||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
# These are from HTML5.
|
||||||
'spacer', 'link', 'frame', 'base'])
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
|
||||||
|
# These are from earlier versions of HTML and are removed in HTML5.
|
||||||
|
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||||
|
])
|
||||||
|
|
||||||
|
# The HTML standard defines these as block-level elements. Beautiful
|
||||||
|
# Soup does not treat these elements differently from other elements,
|
||||||
|
# but it may do so eventually, and this information is available if
|
||||||
|
# you need to use it.
|
||||||
|
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
|
@ -235,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
# encounter one of these attributes, we will parse its value into
|
# encounter one of these attributes, we will parse its value into
|
||||||
# a list of values if possible. Upon output, the list will be
|
# a list of values if possible. Upon output, the list will be
|
||||||
# converted back into a string.
|
# converted back into a string.
|
||||||
cdata_list_attributes = {
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {
|
||||||
"*" : ['class', 'accesskey', 'dropzone'],
|
"*" : ['class', 'accesskey', 'dropzone'],
|
||||||
"a" : ['rel', 'rev'],
|
"a" : ['rel', 'rev'],
|
||||||
"link" : ['rel', 'rev'],
|
"link" : ['rel', 'rev'],
|
||||||
|
@ -252,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
"output" : ["for"],
|
"output" : ["for"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||||
|
|
||||||
def set_up_substitutions(self, tag):
|
def set_up_substitutions(self, tag):
|
||||||
# We are only interested in <meta> tags
|
# We are only interested in <meta> tags
|
||||||
if tag.name != 'meta':
|
if tag.name != 'meta':
|
||||||
|
@ -299,7 +363,14 @@ def register_treebuilders_from(module):
|
||||||
this_module.builder_registry.register(obj)
|
this_module.builder_registry.register(obj)
|
||||||
|
|
||||||
class ParserRejectedMarkup(Exception):
|
class ParserRejectedMarkup(Exception):
|
||||||
pass
|
def __init__(self, message_or_exception):
|
||||||
|
"""Explain why the parser rejected the given markup, either
|
||||||
|
with a textual explanation or another exception.
|
||||||
|
"""
|
||||||
|
if isinstance(message_or_exception, Exception):
|
||||||
|
e = message_or_exception
|
||||||
|
message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
|
||||||
|
super(ParserRejectedMarkup, self).__init__(message_or_exception)
|
||||||
|
|
||||||
# Builders are registered in reverse order of priority, so that custom
|
# Builders are registered in reverse order of priority, so that custom
|
||||||
# builder registrations will take precedence. In general, we want lxml
|
# builder registrations will take precedence. In general, we want lxml
|
||||||
|
|
|
@ -1,17 +1,27 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTML5TreeBuilder',
|
'HTML5TreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
import re
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
PERMISSIVE,
|
PERMISSIVE,
|
||||||
HTML,
|
HTML,
|
||||||
HTML_5,
|
HTML_5,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
)
|
)
|
||||||
from bs4.element import NamespacedAttribute
|
from bs4.element import (
|
||||||
|
NamespacedAttribute,
|
||||||
|
nonwhitespace_re,
|
||||||
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import (
|
||||||
|
namespaces,
|
||||||
|
prefixes,
|
||||||
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -19,14 +29,36 @@ from bs4.element import (
|
||||||
Tag,
|
Tag,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Pre-0.99999999
|
||||||
|
from html5lib.treebuilders import _base as treebuilder_base
|
||||||
|
new_html5lib = False
|
||||||
|
except ImportError as e:
|
||||||
|
# 0.99999999 and up
|
||||||
|
from html5lib.treebuilders import base as treebuilder_base
|
||||||
|
new_html5lib = True
|
||||||
|
|
||||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
"""Use html5lib to build a tree."""
|
"""Use html5lib to build a tree."""
|
||||||
|
|
||||||
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
|
NAME = "html5lib"
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding):
|
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||||
|
|
||||||
|
# html5lib can tell us which line number and position in the
|
||||||
|
# original file is the source of an element.
|
||||||
|
TRACKS_LINE_NUMBERS = True
|
||||||
|
|
||||||
|
def prepare_markup(self, markup, user_specified_encoding,
|
||||||
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
# Store the user-specified encoding for use later on.
|
# Store the user-specified encoding for use later on.
|
||||||
self.user_specified_encoding = user_specified_encoding
|
self.user_specified_encoding = user_specified_encoding
|
||||||
|
|
||||||
|
# document_declared_encoding and exclude_encodings aren't used
|
||||||
|
# ATM because the html5lib TreeBuilder doesn't use
|
||||||
|
# UnicodeDammit.
|
||||||
|
if exclude_encodings:
|
||||||
|
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
|
|
||||||
# These methods are defined by Beautiful Soup.
|
# These methods are defined by Beautiful Soup.
|
||||||
|
@ -34,32 +66,63 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
if self.soup.parse_only is not None:
|
if self.soup.parse_only is not None:
|
||||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||||
doc = parser.parse(markup, encoding=self.user_specified_encoding)
|
self.underlying_builder.parser = parser
|
||||||
|
extra_kwargs = dict()
|
||||||
|
if not isinstance(markup, str):
|
||||||
|
if new_html5lib:
|
||||||
|
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||||
|
else:
|
||||||
|
extra_kwargs['encoding'] = self.user_specified_encoding
|
||||||
|
doc = parser.parse(markup, **extra_kwargs)
|
||||||
|
|
||||||
# Set the character encoding detected by the tokenizer.
|
# Set the character encoding detected by the tokenizer.
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
# We need to special-case this because html5lib sets
|
# We need to special-case this because html5lib sets
|
||||||
# charEncoding to UTF-8 if it gets Unicode input.
|
# charEncoding to UTF-8 if it gets Unicode input.
|
||||||
doc.original_encoding = None
|
doc.original_encoding = None
|
||||||
else:
|
else:
|
||||||
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
|
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||||
|
if not isinstance(original_encoding, str):
|
||||||
|
# In 0.99999999 and up, the encoding is an html5lib
|
||||||
|
# Encoding object. We want to use a string for compatibility
|
||||||
|
# with other tree builders.
|
||||||
|
original_encoding = original_encoding.name
|
||||||
|
doc.original_encoding = original_encoding
|
||||||
|
self.underlying_builder.parser = None
|
||||||
|
|
||||||
def create_treebuilder(self, namespaceHTMLElements):
|
def create_treebuilder(self, namespaceHTMLElements):
|
||||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||||
self.soup, namespaceHTMLElements)
|
namespaceHTMLElements, self.soup,
|
||||||
|
store_line_numbers=self.store_line_numbers
|
||||||
|
)
|
||||||
return self.underlying_builder
|
return self.underlying_builder
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
return '<html><head></head><body>%s</body></html>' % fragment
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, soup, namespaceHTMLElements):
|
def __init__(self, namespaceHTMLElements, soup=None,
|
||||||
|
store_line_numbers=True, **kwargs):
|
||||||
|
if soup:
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
else:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
# TODO: Why is the parser 'html.parser' here? To avoid an
|
||||||
|
# infinite loop?
|
||||||
|
self.soup = BeautifulSoup(
|
||||||
|
"", "html.parser", store_line_numbers=store_line_numbers,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||||
|
|
||||||
|
# This will be set later to an html5lib.html5parser.HTMLParser
|
||||||
|
# object, which we can use to track the current line number.
|
||||||
|
self.parser = None
|
||||||
|
self.store_line_numbers = store_line_numbers
|
||||||
|
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
self.soup.reset()
|
self.soup.reset()
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
@ -73,14 +136,26 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
self.soup.object_was_parsed(doctype)
|
self.soup.object_was_parsed(doctype)
|
||||||
|
|
||||||
def elementClass(self, name, namespace):
|
def elementClass(self, name, namespace):
|
||||||
tag = self.soup.new_tag(name, namespace)
|
kwargs = {}
|
||||||
|
if self.parser and self.store_line_numbers:
|
||||||
|
# This represents the point immediately after the end of the
|
||||||
|
# tag. We don't know when the tag started, but we do know
|
||||||
|
# where it ended -- the character just before this one.
|
||||||
|
sourceline, sourcepos = self.parser.tokenizer.stream.position()
|
||||||
|
kwargs['sourceline'] = sourceline
|
||||||
|
kwargs['sourcepos'] = sourcepos-1
|
||||||
|
tag = self.soup.new_tag(name, namespace, **kwargs)
|
||||||
|
|
||||||
return Element(tag, self.soup, namespace)
|
return Element(tag, self.soup, namespace)
|
||||||
|
|
||||||
def commentClass(self, data):
|
def commentClass(self, data):
|
||||||
return TextNode(Comment(data), self.soup)
|
return TextNode(Comment(data), self.soup)
|
||||||
|
|
||||||
def fragmentClass(self):
|
def fragmentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
from bs4 import BeautifulSoup
|
||||||
|
# TODO: Why is the parser 'html.parser' here? To avoid an
|
||||||
|
# infinite loop?
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
self.soup.name = "[document_fragment]"
|
self.soup.name = "[document_fragment]"
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
|
@ -92,7 +167,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
|
||||||
return self.soup
|
return self.soup
|
||||||
|
|
||||||
def getFragment(self):
|
def getFragment(self):
|
||||||
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
|
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
rv = []
|
||||||
|
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||||
|
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if isinstance(element, BeautifulSoup):
|
||||||
|
pass
|
||||||
|
if isinstance(element, Doctype):
|
||||||
|
m = doctype_re.match(element)
|
||||||
|
if m:
|
||||||
|
name = m.group(1)
|
||||||
|
if m.lastindex > 1:
|
||||||
|
publicId = m.group(2) or ""
|
||||||
|
systemId = m.group(3) or m.group(4) or ""
|
||||||
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||||
|
(' ' * indent, name, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||||
|
elif isinstance(element, Comment):
|
||||||
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||||
|
elif isinstance(element, NavigableString):
|
||||||
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
|
else:
|
||||||
|
if element.namespace:
|
||||||
|
name = "%s %s" % (prefixes[element.namespace],
|
||||||
|
element.name)
|
||||||
|
else:
|
||||||
|
name = element.name
|
||||||
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
|
if element.attrs:
|
||||||
|
attributes = []
|
||||||
|
for name, value in list(element.attrs.items()):
|
||||||
|
if isinstance(name, NamespacedAttribute):
|
||||||
|
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||||
|
if isinstance(value, list):
|
||||||
|
value = " ".join(value)
|
||||||
|
attributes.append((name, value))
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.children:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
class AttrList(object):
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
|
@ -101,7 +226,16 @@ class AttrList(object):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return list(self.attrs.items()).__iter__()
|
return list(self.attrs.items()).__iter__()
|
||||||
def __setitem__(self, name, value):
|
def __setitem__(self, name, value):
|
||||||
"set attr", name, value
|
# If this attribute is a multi-valued attribute for this element,
|
||||||
|
# turn its value into a list.
|
||||||
|
list_attr = self.element.cdata_list_attributes
|
||||||
|
if (name in list_attr['*']
|
||||||
|
or (self.element.name in list_attr
|
||||||
|
and name in list_attr[self.element.name])):
|
||||||
|
# A node that is being cloned may have already undergone
|
||||||
|
# this procedure.
|
||||||
|
if not isinstance(value, list):
|
||||||
|
value = nonwhitespace_re.findall(value)
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
def items(self):
|
def items(self):
|
||||||
return list(self.attrs.items())
|
return list(self.attrs.items())
|
||||||
|
@ -115,16 +249,16 @@ class AttrList(object):
|
||||||
return name in list(self.attrs.keys())
|
return name in list(self.attrs.keys())
|
||||||
|
|
||||||
|
|
||||||
class Element(html5lib.treebuilders._base.Node):
|
class Element(treebuilder_base.Node):
|
||||||
def __init__(self, element, soup, namespace):
|
def __init__(self, element, soup, namespace):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, element.name)
|
treebuilder_base.Node.__init__(self, element.name)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
self.namespace = namespace
|
self.namespace = namespace
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
string_child = child = None
|
string_child = child = None
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Some other piece of code decided to pass in a string
|
# Some other piece of code decided to pass in a string
|
||||||
# instead of creating a TextElement object to contain the
|
# instead of creating a TextElement object to contain the
|
||||||
# string.
|
# string.
|
||||||
|
@ -136,13 +270,15 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
child = node
|
child = node
|
||||||
elif node.element.__class__ == NavigableString:
|
elif node.element.__class__ == NavigableString:
|
||||||
string_child = child = node.element
|
string_child = child = node.element
|
||||||
|
node.parent = self
|
||||||
else:
|
else:
|
||||||
child = node.element
|
child = node.element
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
if not isinstance(child, basestring) and child.parent is not None:
|
if not isinstance(child, str) and child.parent is not None:
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
|
|
||||||
if (string_child and self.element.contents
|
if (string_child is not None and self.element.contents
|
||||||
and self.element.contents[-1].__class__ == NavigableString):
|
and self.element.contents[-1].__class__ == NavigableString):
|
||||||
# We are appending a string onto another string.
|
# We are appending a string onto another string.
|
||||||
# TODO This has O(n^2) performance, for input like
|
# TODO This has O(n^2) performance, for input like
|
||||||
|
@ -152,7 +288,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
old_element.replace_with(new_element)
|
old_element.replace_with(new_element)
|
||||||
self.soup._most_recent_element = new_element
|
self.soup._most_recent_element = new_element
|
||||||
else:
|
else:
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Create a brand new NavigableString from this string.
|
# Create a brand new NavigableString from this string.
|
||||||
child = self.soup.new_string(node)
|
child = self.soup.new_string(node)
|
||||||
|
|
||||||
|
@ -161,6 +297,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
# immediately after the parent, if it has no children.)
|
# immediately after the parent, if it has no children.)
|
||||||
if self.element.contents:
|
if self.element.contents:
|
||||||
most_recent_element = self.element._last_descendant(False)
|
most_recent_element = self.element._last_descendant(False)
|
||||||
|
elif self.element.next_element is not None:
|
||||||
|
# Something from further ahead in the parse tree is
|
||||||
|
# being inserted into this earlier element. This is
|
||||||
|
# very annoying because it means an expensive search
|
||||||
|
# for the last element in the tree.
|
||||||
|
most_recent_element = self.soup._last_descendant()
|
||||||
else:
|
else:
|
||||||
most_recent_element = self.element
|
most_recent_element = self.element
|
||||||
|
|
||||||
|
@ -169,9 +311,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
most_recent_element=most_recent_element)
|
most_recent_element=most_recent_element)
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
|
if isinstance(self.element, Comment):
|
||||||
|
return {}
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
|
||||||
if attributes is not None and len(attributes) > 0:
|
if attributes is not None and len(attributes) > 0:
|
||||||
|
|
||||||
converted_attributes = []
|
converted_attributes = []
|
||||||
|
@ -183,7 +328,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
self.soup.builder._replace_cdata_list_attribute_values(
|
self.soup.builder._replace_cdata_list_attribute_values(
|
||||||
self.name, attributes)
|
self.name, attributes)
|
||||||
for name, value in attributes.items():
|
for name, value in list(attributes.items()):
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
|
|
||||||
# The attributes may contain variables that need substitution.
|
# The attributes may contain variables that need substitution.
|
||||||
|
@ -195,11 +340,11 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
attributes = property(getAttributes, setAttributes)
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
if insertBefore:
|
|
||||||
text = TextNode(self.soup.new_string(data), self.soup)
|
text = TextNode(self.soup.new_string(data), self.soup)
|
||||||
self.insertBefore(data, insertBefore)
|
if insertBefore:
|
||||||
|
self.insertBefore(text, insertBefore)
|
||||||
else:
|
else:
|
||||||
self.appendChild(data)
|
self.appendChild(text)
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self.element.index(refNode.element)
|
index = self.element.index(refNode.element)
|
||||||
|
@ -218,6 +363,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
def reparentChildren(self, new_parent):
|
||||||
"""Move all of this tag's children into another tag."""
|
"""Move all of this tag's children into another tag."""
|
||||||
|
# print "MOVE", self.element.contents
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent.element
|
||||||
|
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
# Determine what this tag's next_element will be once all the children
|
# Determine what this tag's next_element will be once all the children
|
||||||
|
@ -236,18 +385,35 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||||
|
|
||||||
to_append = element.contents
|
to_append = element.contents
|
||||||
append_after = new_parent.element.contents
|
|
||||||
if len(to_append) > 0:
|
if len(to_append) > 0:
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
first_child = to_append[0]
|
first_child = to_append[0]
|
||||||
|
if new_parents_last_descendant is not None:
|
||||||
first_child.previous_element = new_parents_last_descendant
|
first_child.previous_element = new_parents_last_descendant
|
||||||
|
else:
|
||||||
|
first_child.previous_element = new_parent_element
|
||||||
first_child.previous_sibling = new_parents_last_child
|
first_child.previous_sibling = new_parents_last_child
|
||||||
|
if new_parents_last_descendant is not None:
|
||||||
|
new_parents_last_descendant.next_element = first_child
|
||||||
|
else:
|
||||||
|
new_parent_element.next_element = first_child
|
||||||
|
if new_parents_last_child is not None:
|
||||||
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Fix the last child's next_element and next_sibling
|
# Find the very last element being moved. It is now the
|
||||||
last_child = to_append[-1]
|
# parent's last descendant. It has no .next_sibling and
|
||||||
last_child.next_element = new_parents_last_descendant_next_element
|
# its .next_element is whatever the previous last
|
||||||
last_child.next_sibling = None
|
# descendant had.
|
||||||
|
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||||
|
|
||||||
|
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||||
|
if new_parents_last_descendant_next_element is not None:
|
||||||
|
# TODO: This code has no test coverage and I'm not sure
|
||||||
|
# how to get html5lib to go through this path, but it's
|
||||||
|
# just the other side of the previous line.
|
||||||
|
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||||
|
last_childs_last_descendant.next_sibling = None
|
||||||
|
|
||||||
for child in to_append:
|
for child in to_append:
|
||||||
child.parent = new_parent_element
|
child.parent = new_parent_element
|
||||||
|
@ -257,6 +423,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
element.contents = []
|
element.contents = []
|
||||||
element.next_element = final_next_element
|
element.next_element = final_next_element
|
||||||
|
|
||||||
|
# print "DONE WITH MOVE"
|
||||||
|
# print "FROM", self.element
|
||||||
|
# print "TO", new_parent_element
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||||
node = Element(tag, self.soup, self.namespace)
|
node = Element(tag, self.soup, self.namespace)
|
||||||
|
@ -268,7 +438,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
return self.element.contents
|
return self.element.contents
|
||||||
|
|
||||||
def getNameTuple(self):
|
def getNameTuple(self):
|
||||||
if self.namespace is None:
|
if self.namespace == None:
|
||||||
return namespaces["html"], self.name
|
return namespaces["html"], self.name
|
||||||
else:
|
else:
|
||||||
return self.namespace, self.name
|
return self.namespace, self.name
|
||||||
|
@ -277,7 +447,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
|
|
||||||
class TextNode(Element):
|
class TextNode(Element):
|
||||||
def __init__(self, element, soup):
|
def __init__(self, element, soup):
|
||||||
html5lib.treebuilders._base.Node.__init__(self, None)
|
treebuilder_base.Node.__init__(self, None)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,23 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from HTMLParser import (
|
from html.parser import HTMLParser
|
||||||
HTMLParser,
|
|
||||||
HTMLParseError,
|
try:
|
||||||
)
|
from html.parser import HTMLParseError
|
||||||
|
except ImportError as e:
|
||||||
|
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||||
|
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||||
|
class HTMLParseError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
@ -19,10 +29,10 @@ import warnings
|
||||||
# At the end of this file, we monkeypatch HTMLParser so that
|
# At the end of this file, we monkeypatch HTMLParser so that
|
||||||
# strict=True works well on Python 3.2.2.
|
# strict=True works well on Python 3.2.2.
|
||||||
major, minor, release = sys.version_info[:3]
|
major, minor, release = sys.version_info[:3]
|
||||||
CONSTRUCTOR_TAKES_STRICT = (
|
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||||
major > 3
|
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||||
or (major == 3 and minor > 2)
|
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||||
or (major == 3 and minor == 2 and release >= 3))
|
|
||||||
|
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CData,
|
CData,
|
||||||
|
@ -43,7 +53,42 @@ from bs4.builder import (
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
def handle_starttag(self, name, attrs):
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
|
# Keep a list of empty-element tags that were encountered
|
||||||
|
# without an explicit closing tag. If we encounter a closing tag
|
||||||
|
# of this type, we'll associate it with one of those entries.
|
||||||
|
#
|
||||||
|
# This isn't a stack because we don't care about the
|
||||||
|
# order. It's a list of closing tags we've already handled and
|
||||||
|
# will ignore, assuming they ever show up.
|
||||||
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
|
def error(self, msg):
|
||||||
|
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
||||||
|
requirement doesn't appear to be documented.
|
||||||
|
|
||||||
|
In Python 2, HTMLParser implements error() as raising an exception.
|
||||||
|
|
||||||
|
In any event, this method is called only on very strange markup and our best strategy
|
||||||
|
is to pretend it didn't happen and keep going.
|
||||||
|
"""
|
||||||
|
warnings.warn(msg)
|
||||||
|
|
||||||
|
def handle_startendtag(self, name, attrs):
|
||||||
|
# This is only called when the markup looks like
|
||||||
|
# <tag/>.
|
||||||
|
|
||||||
|
# is_startend() tells handle_starttag not to close the tag
|
||||||
|
# just because its name matches a known empty-element tag. We
|
||||||
|
# know that this is an empty-element tag and we want to call
|
||||||
|
# handle_endtag ourselves.
|
||||||
|
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||||
|
self.handle_endtag(name)
|
||||||
|
|
||||||
|
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||||
# XXX namespace
|
# XXX namespace
|
||||||
attr_dict = {}
|
attr_dict = {}
|
||||||
for key, value in attrs:
|
for key, value in attrs:
|
||||||
|
@ -53,9 +98,37 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
value = ''
|
value = ''
|
||||||
attr_dict[key] = value
|
attr_dict[key] = value
|
||||||
attrvalue = '""'
|
attrvalue = '""'
|
||||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
#print "START", name
|
||||||
|
sourceline, sourcepos = self.getpos()
|
||||||
|
tag = self.soup.handle_starttag(
|
||||||
|
name, None, None, attr_dict, sourceline=sourceline,
|
||||||
|
sourcepos=sourcepos
|
||||||
|
)
|
||||||
|
if tag and tag.is_empty_element and handle_empty_element:
|
||||||
|
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||||
|
# events for empty-element tags. (It's handled in
|
||||||
|
# handle_startendtag, but only if the original markup looked like
|
||||||
|
# <tag/>.)
|
||||||
|
#
|
||||||
|
# So we need to call handle_endtag() ourselves. Since we
|
||||||
|
# know the start event is identical to the end event, we
|
||||||
|
# don't want handle_endtag() to cross off any previous end
|
||||||
|
# events for tags of this name.
|
||||||
|
self.handle_endtag(name, check_already_closed=False)
|
||||||
|
|
||||||
def handle_endtag(self, name):
|
# But we might encounter an explicit closing tag for this tag
|
||||||
|
# later on. If so, we want to ignore it.
|
||||||
|
self.already_closed_empty_element.append(name)
|
||||||
|
|
||||||
|
def handle_endtag(self, name, check_already_closed=True):
|
||||||
|
#print "END", name
|
||||||
|
if check_already_closed and name in self.already_closed_empty_element:
|
||||||
|
# This is a redundant end tag for an empty-element tag.
|
||||||
|
# We've already called handle_endtag() for it, so just
|
||||||
|
# check it off the list.
|
||||||
|
# print "ALREADY CLOSED", name
|
||||||
|
self.already_closed_empty_element.remove(name)
|
||||||
|
else:
|
||||||
self.soup.handle_endtag(name)
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
@ -63,7 +136,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||||
# it's fixed.
|
# it's fixed in all supported versions.
|
||||||
|
# http://bugs.python.org/issue13633
|
||||||
if name.startswith('x'):
|
if name.startswith('x'):
|
||||||
real_name = int(name.lstrip('x'), 16)
|
real_name = int(name.lstrip('x'), 16)
|
||||||
elif name.startswith('X'):
|
elif name.startswith('X'):
|
||||||
|
@ -71,11 +145,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
else:
|
else:
|
||||||
real_name = int(name)
|
real_name = int(name)
|
||||||
|
|
||||||
|
data = None
|
||||||
|
if real_name < 256:
|
||||||
|
# HTML numeric entities are supposed to reference Unicode
|
||||||
|
# code points, but sometimes they reference code points in
|
||||||
|
# some other encoding (ahem, Windows-1252). E.g. “
|
||||||
|
# instead of É for LEFT DOUBLE QUOTATION MARK. This
|
||||||
|
# code tries to detect this situation and compensate.
|
||||||
|
for encoding in (self.soup.original_encoding, 'windows-1252'):
|
||||||
|
if not encoding:
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
data = unichr(real_name)
|
data = bytearray([real_name]).decode(encoding)
|
||||||
except (ValueError, OverflowError), e:
|
except UnicodeDecodeError as e:
|
||||||
data = u"\N{REPLACEMENT CHARACTER}"
|
pass
|
||||||
|
if not data:
|
||||||
|
try:
|
||||||
|
data = chr(real_name)
|
||||||
|
except (ValueError, OverflowError) as e:
|
||||||
|
pass
|
||||||
|
data = data or "\N{REPLACEMENT CHARACTER}"
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
@ -83,7 +172,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
if character is not None:
|
if character is not None:
|
||||||
data = character
|
data = character
|
||||||
else:
|
else:
|
||||||
data = "&%s;" % name
|
# If this were XML, it would be ambiguous whether "&foo"
|
||||||
|
# was an character entity reference with a missing
|
||||||
|
# semicolon or the literal string "&foo". Since this is
|
||||||
|
# HTML, we have a complete list of all character entity references,
|
||||||
|
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||||
|
data = "&%s" % name
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
|
@ -113,14 +207,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def handle_pi(self, data):
|
def handle_pi(self, data):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
if data.endswith("?") and data.lower().startswith("xml"):
|
|
||||||
# "An XHTML processing instruction using the trailing '?'
|
|
||||||
# will cause the '?' to be included in data." - HTMLParser
|
|
||||||
# docs.
|
|
||||||
#
|
|
||||||
# Strip the question mark so we don't end up with two
|
|
||||||
# question marks.
|
|
||||||
data = data[:-1]
|
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(ProcessingInstruction)
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
|
@ -128,26 +214,38 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
features = [HTML, STRICT, HTMLPARSER]
|
picklable = True
|
||||||
|
NAME = HTMLPARSER
|
||||||
|
features = [NAME, HTML, STRICT]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
# The html.parser knows which line number and position in the
|
||||||
if CONSTRUCTOR_TAKES_STRICT:
|
# original file is the source of an element.
|
||||||
kwargs['strict'] = False
|
TRACKS_LINE_NUMBERS = True
|
||||||
self.parser_args = (args, kwargs)
|
|
||||||
|
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||||
|
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||||
|
parser_args = parser_args or []
|
||||||
|
parser_kwargs = parser_kwargs or {}
|
||||||
|
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||||
|
parser_kwargs['strict'] = False
|
||||||
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
|
parser_kwargs['convert_charrefs'] = False
|
||||||
|
self.parser_args = (parser_args, parser_kwargs)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
"""
|
"""
|
||||||
:return: A 4-tuple (markup, original encoding, encoding
|
:return: A 4-tuple (markup, original encoding, encoding
|
||||||
declared within markup, whether any characters had to be
|
declared within markup, whether any characters had to be
|
||||||
replaced with REPLACEMENT CHARACTER).
|
replaced with REPLACEMENT CHARACTER).
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
return
|
return
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||||
|
exclude_encodings=exclude_encodings)
|
||||||
yield (dammit.markup, dammit.original_encoding,
|
yield (dammit.markup, dammit.original_encoding,
|
||||||
dammit.declared_html_encoding,
|
dammit.declared_html_encoding,
|
||||||
dammit.contains_replacement_characters)
|
dammit.contains_replacement_characters)
|
||||||
|
@ -158,10 +256,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
try:
|
try:
|
||||||
parser.feed(markup)
|
parser.feed(markup)
|
||||||
except HTMLParseError, e:
|
parser.close()
|
||||||
|
except HTMLParseError as e:
|
||||||
warnings.warn(RuntimeWarning(
|
warnings.warn(RuntimeWarning(
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
raise e
|
raise e
|
||||||
|
parser.already_closed_empty_element = []
|
||||||
|
|
||||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||||
|
|
|
@ -1,13 +1,26 @@
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'LXMLTreeBuilderForXML',
|
'LXMLTreeBuilderForXML',
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections.abc import Callable # Python 3.6
|
||||||
|
except ImportError as e:
|
||||||
|
from collections import Callable
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
import collections
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
from bs4.element import (
|
||||||
|
Comment,
|
||||||
|
Doctype,
|
||||||
|
NamespacedAttribute,
|
||||||
|
ProcessingInstruction,
|
||||||
|
XMLProcessingInstruction,
|
||||||
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
FAST,
|
FAST,
|
||||||
HTML,
|
HTML,
|
||||||
|
@ -20,19 +33,55 @@ from bs4.dammit import EncodingDetector
|
||||||
|
|
||||||
LXML = 'lxml'
|
LXML = 'lxml'
|
||||||
|
|
||||||
|
def _invert(d):
|
||||||
|
"Invert a dictionary."
|
||||||
|
return dict((v,k) for k, v in list(d.items()))
|
||||||
|
|
||||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||||
|
|
||||||
is_xml = True
|
is_xml = True
|
||||||
|
processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
|
NAME = "lxml-xml"
|
||||||
|
ALTERNATE_NAMES = ["xml"]
|
||||||
|
|
||||||
# Well, it's permissive by XML parser standards.
|
# Well, it's permissive by XML parser standards.
|
||||||
features = [LXML, XML, FAST, PERMISSIVE]
|
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||||
|
|
||||||
CHUNK_SIZE = 512
|
CHUNK_SIZE = 512
|
||||||
|
|
||||||
# This namespace mapping is specified in the XML Namespace
|
# This namespace mapping is specified in the XML Namespace
|
||||||
# standard.
|
# standard.
|
||||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
|
||||||
|
|
||||||
|
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
# NOTE: If we parsed Element objects and looked at .sourceline,
|
||||||
|
# we'd be able to see the line numbers from the original document.
|
||||||
|
# But instead we build an XMLParser or HTMLParser object to serve
|
||||||
|
# as the target of parse messages, and those messages don't include
|
||||||
|
# line numbers.
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""Let the BeautifulSoup object know about the standard namespace
|
||||||
|
mapping.
|
||||||
|
"""
|
||||||
|
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||||
|
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
def _register_namespaces(self, mapping):
|
||||||
|
"""Let the BeautifulSoup object know about namespaces encountered
|
||||||
|
while parsing the document.
|
||||||
|
|
||||||
|
This might be useful later on when creating CSS selectors.
|
||||||
|
"""
|
||||||
|
for key, value in list(mapping.items()):
|
||||||
|
if key and key not in self.soup._namespaces:
|
||||||
|
# Let the BeautifulSoup object know about a new namespace.
|
||||||
|
# If there are multiple namespaces defined with the same
|
||||||
|
# prefix, the first one in the document takes precedence.
|
||||||
|
self.soup._namespaces[key] = value
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
# This can either return a parser object or a class, which
|
# This can either return a parser object or a class, which
|
||||||
|
@ -46,12 +95,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# Use the default parser.
|
# Use the default parser.
|
||||||
parser = self.default_parser(encoding)
|
parser = self.default_parser(encoding)
|
||||||
|
|
||||||
if isinstance(parser, collections.Callable):
|
if isinstance(parser, Callable):
|
||||||
# Instantiate the parser with default arguments
|
# Instantiate the parser with default arguments
|
||||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def __init__(self, parser=None, empty_element_tags=None):
|
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
||||||
# TODO: Issue a warning if parser is present but not a
|
# TODO: Issue a warning if parser is present but not a
|
||||||
# callable, since that means there's no way to create new
|
# callable, since that means there's no way to create new
|
||||||
# parsers for different encodings.
|
# parsers for different encodings.
|
||||||
|
@ -59,7 +108,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if empty_element_tags is not None:
|
if empty_element_tags is not None:
|
||||||
self.empty_element_tags = set(empty_element_tags)
|
self.empty_element_tags = set(empty_element_tags)
|
||||||
self.soup = None
|
self.soup = None
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||||
|
|
||||||
def _getNsTag(self, tag):
|
def _getNsTag(self, tag):
|
||||||
# Split the namespace URL out of a fully-qualified lxml tag
|
# Split the namespace URL out of a fully-qualified lxml tag
|
||||||
|
@ -70,6 +120,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
return (None, tag)
|
return (None, tag)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
|
exclude_encodings=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None):
|
||||||
"""
|
"""
|
||||||
:yield: A series of 4-tuples.
|
:yield: A series of 4-tuples.
|
||||||
|
@ -78,31 +129,37 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
Each 4-tuple represents a strategy for parsing the document.
|
Each 4-tuple represents a strategy for parsing the document.
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, unicode):
|
|
||||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
|
||||||
# this system?
|
|
||||||
yield markup, None, document_declared_encoding, False
|
|
||||||
|
|
||||||
if isinstance(markup, unicode):
|
|
||||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
|
||||||
# tell lxml to parse it as UTF-8.
|
|
||||||
yield (markup.encode("utf8"), "utf8",
|
|
||||||
document_declared_encoding, False)
|
|
||||||
|
|
||||||
# Instead of using UnicodeDammit to convert the bytestring to
|
# Instead of using UnicodeDammit to convert the bytestring to
|
||||||
# Unicode using different encodings, use EncodingDetector to
|
# Unicode using different encodings, use EncodingDetector to
|
||||||
# iterate over the encodings, and tell lxml to try to parse
|
# iterate over the encodings, and tell lxml to try to parse
|
||||||
# the document as each one in turn.
|
# the document as each one in turn.
|
||||||
is_html = not self.is_xml
|
is_html = not self.is_xml
|
||||||
|
if is_html:
|
||||||
|
self.processing_instruction_class = ProcessingInstruction
|
||||||
|
else:
|
||||||
|
self.processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
|
if isinstance(markup, str):
|
||||||
|
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||||
|
# this system?
|
||||||
|
yield markup, None, document_declared_encoding, False
|
||||||
|
|
||||||
|
if isinstance(markup, str):
|
||||||
|
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||||
|
# tell lxml to parse it as UTF-8.
|
||||||
|
yield (markup.encode("utf8"), "utf8",
|
||||||
|
document_declared_encoding, False)
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
detector = EncodingDetector(markup, try_encodings, is_html)
|
detector = EncodingDetector(
|
||||||
|
markup, try_encodings, is_html, exclude_encodings)
|
||||||
for encoding in detector.encodings:
|
for encoding in detector.encodings:
|
||||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
if isinstance(markup, bytes):
|
if isinstance(markup, bytes):
|
||||||
markup = BytesIO(markup)
|
markup = BytesIO(markup)
|
||||||
elif isinstance(markup, unicode):
|
elif isinstance(markup, str):
|
||||||
markup = StringIO(markup)
|
markup = StringIO(markup)
|
||||||
|
|
||||||
# Call feed() at least once, even if the markup is empty,
|
# Call feed() at least once, even if the markup is empty,
|
||||||
|
@ -117,30 +174,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if len(data) != 0:
|
if len(data) != 0:
|
||||||
self.parser.feed(data)
|
self.parser.feed(data)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(e)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
|
||||||
def start(self, name, attrs, nsmap={}):
|
def start(self, name, attrs, nsmap={}):
|
||||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
nsprefix = None
|
nsprefix = None
|
||||||
# Invert each namespace map as it comes in.
|
# Invert each namespace map as it comes in.
|
||||||
if len(self.nsmaps) > 1:
|
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||||
# There are no new namespaces for this tag, but
|
# There are no new namespaces for this tag, but
|
||||||
# non-default namespaces are in play, so we need a
|
# non-default namespaces are in play, so we need a
|
||||||
# separate tag stack to know when they end.
|
# separate tag stack to know when they end.
|
||||||
self.nsmaps.append(None)
|
self.nsmaps.append(None)
|
||||||
elif len(nsmap) > 0:
|
elif len(nsmap) > 0:
|
||||||
# A new namespace mapping has come into play.
|
# A new namespace mapping has come into play.
|
||||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
|
||||||
self.nsmaps.append(inverted_nsmap)
|
# First, Let the BeautifulSoup object know about it.
|
||||||
|
self._register_namespaces(nsmap)
|
||||||
|
|
||||||
|
# Then, add it to our running list of inverted namespace
|
||||||
|
# mappings.
|
||||||
|
self.nsmaps.append(_invert(nsmap))
|
||||||
|
|
||||||
# Also treat the namespace mapping as a set of attributes on the
|
# Also treat the namespace mapping as a set of attributes on the
|
||||||
# tag, so we can recreate it later.
|
# tag, so we can recreate it later.
|
||||||
attrs = attrs.copy()
|
attrs = attrs.copy()
|
||||||
for prefix, namespace in nsmap.items():
|
for prefix, namespace in list(nsmap.items()):
|
||||||
attribute = NamespacedAttribute(
|
attribute = NamespacedAttribute(
|
||||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||||
attrs[attribute] = namespace
|
attrs[attribute] = namespace
|
||||||
|
@ -149,7 +212,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# from lxml with namespaces attached to their names, and
|
# from lxml with namespaces attached to their names, and
|
||||||
# turn then into NamespacedAttribute objects.
|
# turn then into NamespacedAttribute objects.
|
||||||
new_attrs = {}
|
new_attrs = {}
|
||||||
for attr, value in attrs.items():
|
for attr, value in list(attrs.items()):
|
||||||
namespace, attr = self._getNsTag(attr)
|
namespace, attr = self._getNsTag(attr)
|
||||||
if namespace is None:
|
if namespace is None:
|
||||||
new_attrs[attr] = value
|
new_attrs[attr] = value
|
||||||
|
@ -189,7 +252,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
self.nsmaps.pop()
|
self.nsmaps.pop()
|
||||||
|
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
pass
|
self.soup.endData()
|
||||||
|
self.soup.handle_data(target + ' ' + data)
|
||||||
|
self.soup.endData(self.processing_instruction_class)
|
||||||
|
|
||||||
def data(self, content):
|
def data(self, content):
|
||||||
self.soup.handle_data(content)
|
self.soup.handle_data(content)
|
||||||
|
@ -207,13 +272,17 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||||
|
|
||||||
|
|
||||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
|
||||||
features = [LXML, HTML, FAST, PERMISSIVE]
|
NAME = LXML
|
||||||
|
ALTERNATE_NAMES = ["lxml-html"]
|
||||||
|
|
||||||
|
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||||
is_xml = False
|
is_xml = False
|
||||||
|
processing_instruction_class = ProcessingInstruction
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
return etree.HTMLParser
|
return etree.HTMLParser
|
||||||
|
@ -224,10 +293,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
self.parser = self.parser_for(encoding)
|
self.parser = self.parser_for(encoding)
|
||||||
self.parser.feed(markup)
|
self.parser.feed(markup)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(e)
|
||||||
|
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><body>%s</body></html>' % fragment
|
return '<html><body>%s</body></html>' % fragment
|
||||||
|
|
4
lib/bs4/check_block.py
Normal file
4
lib/bs4/check_block.py
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
import requests
|
||||||
|
data = requests.get("https://www.crummy.com/").content
|
||||||
|
from bs4 import _s
|
||||||
|
data = [x for x in _s(data).block_text()]
|
|
@ -3,12 +3,14 @@
|
||||||
|
|
||||||
This library converts a bytestream to Unicode through any means
|
This library converts a bytestream to Unicode through any means
|
||||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||||
Feed Parser. It works best on XML and XML, but it does not rewrite the
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||||
"""
|
"""
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
from htmlentitydefs import codepoint2name
|
from html.entities import codepoint2name
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
|
@ -20,6 +22,8 @@ try:
|
||||||
# PyPI package: cchardet
|
# PyPI package: cchardet
|
||||||
import cchardet
|
import cchardet
|
||||||
def chardet_dammit(s):
|
def chardet_dammit(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return None
|
||||||
return cchardet.detect(s)['encoding']
|
return cchardet.detect(s)['encoding']
|
||||||
except ImportError:
|
except ImportError:
|
||||||
try:
|
try:
|
||||||
|
@ -28,6 +32,8 @@ except ImportError:
|
||||||
# PyPI package: chardet
|
# PyPI package: chardet
|
||||||
import chardet
|
import chardet
|
||||||
def chardet_dammit(s):
|
def chardet_dammit(s):
|
||||||
|
if isinstance(s, str):
|
||||||
|
return None
|
||||||
return chardet.detect(s)['encoding']
|
return chardet.detect(s)['encoding']
|
||||||
#import chardet.constants
|
#import chardet.constants
|
||||||
#chardet.constants._debug = 1
|
#chardet.constants._debug = 1
|
||||||
|
@ -42,10 +48,19 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
xml_encoding_re = re.compile(
|
# Build bytestring and Unicode versions of regular expressions for finding
|
||||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
# a declared encoding inside an XML or HTML document.
|
||||||
html_meta_re = re.compile(
|
xml_encoding = '^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
|
||||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
|
||||||
|
encoding_res = dict()
|
||||||
|
encoding_res[bytes] = {
|
||||||
|
'html' : re.compile(html_meta.encode("ascii"), re.I),
|
||||||
|
'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
|
||||||
|
}
|
||||||
|
encoding_res[str] = {
|
||||||
|
'html' : re.compile(html_meta, re.I),
|
||||||
|
'xml' : re.compile(xml_encoding, re.I)
|
||||||
|
}
|
||||||
|
|
||||||
class EntitySubstitution(object):
|
class EntitySubstitution(object):
|
||||||
|
|
||||||
|
@ -55,15 +70,24 @@ class EntitySubstitution(object):
|
||||||
lookup = {}
|
lookup = {}
|
||||||
reverse_lookup = {}
|
reverse_lookup = {}
|
||||||
characters_for_re = []
|
characters_for_re = []
|
||||||
for codepoint, name in list(codepoint2name.items()):
|
|
||||||
character = unichr(codepoint)
|
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
|
||||||
if codepoint != 34:
|
# entity. We don't want to use it, but we want to recognize it on the way in.
|
||||||
|
#
|
||||||
|
# TODO: Ideally we would be able to recognize all HTML 5 named
|
||||||
|
# entities, but that's a little tricky.
|
||||||
|
extra = [(39, 'apos')]
|
||||||
|
for codepoint, name in list(codepoint2name.items()) + extra:
|
||||||
|
character = chr(codepoint)
|
||||||
|
if codepoint not in (34, 39):
|
||||||
# There's no point in turning the quotation mark into
|
# There's no point in turning the quotation mark into
|
||||||
# ", unless it happens within an attribute value, which
|
# " or the single quote into ', unless it
|
||||||
# is handled elsewhere.
|
# happens within an attribute value, which is handled
|
||||||
|
# elsewhere.
|
||||||
characters_for_re.append(character)
|
characters_for_re.append(character)
|
||||||
lookup[character] = name
|
lookup[character] = name
|
||||||
# But we do want to turn " into the quotation mark.
|
# But we do want to recognize those entities on the way in and
|
||||||
|
# convert them to Unicode characters.
|
||||||
reverse_lookup[name] = character
|
reverse_lookup[name] = character
|
||||||
re_definition = "[%s]" % "".join(characters_for_re)
|
re_definition = "[%s]" % "".join(characters_for_re)
|
||||||
return lookup, reverse_lookup, re.compile(re_definition)
|
return lookup, reverse_lookup, re.compile(re_definition)
|
||||||
|
@ -79,7 +103,7 @@ class EntitySubstitution(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
|
||||||
")")
|
")")
|
||||||
|
|
||||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||||
|
@ -212,8 +236,11 @@ class EncodingDetector:
|
||||||
|
|
||||||
5. Windows-1252.
|
5. Windows-1252.
|
||||||
"""
|
"""
|
||||||
def __init__(self, markup, override_encodings=None, is_html=False):
|
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||||
|
exclude_encodings=None):
|
||||||
self.override_encodings = override_encodings or []
|
self.override_encodings = override_encodings or []
|
||||||
|
exclude_encodings = exclude_encodings or []
|
||||||
|
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||||
self.chardet_encoding = None
|
self.chardet_encoding = None
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
self.declared_encoding = None
|
self.declared_encoding = None
|
||||||
|
@ -224,6 +251,8 @@ class EncodingDetector:
|
||||||
def _usable(self, encoding, tried):
|
def _usable(self, encoding, tried):
|
||||||
if encoding is not None:
|
if encoding is not None:
|
||||||
encoding = encoding.lower()
|
encoding = encoding.lower()
|
||||||
|
if encoding in self.exclude_encodings:
|
||||||
|
return False
|
||||||
if encoding not in tried:
|
if encoding not in tried:
|
||||||
tried.add(encoding)
|
tried.add(encoding)
|
||||||
return True
|
return True
|
||||||
|
@ -266,6 +295,9 @@ class EncodingDetector:
|
||||||
def strip_byte_order_mark(cls, data):
|
def strip_byte_order_mark(cls, data):
|
||||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||||
encoding = None
|
encoding = None
|
||||||
|
if isinstance(data, str):
|
||||||
|
# Unicode data cannot have a byte-order mark.
|
||||||
|
return data, encoding
|
||||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||||
and (data[2:4] != '\x00\x00'):
|
and (data[2:4] != '\x00\x00'):
|
||||||
encoding = 'utf-16be'
|
encoding = 'utf-16be'
|
||||||
|
@ -300,14 +332,22 @@ class EncodingDetector:
|
||||||
xml_endpos = 1024
|
xml_endpos = 1024
|
||||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||||
|
|
||||||
|
if isinstance(markup, bytes):
|
||||||
|
res = encoding_res[bytes]
|
||||||
|
else:
|
||||||
|
res = encoding_res[str]
|
||||||
|
|
||||||
|
xml_re = res['xml']
|
||||||
|
html_re = res['html']
|
||||||
declared_encoding = None
|
declared_encoding = None
|
||||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
|
||||||
if not declared_encoding_match and is_html:
|
if not declared_encoding_match and is_html:
|
||||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
declared_encoding_match = html_re.search(markup, endpos=html_endpos)
|
||||||
if declared_encoding_match is not None:
|
if declared_encoding_match is not None:
|
||||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
declared_encoding = declared_encoding_match.groups()[0]
|
||||||
'ascii')
|
|
||||||
if declared_encoding:
|
if declared_encoding:
|
||||||
|
if isinstance(declared_encoding, bytes):
|
||||||
|
declared_encoding = declared_encoding.decode('ascii', 'replace')
|
||||||
return declared_encoding.lower()
|
return declared_encoding.lower()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -331,18 +371,19 @@ class UnicodeDammit:
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, markup, override_encodings=[],
|
def __init__(self, markup, override_encodings=[],
|
||||||
smart_quotes_to=None, is_html=False):
|
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||||
self.smart_quotes_to = smart_quotes_to
|
self.smart_quotes_to = smart_quotes_to
|
||||||
self.tried_encodings = []
|
self.tried_encodings = []
|
||||||
self.contains_replacement_characters = False
|
self.contains_replacement_characters = False
|
||||||
self.is_html = is_html
|
self.is_html = is_html
|
||||||
|
self.log = logging.getLogger(__name__)
|
||||||
self.detector = EncodingDetector(markup, override_encodings, is_html)
|
self.detector = EncodingDetector(
|
||||||
|
markup, override_encodings, is_html, exclude_encodings)
|
||||||
|
|
||||||
# Short-circuit if the data is in Unicode to begin with.
|
# Short-circuit if the data is in Unicode to begin with.
|
||||||
if isinstance(markup, unicode) or markup == '':
|
if isinstance(markup, str) or markup == '':
|
||||||
self.markup = markup
|
self.markup = markup
|
||||||
self.unicode_markup = unicode(markup)
|
self.unicode_markup = str(markup)
|
||||||
self.original_encoding = None
|
self.original_encoding = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -365,9 +406,10 @@ class UnicodeDammit:
|
||||||
if encoding != "ascii":
|
if encoding != "ascii":
|
||||||
u = self._convert_from(encoding, "replace")
|
u = self._convert_from(encoding, "replace")
|
||||||
if u is not None:
|
if u is not None:
|
||||||
logging.warning(
|
self.log.warning(
|
||||||
"Some characters could not be decoded, and were "
|
"Some characters could not be decoded, and were "
|
||||||
"replaced with REPLACEMENT CHARACTER.")
|
"replaced with REPLACEMENT CHARACTER."
|
||||||
|
)
|
||||||
self.contains_replacement_characters = True
|
self.contains_replacement_characters = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -425,7 +467,7 @@ class UnicodeDammit:
|
||||||
def _to_unicode(self, data, encoding, errors="strict"):
|
def _to_unicode(self, data, encoding, errors="strict"):
|
||||||
'''Given a string and its encoding, decodes the string into Unicode.
|
'''Given a string and its encoding, decodes the string into Unicode.
|
||||||
%encoding is a string recognized by encodings.aliases'''
|
%encoding is a string recognized by encodings.aliases'''
|
||||||
return unicode(data, encoding, errors)
|
return str(data, encoding, errors)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def declared_html_encoding(self):
|
def declared_html_encoding(self):
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
from HTMLParser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
import bs4
|
import bs4
|
||||||
from bs4 import BeautifulSoup, __version__
|
from bs4 import BeautifulSoup, __version__
|
||||||
from bs4.builder import builder_registry
|
from bs4.builder import builder_registry
|
||||||
|
@ -17,8 +21,8 @@ import cProfile
|
||||||
|
|
||||||
def diagnose(data):
|
def diagnose(data):
|
||||||
"""Diagnostic suite for isolating common problems."""
|
"""Diagnostic suite for isolating common problems."""
|
||||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||||
print "Python version %s" % sys.version
|
print("Python version %s" % sys.version)
|
||||||
|
|
||||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||||
for name in basic_parsers:
|
for name in basic_parsers:
|
||||||
|
@ -27,44 +31,60 @@ def diagnose(data):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
basic_parsers.remove(name)
|
basic_parsers.remove(name)
|
||||||
print (
|
print((
|
||||||
"I noticed that %s is not installed. Installing it may help." %
|
"I noticed that %s is not installed. Installing it may help." %
|
||||||
name)
|
name))
|
||||||
|
|
||||||
if 'lxml' in basic_parsers:
|
if 'lxml' in basic_parsers:
|
||||||
basic_parsers.append(["lxml", "xml"])
|
basic_parsers.append("lxml-xml")
|
||||||
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||||
|
except ImportError as e:
|
||||||
|
print (
|
||||||
|
"lxml is not installed or couldn't be imported.")
|
||||||
|
|
||||||
|
|
||||||
if 'html5lib' in basic_parsers:
|
if 'html5lib' in basic_parsers:
|
||||||
|
try:
|
||||||
import html5lib
|
import html5lib
|
||||||
print "Found html5lib version %s" % html5lib.__version__
|
print("Found html5lib version %s" % html5lib.__version__)
|
||||||
|
except ImportError as e:
|
||||||
|
print (
|
||||||
|
"html5lib is not installed or couldn't be imported.")
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif os.path.exists(data):
|
|
||||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
|
||||||
data = open(data).read()
|
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||||
return
|
return
|
||||||
print
|
else:
|
||||||
|
try:
|
||||||
|
if os.path.exists(data):
|
||||||
|
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||||
|
with open(data) as fp:
|
||||||
|
data = fp.read()
|
||||||
|
except ValueError:
|
||||||
|
# This can happen on some platforms when the 'filename' is
|
||||||
|
# too long. Assume it's data and not a filename.
|
||||||
|
pass
|
||||||
|
print()
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print "Trying to parse your markup with %s" % parser
|
print("Trying to parse your markup with %s" % parser)
|
||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, features=parser)
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "Here's what %s did with the markup:" % parser
|
print("Here's what %s did with the markup:" % parser)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
||||||
print "-" * 80
|
print("-" * 80)
|
||||||
|
|
||||||
def lxml_trace(data, html=True, **kwargs):
|
def lxml_trace(data, html=True, **kwargs):
|
||||||
"""Print out the lxml events that occur during parsing.
|
"""Print out the lxml events that occur during parsing.
|
||||||
|
@ -74,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
|
||||||
"""
|
"""
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||||
|
|
||||||
class AnnouncingParser(HTMLParser):
|
class AnnouncingParser(HTMLParser):
|
||||||
"""Announces HTMLParser parse events, without doing anything else."""
|
"""Announces HTMLParser parse events, without doing anything else."""
|
||||||
|
@ -156,9 +176,9 @@ def rdoc(num_elements=1000):
|
||||||
|
|
||||||
def benchmark_parsers(num_elements=100000):
|
def benchmark_parsers(num_elements=100000):
|
||||||
"""Very basic head-to-head performance benchmark."""
|
"""Very basic head-to-head performance benchmark."""
|
||||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||||
data = rdoc(num_elements)
|
data = rdoc(num_elements)
|
||||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||||
|
|
||||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||||
success = False
|
success = False
|
||||||
|
@ -167,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, parser)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
a = time.time()
|
a = time.time()
|
||||||
etree.HTML(data)
|
etree.HTML(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
parser = html5lib.HTMLParser()
|
parser = html5lib.HTMLParser()
|
||||||
a = time.time()
|
a = time.time()
|
||||||
parser.parse(data)
|
parser.parse(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
def profile(num_elements=100000, parser="lxml"):
|
def profile(num_elements=100000, parser="lxml"):
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
99
lib/bs4/formatter.py
Normal file
99
lib/bs4/formatter.py
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
from bs4.dammit import EntitySubstitution
|
||||||
|
|
||||||
|
class Formatter(EntitySubstitution):
|
||||||
|
"""Describes a strategy to use when outputting a parse tree to a string.
|
||||||
|
|
||||||
|
Some parts of this strategy come from the distinction between
|
||||||
|
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||||
|
"""
|
||||||
|
# Registries of XML and HTML formatters.
|
||||||
|
XML_FORMATTERS = {}
|
||||||
|
HTML_FORMATTERS = {}
|
||||||
|
|
||||||
|
HTML = 'html'
|
||||||
|
XML = 'xml'
|
||||||
|
|
||||||
|
HTML_DEFAULTS = dict(
|
||||||
|
cdata_containing_tags=set(["script", "style"]),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _default(self, language, value, kwarg):
|
||||||
|
if value is not None:
|
||||||
|
return value
|
||||||
|
if language == self.XML:
|
||||||
|
return set()
|
||||||
|
return self.HTML_DEFAULTS[kwarg]
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, language=None, entity_substitution=None,
|
||||||
|
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param void_element_close_prefix: By default, represent void
|
||||||
|
elements as <tag/> rather than <tag>
|
||||||
|
"""
|
||||||
|
self.language = language
|
||||||
|
self.entity_substitution = entity_substitution
|
||||||
|
self.void_element_close_prefix = void_element_close_prefix
|
||||||
|
self.cdata_containing_tags = self._default(
|
||||||
|
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||||
|
)
|
||||||
|
|
||||||
|
def substitute(self, ns):
|
||||||
|
"""Process a string that needs to undergo entity substitution."""
|
||||||
|
if not self.entity_substitution:
|
||||||
|
return ns
|
||||||
|
from .element import NavigableString
|
||||||
|
if (isinstance(ns, NavigableString)
|
||||||
|
and ns.parent is not None
|
||||||
|
and ns.parent.name in self.cdata_containing_tags):
|
||||||
|
# Do nothing.
|
||||||
|
return ns
|
||||||
|
# Substitute.
|
||||||
|
return self.entity_substitution(ns)
|
||||||
|
|
||||||
|
def attribute_value(self, value):
|
||||||
|
"""Process the value of an attribute."""
|
||||||
|
return self.substitute(value)
|
||||||
|
|
||||||
|
def attributes(self, tag):
|
||||||
|
"""Reorder a tag's attributes however you want."""
|
||||||
|
return sorted(tag.attrs.items())
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLFormatter(Formatter):
|
||||||
|
REGISTRY = {}
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class XMLFormatter(Formatter):
|
||||||
|
REGISTRY = {}
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# Set up aliases for the default formatters.
|
||||||
|
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html,
|
||||||
|
void_element_close_prefix = None
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_xml
|
||||||
|
)
|
||||||
|
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
|
||||||
|
entity_substitution=None
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY["html"] = XMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_html
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
|
||||||
|
entity_substitution=EntitySubstitution.substitute_xml
|
||||||
|
)
|
||||||
|
XMLFormatter.REGISTRY[None] = Formatter(
|
||||||
|
Formatter(Formatter.XML, entity_substitution=None)
|
||||||
|
)
|
|
@ -1,5 +1,10 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Helper classes for tests."""
|
"""Helper classes for tests."""
|
||||||
|
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
|
__license__ = "MIT"
|
||||||
|
|
||||||
|
import pickle
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -11,29 +16,66 @@ from bs4.element import (
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
Doctype,
|
Doctype,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
Tag
|
||||||
)
|
)
|
||||||
|
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
default_builder = HTMLParserTreeBuilder
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
BAD_DOCUMENT = """A bare string
|
||||||
|
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
|
||||||
|
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
|
||||||
|
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
|
||||||
|
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
|
||||||
|
<div>A <meta> tag</div>
|
||||||
|
<div>A <br> tag that supposedly has contents.</br></div>
|
||||||
|
<div>AT&T</div>
|
||||||
|
<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
|
||||||
|
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
|
||||||
|
<div>This numeric entity is missing the final semicolon: <x t="piñata"></div>
|
||||||
|
<div><a href="http://example.com/</a> that attribute value never got closed</div>
|
||||||
|
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
|
||||||
|
<! This document starts with a bogus declaration ><div>a</div>
|
||||||
|
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
|
||||||
|
<div>This document ends with <!an incomplete declaration
|
||||||
|
<div><a style={height:21px;}>That attribute value was bogus</a></div>
|
||||||
|
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
|
||||||
|
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
|
||||||
|
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
|
||||||
|
<div>This document ends before the entity finishes: >
|
||||||
|
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
|
||||||
|
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
|
||||||
|
<div><table><tr><td>Here's a table</td></tr></table></div>
|
||||||
|
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
|
||||||
|
<div>This tag contains nothing but whitespace: <b> </b></div>
|
||||||
|
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
|
||||||
|
<div><table><div>This table contains bare markup</div></table></div>
|
||||||
|
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
|
||||||
|
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
|
||||||
|
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
|
||||||
|
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
|
||||||
|
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class SoupTest(unittest.TestCase):
|
class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return default_builder()
|
return default_builder
|
||||||
|
|
||||||
def soup(self, markup, **kwargs):
|
def soup(self, markup, **kwargs):
|
||||||
"""Build a Beautiful Soup object from markup."""
|
"""Build a Beautiful Soup object from markup."""
|
||||||
builder = kwargs.pop('builder', self.default_builder)
|
builder = kwargs.pop('builder', self.default_builder)
|
||||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||||
|
|
||||||
def document_for(self, markup):
|
def document_for(self, markup, **kwargs):
|
||||||
"""Turn an HTML fragment into a document.
|
"""Turn an HTML fragment into a document.
|
||||||
|
|
||||||
The details depend on the builder.
|
The details depend on the builder.
|
||||||
"""
|
"""
|
||||||
return self.default_builder.test_fragment_to_document(markup)
|
return self.default_builder(**kwargs).test_fragment_to_document(markup)
|
||||||
|
|
||||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||||
builder = self.default_builder
|
builder = self.default_builder
|
||||||
|
@ -43,6 +85,131 @@ class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||||
|
|
||||||
|
def assertConnectedness(self, element):
|
||||||
|
"""Ensure that next_element and previous_element are properly
|
||||||
|
set for all descendants of the given element.
|
||||||
|
"""
|
||||||
|
earlier = None
|
||||||
|
for e in element.descendants:
|
||||||
|
if earlier:
|
||||||
|
self.assertEqual(e, earlier.next_element)
|
||||||
|
self.assertEqual(earlier, e.previous_element)
|
||||||
|
earlier = e
|
||||||
|
|
||||||
|
def linkage_validator(self, el, _recursive_call=False):
|
||||||
|
"""Ensure proper linkage throughout the document."""
|
||||||
|
descendant = None
|
||||||
|
# Document element should have no previous element or previous sibling.
|
||||||
|
# It also shouldn't have a next sibling.
|
||||||
|
if el.parent is None:
|
||||||
|
assert el.previous_element is None,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_element, None
|
||||||
|
)
|
||||||
|
assert el.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_sibling, None
|
||||||
|
)
|
||||||
|
assert el.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
idx = 0
|
||||||
|
child = None
|
||||||
|
last_child = None
|
||||||
|
last_idx = len(el.contents) - 1
|
||||||
|
for child in el.contents:
|
||||||
|
descendant = None
|
||||||
|
|
||||||
|
# Parent should link next element to their first child
|
||||||
|
# That child should have no previous sibling
|
||||||
|
if idx == 0:
|
||||||
|
if el.parent is not None:
|
||||||
|
assert el.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_element, child
|
||||||
|
)
|
||||||
|
assert child.previous_element is el,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_element, el
|
||||||
|
)
|
||||||
|
assert child.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# If not the first child, previous index should link as sibling to this index
|
||||||
|
# Previous element should match the last index or the last bubbled up descendant
|
||||||
|
else:
|
||||||
|
assert child.previous_sibling is el.contents[idx - 1],\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
|
||||||
|
child, child.previous_sibling, el.contents[idx - 1]
|
||||||
|
)
|
||||||
|
assert el.contents[idx - 1].next_sibling is child,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if last_child is not None:
|
||||||
|
assert child.previous_element is last_child,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
|
||||||
|
child, child.previous_element, last_child, child.parent.contents
|
||||||
|
)
|
||||||
|
assert last_child.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
last_child, last_child.next_element, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = self.linkage_validator(child, True)
|
||||||
|
# A bubbled up descendant should have no next siblings
|
||||||
|
assert descendant.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
descendant, descendant.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mark last child as either the bubbled up descendant or the current child
|
||||||
|
if descendant is not None:
|
||||||
|
last_child = descendant
|
||||||
|
else:
|
||||||
|
last_child = child
|
||||||
|
|
||||||
|
# If last child, there are non next siblings
|
||||||
|
if idx == last_idx:
|
||||||
|
assert child.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_sibling, None
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
|
||||||
|
child = descendant if descendant is not None else child
|
||||||
|
if child is None:
|
||||||
|
child = el
|
||||||
|
|
||||||
|
if not _recursive_call and child is not None:
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
assert child.next_element is None, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, None
|
||||||
|
)
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
assert child.next_element is target.next_sibling, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, target.next_sibling
|
||||||
|
)
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
|
# We are done, so nothing to return
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
# Return the child to the recursive caller
|
||||||
|
return child
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(object):
|
class HTMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
@ -54,6 +221,27 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
markup in these tests, there's not much room for interpretation.
|
markup in these tests, there's not much room for interpretation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
|
||||||
|
are handled correctly.
|
||||||
|
"""
|
||||||
|
for name in [
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
'spacer', 'frame'
|
||||||
|
]:
|
||||||
|
soup = self.soup("")
|
||||||
|
new_tag = soup.new_tag(name)
|
||||||
|
self.assertEqual(True, new_tag.is_empty_element)
|
||||||
|
|
||||||
|
def test_pickle_and_unpickle_identity(self):
|
||||||
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
|
# to the original.
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||||
|
self.assertEqual(loaded.decode(), tree.decode())
|
||||||
|
|
||||||
def assertDoctypeHandled(self, doctype_fragment):
|
def assertDoctypeHandled(self, doctype_fragment):
|
||||||
"""Assert that a given doctype string is handled correctly."""
|
"""Assert that a given doctype string is handled correctly."""
|
||||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||||
|
@ -114,6 +302,27 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup.encode("utf-8").replace(b"\n", b""),
|
soup.encode("utf-8").replace(b"\n", b""),
|
||||||
markup.replace(b"\n", b""))
|
markup.replace(b"\n", b""))
|
||||||
|
|
||||||
|
def test_namespaced_html(self):
|
||||||
|
"""When a namespaced XML document is parsed as HTML it should
|
||||||
|
be treated as HTML with weird tag names.
|
||||||
|
"""
|
||||||
|
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, len(soup.find_all("ns1:foo")))
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
# We test both Unicode and bytestring to verify that
|
||||||
|
# process_markup correctly sets processing_instruction_class
|
||||||
|
# even when the markup is already Unicode and there is no
|
||||||
|
# need to process anything.
|
||||||
|
markup = """<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.decode())
|
||||||
|
|
||||||
|
markup = b"""<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
def test_deepcopy(self):
|
def test_deepcopy(self):
|
||||||
"""Make sure you can copy the tree builder.
|
"""Make sure you can copy the tree builder.
|
||||||
|
|
||||||
|
@ -155,6 +364,23 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
def test_nested_formatting_elements(self):
|
def test_nested_formatting_elements(self):
|
||||||
self.assertSoupEquals("<em><em></em></em>")
|
self.assertSoupEquals("<em><em></em></em>")
|
||||||
|
|
||||||
|
def test_double_head(self):
|
||||||
|
html = '''<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Ordinary HEAD element test</title>
|
||||||
|
</head>
|
||||||
|
<script type="text/javascript">
|
||||||
|
alert("Help!");
|
||||||
|
</script>
|
||||||
|
<body>
|
||||||
|
Hello, world!
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
soup = self.soup(html)
|
||||||
|
self.assertEqual("text/javascript", soup.find('script')['type'])
|
||||||
|
|
||||||
def test_comment(self):
|
def test_comment(self):
|
||||||
# Comments are represented as Comment objects.
|
# Comments are represented as Comment objects.
|
||||||
markup = "<p>foo<!--foobar-->baz</p>"
|
markup = "<p>foo<!--foobar-->baz</p>"
|
||||||
|
@ -171,9 +397,22 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(comment, baz.previous_element)
|
self.assertEqual(comment, baz.previous_element)
|
||||||
|
|
||||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||||
self.assertSoupEquals("<pre> </pre>")
|
even if that would mean not prettifying the markup.
|
||||||
self.assertSoupEquals("<textarea> woo </textarea>")
|
"""
|
||||||
|
pre_markup = "<pre> </pre>"
|
||||||
|
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
||||||
|
self.assertSoupEquals(pre_markup)
|
||||||
|
self.assertSoupEquals(textarea_markup)
|
||||||
|
|
||||||
|
soup = self.soup(pre_markup)
|
||||||
|
self.assertEqual(soup.pre.prettify(), pre_markup)
|
||||||
|
|
||||||
|
soup = self.soup(textarea_markup)
|
||||||
|
self.assertEqual(soup.textarea.prettify(), textarea_markup)
|
||||||
|
|
||||||
|
soup = self.soup("<textarea></textarea>")
|
||||||
|
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
|
||||||
|
|
||||||
def test_nested_inline_elements(self):
|
def test_nested_inline_elements(self):
|
||||||
"""Inline elements can be nested indefinitely."""
|
"""Inline elements can be nested indefinitely."""
|
||||||
|
@ -213,6 +452,18 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_multivalued_attribute_with_whitespace(self):
|
||||||
|
# Whitespace separating the values of a multi-valued attribute
|
||||||
|
# should be ignored.
|
||||||
|
|
||||||
|
markup = '<div class=" foo bar "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(['foo', 'bar'], soup.div['class'])
|
||||||
|
|
||||||
|
# If you search by the literal name of the class it's like the whitespace
|
||||||
|
# wasn't there.
|
||||||
|
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
|
||||||
|
|
||||||
def test_deeply_nested_multivalued_attribute(self):
|
def test_deeply_nested_multivalued_attribute(self):
|
||||||
# html5lib can set the attributes of the same tag many times
|
# html5lib can set the attributes of the same tag many times
|
||||||
# as it rearranges the tree. This has caused problems with
|
# as it rearranges the tree. This has caused problems with
|
||||||
|
@ -221,18 +472,52 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(["css"], soup.div.div['class'])
|
self.assertEqual(["css"], soup.div.div['class'])
|
||||||
|
|
||||||
|
def test_multivalued_attribute_on_html(self):
|
||||||
|
# html5lib uses a different API to set the attributes ot the
|
||||||
|
# <html> tag. This has caused problems with multivalued
|
||||||
|
# attributes.
|
||||||
|
markup = '<html class="a b"></html>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(["a", "b"], soup.html['class'])
|
||||||
|
|
||||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||||
|
|
||||||
|
def test_strings_resembling_character_entity_references(self):
|
||||||
|
# "&T" and "&p" look like incomplete character entities, but they are
|
||||||
|
# not.
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>• AT&T is in the s&p 500</p>",
|
||||||
|
"<p>\u2022 AT&T is in the s&p 500</p>"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_apos_entity(self):
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# “ and ” are invalid numeric entities referencing
|
||||||
|
# Windows-1252 characters. - references a character common
|
||||||
|
# to Windows-1252 and Unicode, and ☃ references a
|
||||||
|
# character only found in Unicode.
|
||||||
|
#
|
||||||
|
# All of these entities should be converted to Unicode
|
||||||
|
# characters.
|
||||||
|
markup = "<p>“Hello” -☃</p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("“Hello” -☃", soup.p.string)
|
||||||
|
|
||||||
def test_entities_in_attributes_converted_to_unicode(self):
|
def test_entities_in_attributes_converted_to_unicode(self):
|
||||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
|
||||||
def test_entities_in_text_converted_to_unicode(self):
|
def test_entities_in_text_converted_to_unicode(self):
|
||||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
@ -243,7 +528,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
'<p>I said "good day!"</p>')
|
'<p>I said "good day!"</p>')
|
||||||
|
|
||||||
def test_out_of_range_entity(self):
|
def test_out_of_range_entity(self):
|
||||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
expect = "\N{REPLACEMENT CHARACTER}"
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
|
@ -253,6 +538,42 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||||
self.assertEqual("p", soup.p.name)
|
self.assertEqual("p", soup.p.name)
|
||||||
|
self.assertConnectedness(soup)
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify consistent handling of empty-element tags,
|
||||||
|
no matter how they come in through the markup.
|
||||||
|
"""
|
||||||
|
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
|
||||||
|
|
||||||
|
def test_head_tag_between_head_and_body(self):
|
||||||
|
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||||
|
content = """<html><head></head>
|
||||||
|
<link></link>
|
||||||
|
<body>foo</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(content)
|
||||||
|
self.assertNotEqual(None, soup.html.body)
|
||||||
|
self.assertConnectedness(soup)
|
||||||
|
|
||||||
|
def test_multiple_copies_of_a_tag(self):
|
||||||
|
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||||
|
content = """<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<article id="a" >
|
||||||
|
<div><a href="1"></div>
|
||||||
|
<footer>
|
||||||
|
<a href="2"></a>
|
||||||
|
</footer>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(content)
|
||||||
|
self.assertConnectedness(soup.article)
|
||||||
|
|
||||||
def test_basic_namespaces(self):
|
def test_basic_namespaces(self):
|
||||||
"""Parsers don't need to *understand* namespaces, but at the
|
"""Parsers don't need to *understand* namespaces, but at the
|
||||||
|
@ -285,9 +606,9 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# A seemingly innocuous document... but it's in Unicode! And
|
# A seemingly innocuous document... but it's in Unicode! And
|
||||||
# it contains characters that can't be represented in the
|
# it contains characters that can't be represented in the
|
||||||
# encoding found in the declaration! The horror!
|
# encoding found in the declaration! The horror!
|
||||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
|
||||||
|
|
||||||
def test_soupstrainer(self):
|
def test_soupstrainer(self):
|
||||||
"""Parsers should be able to work with SoupStrainers."""
|
"""Parsers should be able to work with SoupStrainers."""
|
||||||
|
@ -327,7 +648,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# Both XML and HTML entities are converted to Unicode characters
|
# Both XML and HTML entities are converted to Unicode characters
|
||||||
# during parsing.
|
# during parsing.
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||||
self.assertSoupEquals(text, expected)
|
self.assertSoupEquals(text, expected)
|
||||||
|
|
||||||
def test_smart_quotes_converted_on_the_way_in(self):
|
def test_smart_quotes_converted_on_the_way_in(self):
|
||||||
|
@ -337,15 +658,15 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(quote)
|
soup = self.soup(quote)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.p.string,
|
soup.p.string,
|
||||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||||
|
|
||||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||||
soup = self.soup("<a> </a>")
|
soup = self.soup("<a> </a>")
|
||||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
|
||||||
|
|
||||||
def test_entities_converted_on_the_way_out(self):
|
def test_entities_converted_on_the_way_out(self):
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||||
soup = self.soup(text)
|
soup = self.soup(text)
|
||||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||||
|
|
||||||
|
@ -354,7 +675,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
# easy-to-understand document.
|
# easy-to-understand document.
|
||||||
|
|
||||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||||
|
|
||||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||||
# that to test.
|
# that to test.
|
||||||
|
@ -399,7 +720,9 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||||
soup = self.soup(
|
soup = self.soup(
|
||||||
hebrew_document, from_encoding="iso8859-8")
|
hebrew_document, from_encoding="iso8859-8")
|
||||||
self.assertEqual(soup.original_encoding, 'iso8859-8')
|
# Some tree builders call it iso8859-8, others call it iso-8859-9.
|
||||||
|
# That's not a difference we really care about.
|
||||||
|
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode('utf-8'),
|
soup.encode('utf-8'),
|
||||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||||
|
@ -461,13 +784,39 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
data.a['foo'] = 'bar'
|
data.a['foo'] = 'bar'
|
||||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class XMLTreeBuilderSmokeTest(object):
|
class XMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
|
def test_pickle_and_unpickle_identity(self):
|
||||||
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
|
# to the original.
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||||
|
self.assertEqual(loaded.decode(), tree.decode())
|
||||||
|
|
||||||
def test_docstring_generated(self):
|
def test_docstring_generated(self):
|
||||||
soup = self.soup("<root/>")
|
soup = self.soup("<root/>")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||||
|
|
||||||
|
def test_xml_declaration(self):
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
def test_real_xhtml_document(self):
|
def test_real_xhtml_document(self):
|
||||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
@ -480,12 +829,23 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode("utf-8"), markup)
|
soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
def test_nested_namespaces(self):
|
||||||
|
doc = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||||
|
<parent xmlns="http://ns1/">
|
||||||
|
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
|
||||||
|
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
|
||||||
|
</child>
|
||||||
|
</parent>"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
self.assertEqual(doc, soup.encode())
|
||||||
|
|
||||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||||
doc = """
|
doc = """
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
</script>
|
</script>
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(doc, "xml")
|
soup = BeautifulSoup(doc, "lxml-xml")
|
||||||
# lxml would have stripped this while parsing, but we can add
|
# lxml would have stripped this while parsing, but we can add
|
||||||
# it later.
|
# it later.
|
||||||
soup.script.string = 'console.log("< < hey > > ");'
|
soup.script.string = 'console.log("< < hey > > ");'
|
||||||
|
@ -493,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertTrue(b"< < hey > >" in encoded)
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
def test_can_parse_unicode_document(self):
|
def test_can_parse_unicode_document(self):
|
||||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
|
||||||
|
|
||||||
def test_popping_namespaced_tag(self):
|
def test_popping_namespaced_tag(self):
|
||||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
unicode(soup.rss), markup)
|
str(soup.rss), markup)
|
||||||
|
|
||||||
def test_docstring_includes_correct_encoding(self):
|
def test_docstring_includes_correct_encoding(self):
|
||||||
soup = self.soup("<root/>")
|
soup = self.soup("<root/>")
|
||||||
|
@ -532,17 +892,57 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
def test_closing_namespaced_tag(self):
|
def test_closing_namespaced_tag(self):
|
||||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.p), markup)
|
self.assertEqual(str(soup.p), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes(self):
|
def test_namespaced_attributes(self):
|
||||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes_xml_namespace(self):
|
def test_namespaced_attributes_xml_namespace(self):
|
||||||
markup = '<foo xml:lang="fr">bar</foo>'
|
markup = '<foo xml:lang="fr">bar</foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
|
def test_find_by_prefixed_name(self):
|
||||||
|
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Document xmlns="http://example.com/ns0"
|
||||||
|
xmlns:ns1="http://example.com/ns1"
|
||||||
|
xmlns:ns2="http://example.com/ns2"
|
||||||
|
<ns1:tag>foo</ns1:tag>
|
||||||
|
<ns1:tag>bar</ns1:tag>
|
||||||
|
<ns2:tag key="value">baz</ns2:tag>
|
||||||
|
</Document>
|
||||||
|
"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
|
||||||
|
# There are three <tag> tags.
|
||||||
|
self.assertEqual(3, len(soup.find_all('tag')))
|
||||||
|
|
||||||
|
# But two of them are ns1:tag and one of them is ns2:tag.
|
||||||
|
self.assertEqual(2, len(soup.find_all('ns1:tag')))
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag')))
|
||||||
|
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
|
||||||
|
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
|
||||||
|
|
||||||
|
def test_copy_tag_preserves_namespace(self):
|
||||||
|
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<w:document xmlns:w="http://example.com/ns0"/>"""
|
||||||
|
|
||||||
|
soup = self.soup(xml)
|
||||||
|
tag = soup.document
|
||||||
|
duplicate = copy.copy(tag)
|
||||||
|
|
||||||
|
# The two tags have the same namespace prefix.
|
||||||
|
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
"""Smoke test for a tree builder that supports HTML5."""
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
1
lib/bs4/tests/__init__.py
Normal file
1
lib/bs4/tests/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
"The beautifulsoup tests."
|
147
lib/bs4/tests/test_builder_registry.py
Normal file
147
lib/bs4/tests/test_builder_registry.py
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
"""Tests of the builder registry."""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.builder import (
|
||||||
|
builder_registry as registry,
|
||||||
|
HTMLParserTreeBuilder,
|
||||||
|
TreeBuilderRegistry,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import HTML5TreeBuilder
|
||||||
|
HTML5LIB_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
HTML5LIB_PRESENT = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import (
|
||||||
|
LXMLTreeBuilderForXML,
|
||||||
|
LXMLTreeBuilder,
|
||||||
|
)
|
||||||
|
LXML_PRESENT = True
|
||||||
|
except ImportError:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
|
||||||
|
|
||||||
|
class BuiltInRegistryTest(unittest.TestCase):
|
||||||
|
"""Test the built-in registry with the default builders registered."""
|
||||||
|
|
||||||
|
def test_combination(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('fast', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('strict', 'html'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
def test_lookup_by_markup_type(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||||
|
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('xml'), None)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||||
|
else:
|
||||||
|
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_named_library(self):
|
||||||
|
if LXML_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||||
|
LXMLTreeBuilderForXML)
|
||||||
|
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||||
|
LXMLTreeBuilder)
|
||||||
|
if HTML5LIB_PRESENT:
|
||||||
|
self.assertEqual(registry.lookup('html5lib'),
|
||||||
|
HTML5TreeBuilder)
|
||||||
|
|
||||||
|
self.assertEqual(registry.lookup('html.parser'),
|
||||||
|
HTMLParserTreeBuilder)
|
||||||
|
|
||||||
|
def test_beautifulsoup_constructor_does_lookup(self):
|
||||||
|
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
# This will create a warning about not explicitly
|
||||||
|
# specifying a parser, but we'll ignore it.
|
||||||
|
|
||||||
|
# You can pass in a string.
|
||||||
|
BeautifulSoup("", features="html")
|
||||||
|
# Or a list of strings.
|
||||||
|
BeautifulSoup("", features=["html", "fast"])
|
||||||
|
|
||||||
|
# You'll get an exception if BS can't find an appropriate
|
||||||
|
# builder.
|
||||||
|
self.assertRaises(ValueError, BeautifulSoup,
|
||||||
|
"", features="no-such-feature")
|
||||||
|
|
||||||
|
class RegistryTest(unittest.TestCase):
|
||||||
|
"""Test the TreeBuilderRegistry class in general."""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.registry = TreeBuilderRegistry()
|
||||||
|
|
||||||
|
def builder_for_features(self, *feature_list):
|
||||||
|
cls = type('Builder_' + '_'.join(feature_list),
|
||||||
|
(object,), {'features' : feature_list})
|
||||||
|
|
||||||
|
self.registry.register(cls)
|
||||||
|
return cls
|
||||||
|
|
||||||
|
def test_register_with_no_features(self):
|
||||||
|
builder = self.builder_for_features()
|
||||||
|
|
||||||
|
# Since the builder advertises no features, you can't find it
|
||||||
|
# by looking up features.
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), None)
|
||||||
|
|
||||||
|
# But you can find it by doing a lookup with no features, if
|
||||||
|
# this happens to be the only registered builder.
|
||||||
|
self.assertEqual(self.registry.lookup(), builder)
|
||||||
|
|
||||||
|
def test_register_with_features_makes_lookup_succeed(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||||
|
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||||
|
builder = self.builder_for_features('foo', 'bar')
|
||||||
|
self.assertEqual(self.registry.lookup('baz'), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||||
|
builder1 = self.builder_for_features('foo')
|
||||||
|
builder2 = self.builder_for_features('bar')
|
||||||
|
self.assertEqual(self.registry.lookup(), builder2)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||||
|
self.assertEqual(self.registry.lookup(), None)
|
||||||
|
|
||||||
|
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||||
|
has_one = self.builder_for_features('foo')
|
||||||
|
has_the_other = self.builder_for_features('bar')
|
||||||
|
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||||
|
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||||
|
lacks_one = self.builder_for_features('bar')
|
||||||
|
has_the_other = self.builder_for_features('foo')
|
||||||
|
|
||||||
|
# There are two builders featuring 'foo' and 'bar', but
|
||||||
|
# the one that also features 'quux' was registered later.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||||
|
has_both_late)
|
||||||
|
|
||||||
|
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||||
|
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||||
|
has_both_early)
|
||||||
|
|
||||||
|
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||||
|
builder1 = self.builder_for_features('foo', 'bar')
|
||||||
|
builder2 = self.builder_for_features('foo', 'baz')
|
||||||
|
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
36
lib/bs4/tests/test_docs.py
Normal file
36
lib/bs4/tests/test_docs.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
"Test harness for doctests."
|
||||||
|
|
||||||
|
# pylint: disable-msg=E0611,W0142
|
||||||
|
|
||||||
|
__metaclass__ = type
|
||||||
|
__all__ = [
|
||||||
|
'additional_tests',
|
||||||
|
]
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
import doctest
|
||||||
|
import os
|
||||||
|
#from pkg_resources import (
|
||||||
|
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
DOCTEST_FLAGS = (
|
||||||
|
doctest.ELLIPSIS |
|
||||||
|
doctest.NORMALIZE_WHITESPACE |
|
||||||
|
doctest.REPORT_NDIFF)
|
||||||
|
|
||||||
|
|
||||||
|
# def additional_tests():
|
||||||
|
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||||
|
# doctest_files = [
|
||||||
|
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||||
|
# if resource_exists('bs4', 'docs'):
|
||||||
|
# for name in resource_listdir('bs4', 'docs'):
|
||||||
|
# if name.endswith('.txt'):
|
||||||
|
# doctest_files.append(
|
||||||
|
# os.path.abspath(
|
||||||
|
# resource_filename('bs4', 'docs/%s' % name)))
|
||||||
|
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||||
|
# atexit.register(cleanup_resources)
|
||||||
|
# return unittest.TestSuite((
|
||||||
|
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
184
lib/bs4/tests/test_html5lib.py
Normal file
184
lib/bs4/tests/test_html5lib.py
Normal file
|
@ -0,0 +1,184 @@
|
||||||
|
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import HTML5TreeBuilder
|
||||||
|
HTML5LIB_PRESENT = True
|
||||||
|
except ImportError as e:
|
||||||
|
HTML5LIB_PRESENT = False
|
||||||
|
from bs4.element import SoupStrainer
|
||||||
|
from bs4.testing import (
|
||||||
|
HTML5TreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not HTML5LIB_PRESENT,
|
||||||
|
"html5lib seems not to be present, not testing its tree builder.")
|
||||||
|
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
|
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return HTML5TreeBuilder
|
||||||
|
|
||||||
|
def test_soupstrainer(self):
|
||||||
|
# The html5lib tree builder does not support SoupStrainers.
|
||||||
|
strainer = SoupStrainer("b")
|
||||||
|
markup = "<p>A <b>bold</b> statement.</p>"
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(markup, parse_only=strainer)
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for(markup))
|
||||||
|
|
||||||
|
self.assertTrue(
|
||||||
|
"the html5lib tree builder doesn't support parse_only" in
|
||||||
|
str(w[0].message))
|
||||||
|
|
||||||
|
def test_correctly_nested_tables(self):
|
||||||
|
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||||
|
markup = ('<table id="1">'
|
||||||
|
'<tr>'
|
||||||
|
"<td>Here's another table:"
|
||||||
|
'<table id="2">'
|
||||||
|
'<tr><td>foo</td></tr>'
|
||||||
|
'</table></td>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
markup,
|
||||||
|
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||||
|
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||||
|
'</td></tr></tbody></table>')
|
||||||
|
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||||
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_xml_declaration_followed_by_doctype(self):
|
||||||
|
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>foo</p>
|
||||||
|
</body>
|
||||||
|
</html>'''
|
||||||
|
soup = self.soup(markup)
|
||||||
|
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||||
|
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||||
|
|
||||||
|
def test_reparented_markup(self):
|
||||||
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||||
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
|
|
||||||
|
def test_reparented_markup_ends_with_whitespace(self):
|
||||||
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||||
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
|
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||||
|
"""Verify that we keep the two whitespace nodes in this
|
||||||
|
document distinct when reparenting the adjacent <tbody> tags.
|
||||||
|
"""
|
||||||
|
markup = '<table> <tbody><tbody><ims></tbody> </table>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
space1, space2 = soup.find_all(string=' ')
|
||||||
|
tbody1, tbody2 = soup.find_all('tbody')
|
||||||
|
assert space1.next_element is tbody1
|
||||||
|
assert tbody2.next_element is space2
|
||||||
|
|
||||||
|
def test_reparented_markup_containing_children(self):
|
||||||
|
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
noscript = soup.noscript
|
||||||
|
self.assertEqual("target", noscript.next_element)
|
||||||
|
target = soup.find(string='target')
|
||||||
|
|
||||||
|
# The 'aftermath' string was duplicated; we want the second one.
|
||||||
|
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||||
|
|
||||||
|
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||||
|
# but the 'target' string within is still connected to the
|
||||||
|
# (second) 'aftermath' string.
|
||||||
|
self.assertEqual(final_aftermath, target.next_element)
|
||||||
|
self.assertEqual(target, final_aftermath.previous_element)
|
||||||
|
|
||||||
|
def test_processing_instruction(self):
|
||||||
|
"""Processing instructions become comments."""
|
||||||
|
markup = b"""<?PITarget PIContent?>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
assert str(soup).startswith("<!--?PITarget PIContent?-->")
|
||||||
|
|
||||||
|
def test_cloned_multivalue_node(self):
|
||||||
|
markup = b"""<a class="my_class"><p></a>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
a1, a2 = soup.find_all('a')
|
||||||
|
self.assertEqual(a1, a2)
|
||||||
|
assert a1 is not a2
|
||||||
|
|
||||||
|
def test_foster_parenting(self):
|
||||||
|
markup = b"""<table><td></tbody>A"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||||
|
|
||||||
|
def test_extraction(self):
|
||||||
|
"""
|
||||||
|
Test that extraction does not destroy the tree.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html><head></head>
|
||||||
|
<style>
|
||||||
|
</style><script></script><body><p>hello</p></body></html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
[s.extract() for s in soup('script')]
|
||||||
|
[s.extract() for s in soup('style')]
|
||||||
|
|
||||||
|
self.assertEqual(len(soup.find_all("p")), 1)
|
||||||
|
|
||||||
|
def test_empty_comment(self):
|
||||||
|
"""
|
||||||
|
Test that empty comment does not break structure.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<form>
|
||||||
|
<!----><input type="text">
|
||||||
|
</form>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
inputs = []
|
||||||
|
for form in soup.find_all('form'):
|
||||||
|
inputs.extend(form.find_all('input'))
|
||||||
|
self.assertEqual(len(inputs), 1)
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The html.parser TreeBuilder keeps track of line number and
|
||||||
|
# position of each element.
|
||||||
|
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, soup.p.sourceline)
|
||||||
|
self.assertEqual(5, soup.p.sourcepos)
|
||||||
|
self.assertEqual("sourceline", soup.p.find('sourceline').name)
|
||||||
|
|
||||||
|
# You can deactivate this behavior.
|
||||||
|
soup = self.soup(markup, store_line_numbers=False)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
61
lib/bs4/tests/test_htmlparser.py
Normal file
61
lib/bs4/tests/test_htmlparser.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
"""Tests to ensure that the html.parser tree builder generates good
|
||||||
|
trees."""
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
|
import pickle
|
||||||
|
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||||
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
|
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||||
|
|
||||||
|
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
def test_namespaced_system_doctype(self):
|
||||||
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_namespaced_public_doctype(self):
|
||||||
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_builder_is_pickled(self):
|
||||||
|
"""Unlike most tree builders, HTMLParserTreeBuilder and will
|
||||||
|
be restored after pickling.
|
||||||
|
"""
|
||||||
|
tree = self.soup("<a><b>foo</a>")
|
||||||
|
dumped = pickle.dumps(tree, 2)
|
||||||
|
loaded = pickle.loads(dumped)
|
||||||
|
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||||
|
|
||||||
|
def test_redundant_empty_element_closing_tags(self):
|
||||||
|
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('</br></br></br>', "")
|
||||||
|
|
||||||
|
def test_empty_element(self):
|
||||||
|
# This verifies that any buffered data present when the parser
|
||||||
|
# finishes working is handled.
|
||||||
|
self.assertSoupEquals("foo &# bar", "foo &# bar")
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The html.parser TreeBuilder keeps track of line number and
|
||||||
|
# position of each element.
|
||||||
|
markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, soup.p.sourceline)
|
||||||
|
self.assertEqual(3, soup.p.sourcepos)
|
||||||
|
self.assertEqual("sourceline", soup.p.find('sourceline').name)
|
||||||
|
|
||||||
|
# You can deactivate this behavior.
|
||||||
|
soup = self.soup(markup, store_line_numbers=False)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLParserSubclass(SoupTest):
|
||||||
|
def test_error(self):
|
||||||
|
"""Verify that our HTMLParser subclass implements error() in a way
|
||||||
|
that doesn't cause a crash.
|
||||||
|
"""
|
||||||
|
parser = BeautifulSoupHTMLParser()
|
||||||
|
parser.error("don't crash")
|
115
lib/bs4/tests/test_lxml.py
Normal file
115
lib/bs4/tests/test_lxml.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
import lxml.etree
|
||||||
|
LXML_PRESENT = True
|
||||||
|
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||||
|
except ImportError as e:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
LXML_VERSION = (0,)
|
||||||
|
|
||||||
|
if LXML_PRESENT:
|
||||||
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
BeautifulStoneSoup,
|
||||||
|
)
|
||||||
|
from bs4.element import Comment, Doctype, SoupStrainer
|
||||||
|
from bs4.testing import skipIf
|
||||||
|
from bs4.tests import test_htmlparser
|
||||||
|
from bs4.testing import (
|
||||||
|
HTMLTreeBuilderSmokeTest,
|
||||||
|
XMLTreeBuilderSmokeTest,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT,
|
||||||
|
"lxml seems not to be present, not testing its tree builder.")
|
||||||
|
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return LXMLTreeBuilder
|
||||||
|
|
||||||
|
def test_out_of_range_entity(self):
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# We can't implement this case correctly because by the time we
|
||||||
|
# hear about markup like "“", it's been (incorrectly) converted into
|
||||||
|
# a string like u'\x93'
|
||||||
|
pass
|
||||||
|
|
||||||
|
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||||
|
# test if an old version of lxml is installed.
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||||
|
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||||
|
def test_empty_doctype(self):
|
||||||
|
soup = self.soup("<!DOCTYPE>")
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual("", doctype.strip())
|
||||||
|
|
||||||
|
def test_beautifulstonesoup_is_xml_parser(self):
|
||||||
|
# Make sure that the deprecated BSS class uses an xml builder
|
||||||
|
# if one is installed.
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = BeautifulStoneSoup("<b />")
|
||||||
|
self.assertEqual("<b/>", str(soup.b))
|
||||||
|
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||||
|
|
||||||
|
def test_tracking_line_numbers(self):
|
||||||
|
# The lxml TreeBuilder cannot keep track of line numbers from
|
||||||
|
# the original markup. Even if you ask for line numbers, we
|
||||||
|
# don't have 'em.
|
||||||
|
#
|
||||||
|
# This means that if you have a tag like <sourceline> or
|
||||||
|
# <sourcepos>, attribute access will find it rather than
|
||||||
|
# giving you a numeric answer.
|
||||||
|
soup = self.soup(
|
||||||
|
"\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
|
||||||
|
store_line_numbers=True
|
||||||
|
)
|
||||||
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT,
|
||||||
|
"lxml seems not to be present, not testing its XML tree builder.")
|
||||||
|
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||||
|
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_builder(self):
|
||||||
|
return LXMLTreeBuilderForXML
|
||||||
|
|
||||||
|
def test_namespace_indexing(self):
|
||||||
|
# We should not track un-prefixed namespaces as we can only hold one
|
||||||
|
# and it will be recognized as the default namespace by soupsieve,
|
||||||
|
# which may be confusing in some situations. When no namespace is provided
|
||||||
|
# for a selector, the default namespace (if defined) is assumed.
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
'<?xml version="1.1"?>\n'
|
||||||
|
'<root>'
|
||||||
|
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
|
||||||
|
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
|
||||||
|
'</root>'
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
soup._namespaces,
|
||||||
|
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
|
||||||
|
)
|
682
lib/bs4/tests/test_soup.py
Normal file
682
lib/bs4/tests/test_soup.py
Normal file
|
@ -0,0 +1,682 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Tests of Beautiful Soup as a whole."""
|
||||||
|
|
||||||
|
from pdb import set_trace
|
||||||
|
import logging
|
||||||
|
import unittest
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
BeautifulStoneSoup,
|
||||||
|
)
|
||||||
|
from bs4.builder import (
|
||||||
|
TreeBuilder,
|
||||||
|
ParserRejectedMarkup,
|
||||||
|
)
|
||||||
|
from bs4.element import (
|
||||||
|
CharsetMetaAttributeValue,
|
||||||
|
Comment,
|
||||||
|
ContentMetaAttributeValue,
|
||||||
|
SoupStrainer,
|
||||||
|
NamespacedAttribute,
|
||||||
|
Tag,
|
||||||
|
NavigableString,
|
||||||
|
)
|
||||||
|
|
||||||
|
import bs4.dammit
|
||||||
|
from bs4.dammit import (
|
||||||
|
EntitySubstitution,
|
||||||
|
UnicodeDammit,
|
||||||
|
EncodingDetector,
|
||||||
|
)
|
||||||
|
from bs4.testing import (
|
||||||
|
default_builder,
|
||||||
|
SoupTest,
|
||||||
|
skipIf,
|
||||||
|
)
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
try:
|
||||||
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
|
LXML_PRESENT = True
|
||||||
|
except ImportError as e:
|
||||||
|
LXML_PRESENT = False
|
||||||
|
|
||||||
|
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||||
|
|
||||||
|
class TestConstructor(SoupTest):
|
||||||
|
|
||||||
|
def test_short_unicode_input(self):
|
||||||
|
data = "<h1>éé</h1>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
self.assertEqual("éé", soup.h1.string)
|
||||||
|
|
||||||
|
def test_embedded_null(self):
|
||||||
|
data = "<h1>foo\0bar</h1>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
self.assertEqual("foo\0bar", soup.h1.string)
|
||||||
|
|
||||||
|
def test_exclude_encodings(self):
|
||||||
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||||
|
self.assertEqual("windows-1252", soup.original_encoding)
|
||||||
|
|
||||||
|
def test_custom_builder_class(self):
|
||||||
|
# Verify that you can pass in a custom Builder class and
|
||||||
|
# it'll be instantiated with the appropriate keyword arguments.
|
||||||
|
class Mock(object):
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self.called_with = kwargs
|
||||||
|
self.is_xml = True
|
||||||
|
self.store_line_numbers = False
|
||||||
|
self.cdata_list_attributes = []
|
||||||
|
self.preserve_whitespace_tags = []
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
pass
|
||||||
|
def feed(self, markup):
|
||||||
|
self.fed = markup
|
||||||
|
def reset(self):
|
||||||
|
pass
|
||||||
|
def ignore(self, ignore):
|
||||||
|
pass
|
||||||
|
set_up_substitutions = can_be_empty_element = ignore
|
||||||
|
def prepare_markup(self, *args, **kwargs):
|
||||||
|
yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
|
||||||
|
|
||||||
|
kwargs = dict(
|
||||||
|
var="value",
|
||||||
|
# This is a deprecated BS3-era keyword argument, which
|
||||||
|
# will be stripped out.
|
||||||
|
convertEntities=True,
|
||||||
|
)
|
||||||
|
with warnings.catch_warnings(record=True):
|
||||||
|
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
||||||
|
assert isinstance(soup.builder, Mock)
|
||||||
|
self.assertEqual(dict(var="value"), soup.builder.called_with)
|
||||||
|
self.assertEqual("prepared markup", soup.builder.fed)
|
||||||
|
|
||||||
|
# You can also instantiate the TreeBuilder yourself. In this
|
||||||
|
# case, that specific object is used and any keyword arguments
|
||||||
|
# to the BeautifulSoup constructor are ignored.
|
||||||
|
builder = Mock(**kwargs)
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = BeautifulSoup(
|
||||||
|
'', builder=builder, ignored_value=True,
|
||||||
|
)
|
||||||
|
msg = str(w[0].message)
|
||||||
|
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
||||||
|
self.assertEqual(builder, soup.builder)
|
||||||
|
self.assertEqual(kwargs, builder.called_with)
|
||||||
|
|
||||||
|
def test_parser_markup_rejection(self):
|
||||||
|
# If markup is completely rejected by the parser, an
|
||||||
|
# explanatory ParserRejectedMarkup exception is raised.
|
||||||
|
class Mock(TreeBuilder):
|
||||||
|
def feed(self, *args, **kwargs):
|
||||||
|
raise ParserRejectedMarkup("Nope.")
|
||||||
|
|
||||||
|
def prepare_markup(self, *args, **kwargs):
|
||||||
|
# We're going to try two different ways of preparing this markup,
|
||||||
|
# but feed() will reject both of them.
|
||||||
|
yield markup, None, None, False
|
||||||
|
yield markup, None, None, False
|
||||||
|
|
||||||
|
import re
|
||||||
|
self.assertRaisesRegex(
|
||||||
|
ParserRejectedMarkup,
|
||||||
|
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
|
||||||
|
BeautifulSoup, '', builder=Mock,
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_cdata_list_attributes(self):
|
||||||
|
# Most attribute values are represented as scalars, but the
|
||||||
|
# HTML standard says that some attributes, like 'class' have
|
||||||
|
# space-separated lists as values.
|
||||||
|
markup = '<a id=" an id " class=" a class "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
|
||||||
|
# Note that the spaces are stripped for 'class' but not for 'id'.
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(" an id ", a['id'])
|
||||||
|
self.assertEqual(["a", "class"], a['class'])
|
||||||
|
|
||||||
|
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
||||||
|
# you customize or disable this. As always, you can customize the TreeBuilder
|
||||||
|
# by passing in a keyword argument to the BeautifulSoup constructor.
|
||||||
|
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
||||||
|
self.assertEqual(" a class ", soup.a['class'])
|
||||||
|
|
||||||
|
# Here are two ways of saying that `id` is a multi-valued
|
||||||
|
# attribute in this context, but 'class' is not.
|
||||||
|
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
# This will create a warning about not explicitly
|
||||||
|
# specifying a parser, but we'll ignore it.
|
||||||
|
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(["an", "id"], a['id'])
|
||||||
|
self.assertEqual(" a class ", a['class'])
|
||||||
|
|
||||||
|
def test_replacement_classes(self):
|
||||||
|
# Test the ability to pass in replacements for element classes
|
||||||
|
# which will be used when building the tree.
|
||||||
|
class TagPlus(Tag):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class StringPlus(NavigableString):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class CommentPlus(Comment):
|
||||||
|
pass
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
"<a><b>foo</b>bar</a><!--whee-->",
|
||||||
|
element_classes = {
|
||||||
|
Tag: TagPlus,
|
||||||
|
NavigableString: StringPlus,
|
||||||
|
Comment: CommentPlus,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# The tree was built with TagPlus, StringPlus, and CommentPlus objects,
|
||||||
|
# rather than Tag, String, and Comment objects.
|
||||||
|
assert all(
|
||||||
|
isinstance(x, (TagPlus, StringPlus, CommentPlus))
|
||||||
|
for x in soup.recursiveChildGenerator()
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
|
def _no_parser_specified(self, s, is_there=True):
|
||||||
|
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
|
||||||
|
self.assertTrue(v)
|
||||||
|
|
||||||
|
def test_warning_if_no_parser_specified(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self._assert_no_parser_specified(msg)
|
||||||
|
|
||||||
|
def test_warning_if_parser_specified_too_vague(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", "html")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self._assert_no_parser_specified(msg)
|
||||||
|
|
||||||
|
def test_no_warning_if_explicit_parser_specified(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", "html.parser")
|
||||||
|
self.assertEqual([], w)
|
||||||
|
|
||||||
|
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("parseOnlyThese" in msg)
|
||||||
|
self.assertTrue("parse_only" in msg)
|
||||||
|
self.assertEqual(b"<b></b>", soup.encode())
|
||||||
|
|
||||||
|
def test_fromEncoding_renamed_to_from_encoding(self):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
utf8 = b"\xc3\xa9"
|
||||||
|
soup = self.soup(utf8, fromEncoding="utf8")
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("fromEncoding" in msg)
|
||||||
|
self.assertTrue("from_encoding" in msg)
|
||||||
|
self.assertEqual("utf8", soup.original_encoding)
|
||||||
|
|
||||||
|
def test_unrecognized_keyword_argument(self):
|
||||||
|
self.assertRaises(
|
||||||
|
TypeError, self.soup, "<a>", no_such_argument=True)
|
||||||
|
|
||||||
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
|
def test_disk_file_warning(self):
|
||||||
|
filehandle = tempfile.NamedTemporaryFile()
|
||||||
|
filename = filehandle.name
|
||||||
|
try:
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
msg = str(w[0].message)
|
||||||
|
self.assertTrue("looks like a filename" in msg)
|
||||||
|
finally:
|
||||||
|
filehandle.close()
|
||||||
|
|
||||||
|
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
self.assertEqual(0, len(w))
|
||||||
|
|
||||||
|
def test_url_warning_with_bytes_url(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup(b"http://www.crummybytes.com/")
|
||||||
|
# Be aware this isn't the only warning that can be raised during
|
||||||
|
# execution..
|
||||||
|
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_unicode_url(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
# note - this url must differ from the bytes one otherwise
|
||||||
|
# python's warnings system swallows the second warning
|
||||||
|
soup = self.soup("http://www.crummyunicode.com/")
|
||||||
|
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_bytes_and_space(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||||
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
def test_url_warning_with_unicode_and_space(self):
|
||||||
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
|
soup = self.soup("http://www.crummyuncode.com/ is great")
|
||||||
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
|
for w in warning_list))
|
||||||
|
|
||||||
|
|
||||||
|
class TestSelectiveParsing(SoupTest):
|
||||||
|
|
||||||
|
def test_parse_with_soupstrainer(self):
|
||||||
|
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
||||||
|
strainer = SoupStrainer("b")
|
||||||
|
soup = self.soup(markup, parse_only=strainer)
|
||||||
|
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
||||||
|
|
||||||
|
|
||||||
|
class TestEntitySubstitution(unittest.TestCase):
|
||||||
|
"""Standalone tests of the EntitySubstitution class."""
|
||||||
|
def setUp(self):
|
||||||
|
self.sub = EntitySubstitution
|
||||||
|
|
||||||
|
def test_simple_html_substitution(self):
|
||||||
|
# Unicode characters corresponding to named HTML entites
|
||||||
|
# are substituted, and no others.
|
||||||
|
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
|
||||||
|
self.assertEqual(self.sub.substitute_html(s),
|
||||||
|
"foo∀\N{SNOWMAN}õbar")
|
||||||
|
|
||||||
|
def test_smart_quote_substitution(self):
|
||||||
|
# MS smart quotes are a common source of frustration, so we
|
||||||
|
# give them a special test.
|
||||||
|
quotes = b"\x91\x92foo\x93\x94"
|
||||||
|
dammit = UnicodeDammit(quotes)
|
||||||
|
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
||||||
|
"‘’foo“”")
|
||||||
|
|
||||||
|
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
||||||
|
s = 'Welcome to "my bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
||||||
|
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
||||||
|
'"Welcome"')
|
||||||
|
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
||||||
|
'"Bob\'s Bar"')
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
||||||
|
s = 'Welcome to "my bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(s, True),
|
||||||
|
"'Welcome to \"my bar\"'")
|
||||||
|
|
||||||
|
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
||||||
|
s = 'Welcome to "Bob\'s Bar"'
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml(s, True),
|
||||||
|
'"Welcome to "Bob\'s Bar""')
|
||||||
|
|
||||||
|
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
||||||
|
quoted = 'Welcome to "Bob\'s Bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
||||||
|
|
||||||
|
def test_xml_quoting_handles_angle_brackets(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml("foo<bar>"),
|
||||||
|
"foo<bar>")
|
||||||
|
|
||||||
|
def test_xml_quoting_handles_ampersands(self):
|
||||||
|
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||||
|
|
||||||
|
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml("ÁT&T"),
|
||||||
|
"&Aacute;T&T")
|
||||||
|
|
||||||
|
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||||
|
"ÁT&T")
|
||||||
|
|
||||||
|
def test_quotes_not_html_substituted(self):
|
||||||
|
"""There's no need to do this except inside attribute values."""
|
||||||
|
text = 'Bob\'s "bar"'
|
||||||
|
self.assertEqual(self.sub.substitute_html(text), text)
|
||||||
|
|
||||||
|
|
||||||
|
class TestEncodingConversion(SoupTest):
|
||||||
|
# Test Beautiful Soup's ability to decode and encode from various
|
||||||
|
# encodings.
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
super(TestEncodingConversion, self).setUp()
|
||||||
|
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||||
|
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||||
|
# Just so you know what it looks like.
|
||||||
|
self.assertEqual(
|
||||||
|
self.utf8_data,
|
||||||
|
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
||||||
|
|
||||||
|
def test_ascii_in_unicode_out(self):
|
||||||
|
# ASCII input is converted to Unicode. The original_encoding
|
||||||
|
# attribute is set to 'utf-8', a superset of ASCII.
|
||||||
|
chardet = bs4.dammit.chardet_dammit
|
||||||
|
logging.disable(logging.WARNING)
|
||||||
|
try:
|
||||||
|
def noop(str):
|
||||||
|
return None
|
||||||
|
# Disable chardet, which will realize that the ASCII is ASCII.
|
||||||
|
bs4.dammit.chardet_dammit = noop
|
||||||
|
ascii = b"<foo>a</foo>"
|
||||||
|
soup_from_ascii = self.soup(ascii)
|
||||||
|
unicode_output = soup_from_ascii.decode()
|
||||||
|
self.assertTrue(isinstance(unicode_output, str))
|
||||||
|
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
||||||
|
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
||||||
|
finally:
|
||||||
|
logging.disable(logging.NOTSET)
|
||||||
|
bs4.dammit.chardet_dammit = chardet
|
||||||
|
|
||||||
|
def test_unicode_in_unicode_out(self):
|
||||||
|
# Unicode input is left alone. The original_encoding attribute
|
||||||
|
# is not set.
|
||||||
|
soup_from_unicode = self.soup(self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
|
||||||
|
self.assertEqual(soup_from_unicode.original_encoding, None)
|
||||||
|
|
||||||
|
def test_utf8_in_unicode_out(self):
|
||||||
|
# UTF-8 input is converted to Unicode. The original_encoding
|
||||||
|
# attribute is set.
|
||||||
|
soup_from_utf8 = self.soup(self.utf8_data)
|
||||||
|
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
|
||||||
|
|
||||||
|
def test_utf8_out(self):
|
||||||
|
# The internal data structures can be encoded as UTF-8.
|
||||||
|
soup_from_unicode = self.soup(self.unicode_data)
|
||||||
|
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
PYTHON_3_PRE_3_2,
|
||||||
|
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
||||||
|
def test_attribute_name_containing_unicode_characters(self):
|
||||||
|
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||||
|
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||||
|
|
||||||
|
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # Unicode input is returned as-is.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become their Unicode
        # equivalents.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # NOTE: the expected strings in this test and the next had been
        # corrupted by entity rendering (both showed literal curly quotes);
        # restored to the numeric and named entity forms respectively.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # UTF-8 input is detected and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        # An explicitly suggested encoding is honored.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Bytes that happen to overlap the Windows-1252 smart-quote range
        # are not touched when the document is valid UTF-8.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # A suggested encoding that isn't a real codec name is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # <meta charset> is recognized with every quoting style.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore global state even if the assertions above fail.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
|
class TestNamedspacedAttribute(SoupTest):
    """Tests of the NamespacedAttribute string subclass."""

    def test_name_may_be_none_or_missing(self):
        """With no local name, the attribute is just its prefix."""
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual("xmlns", attr)

        attr = NamespacedAttribute("xmlns")
        self.assertEqual("xmlns", attr)

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        """Prefix and name compare equal to the "prefix:name" string."""
        attr = NamespacedAttribute("a", "b")
        self.assertEqual(attr, "a:b")

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        """Equality depends only on prefix and name, not on namespace."""
        original = NamespacedAttribute("a", "b", "c")
        twin = NamespacedAttribute("a", "b", "c")
        self.assertEqual(original, twin)

        # The actual namespace is not considered.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(original, no_namespace)

        # But name and prefix are important.
        different_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(original, different_name)

        different_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(original, different_prefix)
|
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that substitute a new charset
    when a document is re-encoded."""

    def test_charset_meta_attribute_value(self):
        # Renamed from test_content_meta_attribute_value: both methods
        # previously shared that name, so the second definition shadowed
        # this one and it never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # encode() here returns the *new* charset name, not encoded bytes.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        # Only the charset portion of the content-type string is rewritten.
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
2254
lib/bs4/tests/test_tree.py
Normal file
2254
lib/bs4/tests/test_tree.py
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue