Update beautifulsoup4-4.10.0

This commit is contained in:
JonnyWong16 2021-10-14 20:46:06 -07:00
parent b581460b51
commit ab8fa4d5b3
No known key found for this signature in database
GPG key ID: B1F1F9807184697A
16 changed files with 4599 additions and 743 deletions

View file

@ -1,6 +1,5 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
Beautiful Soup works with Python 2.7 and up. It works better if lxml
Beautiful Soup works with Python 3.5 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.8.1"
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
__version__ = "4.10.0"
__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
from collections import Counter
import os
import re
import sys
import traceback
import warnings
# The very first thing we do is give a useful error if someone is
# running this code under Python 2.
if sys.version_info.major < 3:
raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
@ -42,28 +46,49 @@ from .element import (
NavigableString,
PageElement,
ProcessingInstruction,
PYTHON_SPECIFIC_ENCODINGS,
ResultSet,
Script,
Stylesheet,
SoupStrainer,
Tag,
TemplateString,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
# Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
"""The warning issued when BeautifulSoup has to guess what parser to
use -- probably because no parser was specified in the constructor.
"""
class MarkupResemblesLocatorWarning(UserWarning):
"""The warning issued when BeautifulSoup is given 'markup' that
actually looks like a resource locator -- a URL or a path to a file
on disk.
"""
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
"""A data structure representing a parsed HTML or XML document.
These methods will be called by the parser:
reset()
feed(markup)
Most of the methods you'll call on a BeautifulSoup object are inherited from
PageElement or Tag.
Internally, this class defines the basic interface called by the
tree builders when converting an HTML/XML document into a data
structure. The interface abstracts away the differences between
parsers. To write a new tree builder, you'll need to understand
these methods as a whole.
These methods will be called by the BeautifulSoup constructor:
* reset()
* feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass) # Ends the current data node
* handle_starttag(name, attrs) # See note about return value
* handle_endtag(name)
* handle_data(data) # Appends to the current data node
* endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
# Since BeautifulSoup subclasses Tag, it's possible to treat it as
# a Tag with a .name. This name makes it clear the BeautifulSoup
# object isn't a real markup tag.
ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# A string containing all ASCII whitespace characters, used in
# endData() to detect data chunks that seem 'empty'.
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
markup to be parsed.
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param features: Desirable features of the parser to be
used. This may be the name of a specific parser ("lxml",
"lxml-xml", "html.parser", or "html5lib") or it may be the
type of markup to be used ("html", "html5", "xml"). It's
recommended that you name a specific parser, so that
Beautiful Soup gives you the same results across platforms
and virtual environments.
:param builder: A TreeBuilder subclass to instantiate (or
instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param element_classes: A dictionary mapping BeautifulSoup
classes like Tag and NavigableString to other classes you'd
like to be instantiated instead as the parse tree is
built. This is useful for using subclasses to modify the
default behavior of Tag or NavigableString.
classes like Tag and NavigableString, to other classes you'd
like to be instantiated instead as the parse tree is
built. This is useful for subclassing Tag or NavigableString
to modify default behavior.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4; they will result in a warning and then be ignored.
Apart from this, any keyword arguments passed into the BeautifulSoup
constructor are propagated to the TreeBuilder constructor. This
makes it possible to configure a TreeBuilder beyond saying
which one to use.
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4; they will result in a warning and then be
ignored.
Apart from this, any keyword arguments passed into the
BeautifulSoup constructor are propagated to the TreeBuilder
constructor. This makes it possible to configure a
TreeBuilder by passing in arguments, not just by saying which
one to use.
"""
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
):
) and markup:
# The user did not tell us which TreeBuilder to use,
# and we had to guess. Issue a warning.
if builder.is_xml:
markup_type = "XML"
else:
@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
parser=builder.NAME,
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
warnings.warn(
self.NO_PARSER_SPECIFIED_WARNING % values,
GuessedAtParserWarning, stacklevel=2
)
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
else:
possible_filename = markup
is_file = False
is_directory = False
try:
is_file = os.path.exists(possible_filename)
if is_file:
is_directory = os.path.isdir(possible_filename)
except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
if isinstance(markup, str):
markup = markup.encode("utf8")
if is_directory:
warnings.warn(
'"%s" looks like a directory name, not markup. You may'
' want to open a file found in this directory and pass'
' the filehandle into Beautiful Soup.' % (
self._decode_markup(markup)
),
MarkupResemblesLocatorWarning
)
elif is_file:
warnings.warn(
'"%s" looks like a filename, not markup. You should'
' probably open this file and pass the filehandle into'
' Beautiful Soup.' % markup)
' Beautiful Soup.' % self._decode_markup(markup),
MarkupResemblesLocatorWarning
)
self._check_markup_is_url(markup)
rejections = []
@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
self.builder.soup = None
def __copy__(self):
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
d['builder'] = None
return d
@staticmethod
def _check_markup_is_url(markup):
"""
Check if markup looks like it's actually a url and raise a warning
if so. Markup can be unicode or str (py2) / bytes (py3).
@classmethod
def _decode_markup(cls, markup):
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
TODO: warnings.warn had this problem back in 2010 but it might not
anymore.
"""
if isinstance(markup, bytes):
decoded = markup.decode('utf-8', 'replace')
else:
decoded = markup
return decoded
@classmethod
def _check_markup_is_url(cls, markup):
"""Error-handling method to raise a warning if incoming markup looks
like a URL.
:param markup: A string.
"""
if isinstance(markup, bytes):
space = b' '
@ -364,18 +428,20 @@ class BeautifulSoup(Tag):
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
if isinstance(markup, bytes):
decoded_markup = markup.decode('utf-8', 'replace')
else:
decoded_markup = markup
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an'
' HTTP client. You should probably use an HTTP client like'
' requests to get the document behind the URL, and feed'
' that document to Beautiful Soup.' % decoded_markup
' that document to Beautiful Soup.' % cls._decode_markup(
markup
),
MarkupResemblesLocatorWarning
)
def _feed(self):
"""Internal method that parses previously set markup, creating a large
number of Tag and NavigableString objects.
"""
# Convert the document to Unicode.
self.builder.reset()
@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
self.popTag()
def reset(self):
"""Reset this object to a state as though it had never parsed any
markup.
"""
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.current_data = []
self.currentTag = None
self.tagStack = []
self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup."""
"""Create a new Tag associated with this BeautifulSoup object.
:param name: The name of the new Tag.
:param namespace: The URI of the new Tag's XML namespace, if any.
:param prefix: The prefix for the new Tag's XML namespace, if any.
:param attrs: A dictionary of this Tag's attribute values; can
be used instead of `kwattrs` for attributes like 'class'
that are reserved words in Python.
:param sourceline: The line number where this tag was
(purportedly) found in its source document.
:param sourcepos: The character position within `sourceline` where this
tag was (purportedly) found.
:param kwattrs: Keyword arguments for the new Tag's attribute values.
"""
kwattrs.update(attrs)
return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
sourceline=sourceline, sourcepos=sourcepos
)
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup."""
subclass = subclass or self.element_classes.get(
NavigableString, NavigableString
def string_container(self, base_class=None):
container = base_class or NavigableString
# There may be a general override of NavigableString.
container = self.element_classes.get(
container, container
)
return subclass(s)
def insert_before(self, successor):
# On top of that, we may be inside a tag that needs a special
# container class.
if self.string_container_stack and container is NavigableString:
container = self.builder.string_containers.get(
self.string_container_stack[-1].name, container
)
return container
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this BeautifulSoup
object.
"""
container = self.string_container(subclass)
return container(s)
def insert_before(self, *args):
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
it because there is nothing before or after it in the parse tree.
"""
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
def insert_after(self, *args):
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
it because there is nothing before or after it in the parse tree.
"""
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
"""Internal method called by _popToTag when a tag is closed."""
tag = self.tagStack.pop()
if tag.name in self.open_tag_counter:
self.open_tag_counter[tag.name] -= 1
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
#print "Pop", tag.name
if self.string_container_stack and tag == self.string_container_stack[-1]:
self.string_container_stack.pop()
#print("Pop", tag.name)
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
"""Internal method called by handle_starttag when a tag is opened."""
#print("Push", tag.name)
if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
if tag.name != self.ROOT_TAG_NAME:
self.open_tag_counter[tag.name] += 1
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
if tag.name in self.builder.string_containers:
self.string_container_stack.append(tag)
def endData(self, containerClass=None):
# Default container is NavigableString.
containerClass = containerClass or NavigableString
# The user may want us to instantiate some alias for the
# container class.
containerClass = self.element_classes.get(
containerClass, containerClass
)
"""Method called by the TreeBuilder when the end of a data segment
occurs.
"""
if self.current_data:
current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
not self.parse_only.search(current_data)):
return
containerClass = self.string_container(containerClass)
o = containerClass(current_data)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
"""Method called by the TreeBuilder to integrate an object into the parse tree."""
if parent is None:
parent = self.currentTag
if most_recent_element is not None:
@ -545,10 +656,19 @@ class BeautifulSoup(Tag):
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instqance of
the given tag."""
#print "Popping to %s" % name
instance of the given tag.
If there are no open tags with the given name, nothing will be
popped.
:param name: Pop up to the most recent tag with this name.
:param nsprefix: The namespace prefix that goes with `name`.
:param inclusivePop: It this is false, pops the tag stack up
to but *not* including the most recent instqance of the
given tag.
"""
#print("Popping to %s" % name)
if name == self.ROOT_TAG_NAME:
# The BeautifulSoup object itself can never be popped.
return
@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
stack_size = len(self.tagStack)
for i in range(stack_size - 1, 0, -1):
if not self.open_tag_counter.get(name):
break
t = self.tagStack[i]
if (name == t.name and nsprefix == t.prefix):
if inclusivePop:
@ -568,15 +690,22 @@ class BeautifulSoup(Tag):
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
sourcepos=None):
"""Push a start tag on to the stack.
"""Called by the tree builder when a new tag is encountered.
If this method returns None, the tag was rejected by the
:param name: Name of the tag.
:param nsprefix: Namespace prefix for the tag.
:param attrs: A dictionary of attribute values.
:param sourceline: The line number where this tag was found in its
source document.
:param sourcepos: The character position within `sourceline` where this
tag was found.
If this method returns None, the tag was rejected by an active
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
# print("Start tag %s: %s" % (name, attrs))
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
"""Called by the tree builder when an ending tag is encountered.
:param name: Name of the tag.
:param nsprefix: Namespace prefix for the tag.
"""
#print("End tag: " + name)
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
"""Called by the tree builder when a chunk of textual data is encountered."""
self.current_data.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
"""Returns a string or Unicode representation of the parse tree
as an HTML or XML document.
:param pretty_print: If this is True, indentation will be used to
make the document more readable.
:param eventual_encoding: The encoding of the final document.
If this is None, the document will be a Unicode string.
"""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
# This is a special Python encoding; it can't actually
# go into an XML document because it means nothing
# outside of Python.
eventual_encoding = None
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):
class StopParsing(Exception):
"""Exception raised by a TreeBuilder if it's unable to continue parsing."""
pass
class FeatureNotFound(ValueError):
"""Exception raised by the BeautifulSoup constructor if no parser with the
requested features is found.
"""
pass
#By default, act as an HTML pretty-printer.
#If this file is run as a script, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print(soup.prettify())
print((soup.prettify()))

View file

@ -7,8 +7,11 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
Stylesheet,
Script,
TemplateString,
nonwhitespace_re
)
)
__all__ = [
'HTMLTreeBuilder',
@ -27,18 +30,33 @@ HTML_5 = 'html5'
class TreeBuilderRegistry(object):
"""A way of looking up TreeBuilder subclasses by their name or by desired
features.
"""
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
"""Register a treebuilder based on its advertised features.
:param treebuilder_class: A subclass of Treebuilder. its .features
attribute should list its features.
"""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
"""Look up a TreeBuilder subclass with the desired features.
:param features: A list of features to look for. If none are
provided, the most recently registered TreeBuilder subclass
will be used.
:return: A TreeBuilder subclass, or None if there's no
registered subclass with all the requested features.
"""
if len(self.builders) == 0:
# There are no builders at all.
return None
@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
"""Turn a textual document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
@ -96,7 +114,12 @@ class TreeBuilder(object):
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
# Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
# The textual contents of tags with these names should be
# instantiated with some class other than NavigableString.
DEFAULT_STRING_CONTAINERS = {}
USE_DEFAULT = object()
@ -105,30 +128,39 @@ class TreeBuilder(object):
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
store_line_numbers=USE_DEFAULT):
store_line_numbers=USE_DEFAULT,
string_containers=USE_DEFAULT,
):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this do a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this to a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
the way <pre> tags are treated in HTML. Tags in this list
will have
the way <pre> tags are treated in HTML. Tags in this list
are immune from pretty-printing; their contents will always be
output as-is.
:param string_containers: A dictionary mapping tag names to
the classes that should be instantiated to contain the textual
contents of those tags. The default is to use NavigableString
for every tag, no matter what the name. You can override the
default by changing DEFAULT_STRING_CONTAINERS.
:param store_line_numbers: If the parser keeps track of the
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
`Tag` object. You can turn this off by passing
store_line_numbers=False. If the parser you're using doesn't
keep track of this information, then setting store_line_numbers=True
will do nothing.
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
`Tag` object. You can turn this off by passing
store_line_numbers=False. If the parser you're using doesn't
keep track of this information, then setting store_line_numbers=True
will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
@ -139,15 +171,25 @@ class TreeBuilder(object):
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
self.store_line_numbers = store_line_numbers
self.store_line_numbers = store_line_numbers
if string_containers == self.USE_DEFAULT:
string_containers = self.DEFAULT_STRING_CONTAINERS
self.string_containers = string_containers
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
:param soup: A BeautifulSoup object.
"""
self.soup = soup
def reset(self):
"""Do any work necessary to reset the underlying parser
for a new document.
By default, this does nothing.
"""
pass
def can_be_empty_element(self, tag_name):
@ -159,23 +201,57 @@ class TreeBuilder(object):
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
will be presented as "<p></p>", not "<p/>" or "<p>".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
empty-element tag if and only if it has no children.
"<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
be left alone.
:param tag_name: The name of a markup tag.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
"""Run some incoming markup through some parsing process,
populating the `BeautifulSoup` object in self.soup.
This method is not implemented in TreeBuilder; it must be
implemented in subclasses.
:return: None.
"""
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding. NOTE: This argument is not used by the
calling code and can probably be removed.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
By default, the only strategy is to parse the markup
as-is. See `LXMLTreeBuilderForXML` and
`HTMLParserTreeBuilder` for implementations that take into
account the quirks of particular parsers.
"""
yield markup, None, None, False
def test_fragment_to_document(self, fragment):
@ -188,16 +264,36 @@ class TreeBuilder(object):
results against other HTML fragments.
This method should not be used outside of tests.
:param fragment: A string -- fragment of HTML.
:return: A string -- a full HTML document.
"""
return fragment
def set_up_substitutions(self, tag):
"""Set up any substitutions that will need to be performed on
a `Tag` when it's output as a string.
By default, this does nothing. See `HTMLTreeBuilder` for a
case where this is used.
:param tag: A `Tag`
:return: Whether or not a substitution was performed.
"""
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
"""When an attribute value is associated with a tag that can
have multiple values for that attribute, convert the string
value to a list of strings.
Modifies its input in place.
Basically, replaces class="foo bar" with class=["foo", "bar"]
NOTE: This method modifies its input in place.
:param tag_name: The name of a tag.
:param attrs: A dictionary containing the tag's attributes.
Any appropriate attribute values will be modified in place.
"""
if not attrs:
return attrs
@ -225,7 +321,11 @@ class TreeBuilder(object):
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
"""A Beautiful Soup treebuilder that listens for SAX events.
This is not currently used for anything, but it demonstrates
how a simple TreeBuilder would work.
"""
def feed(self, markup):
raise NotImplementedError()
@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
#print("Start %s, %r" % (name, attrs))
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
#print("End %s" % name)
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
# The HTML standard defines an unusual content model for these tags.
# We represent this by using a string class other than NavigableString
# inside these tags.
#
# I made this list by going through the HTML spec
# (https://html.spec.whatwg.org/#metadata-content) and looking for
# "metadata content" elements that can contain strings.
#
# TODO: Arguably <noscript> could go here but it seems
# qualitatively different from the other tags.
DEFAULT_STRING_CONTAINERS = {
'style': Stylesheet,
'script': Script,
'template': TemplateString,
}
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
"""Replace the declared encoding in a <meta> tag with a placeholder,
to be substituted when the tag is output to a string.
An HTML document may come in to Beautiful Soup as one
encoding, but exit in a different encoding, and the <meta> tag
needs to be changed to reflect this.
:param tag: A `Tag`
:return: Whether or not a substitution was performed.
"""
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
this_module = sys.modules[__name__]
for name in module.__all__:
obj = getattr(module, name)
@ -363,6 +488,9 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
"""An Exception to be raised when the underlying parser simply
refuses to parse the given markup.
"""
def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last result.
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:

View file

@ -39,7 +39,18 @@ except ImportError as e:
new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
"""Use html5lib to build a tree.
Note that this TreeBuilder does not support some features common
to HTML TreeBuilders. Some of these features could theoretically
be implemented, but at the very least it's quite difficult,
because html5lib moves the parse tree around as it's being built.
* This TreeBuilder doesn't use different subclasses of NavigableString
based on the name of the tag in which the string was found.
* You can't use a SoupStrainer to parse only part of a document.
"""
NAME = "html5lib"
@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
# TODO: What are **kwargs exactly? Should they be passed in
# here in addition to/instead of being passed to the BeautifulSoup
# constructor?
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser
@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
# print("MOVE", self.element.contents)
# print("FROM", self.element)
# print("TO", new_parent.element)
element = self.element
new_parent_element = new_parent.element
@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
element.contents = []
element.next_element = final_next_element
# print "DONE WITH MOVE"
# print "FROM", self.element
# print "TO", new_parent_element
# print("DONE WITH MOVE")
# print("FROM", self.element)
# print("TO", new_parent_element)
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)

View file

@ -8,7 +8,7 @@ __all__ = [
'HTMLParserTreeBuilder',
]
from future.moves.html.parser import HTMLParser
from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
@ -53,8 +53,30 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
"""A subclass of the Python standard library's HTMLParser class, which
listens for HTMLParser events and translates them into calls
to Beautiful Soup's tree construction API.
"""
# Strategies for handling duplicate attributes
IGNORE = 'ignore'
REPLACE = 'replace'
def __init__(self, *args, **kwargs):
"""Constructor.
:param on_duplicate_attribute: A strategy for what to do if a
tag includes the same attribute more than once. Accepted
values are: REPLACE (replace earlier values with later
ones, the default), IGNORE (keep the earliest value
encountered), or a callable. A callable must take three
arguments: the dictionary of attributes already processed,
the name of the duplicate attribute, and the most recent value
encountered.
"""
self.on_duplicate_attribute = kwargs.pop(
'on_duplicate_attribute', self.REPLACE
)
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element = []
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although this
requirement doesn't appear to be documented.
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() as raising an exception.
In Python 2, HTMLParser implements error() by raising an exception,
which we don't want to do.
In any event, this method is called only on very strange markup and our best strategy
is to pretend it didn't happen and keep going.
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
"""Handle an incoming empty-element tag.
This is only called when the markup looks like <tag/>.
:param name: Name of the tag.
:param attrs: Dictionary of the tag's attributes.
"""
# is_startend() tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag and we want to call
@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_endtag(name)
def handle_starttag(self, name, attrs, handle_empty_element=True):
"""Handle an opening tag, e.g. '<tag>'
:param name: Name of the tag.
:param attrs: Dictionary of the tag's attributes.
:param handle_empty_element: True if this tag is known to be
an empty-element tag (i.e. there is not expected to be any
closing tag).
"""
# XXX namespace
attr_dict = {}
for key, value in attrs:
@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
# for consistency with the other tree builders.
if value is None:
value = ''
attr_dict[key] = value
if key in attr_dict:
# A single attribute shows up multiple times in this
# tag. How to handle it depends on the
# on_duplicate_attribute setting.
on_dupe = self.on_duplicate_attribute
if on_dupe == self.IGNORE:
pass
elif on_dupe in (None, self.REPLACE):
attr_dict[key] = value
else:
on_dupe(attr_dict, key, value)
else:
attr_dict[key] = value
attrvalue = '""'
#print "START", name
#print("START", name)
sourceline, sourcepos = self.getpos()
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline,
@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element.append(name)
def handle_endtag(self, name, check_already_closed=True):
#print "END", name
"""Handle a closing tag, e.g. '</tag>'
:param name: A tag name.
:param check_already_closed: True if this tag is expected to
be the closing portion of an empty-element tag,
e.g. '<tag></tag>'.
"""
#print("END", name)
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
# We've already called handle_endtag() for it, so just
# check it off the list.
# print "ALREADY CLOSED", name
#print("ALREADY CLOSED", name)
self.already_closed_empty_element.remove(name)
else:
self.soup.handle_endtag(name)
def handle_data(self, data):
"""Handle some textual data that shows up between tags."""
self.soup.handle_data(data)
def handle_charref(self, name):
"""Handle a numeric character reference by converting it to the
corresponding Unicode character and treating it as textual
data.
:param name: Character number, possibly in hexadecimal.
"""
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_entityref(self, name):
"""Handle a named entity reference by converting it to the
corresponding Unicode character(s) and treating it as textual
data.
:param name: Name of the entity reference.
"""
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_comment(self, data):
"""Handle an HTML comment.
:param data: The text of the comment.
"""
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
"""Handle a DOCTYPE declaration.
:param data: The text of the declaration.
"""
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
"""Handle a declaration of unknown type -- probably a CDATA block.
:param data: The text of the declaration.
"""
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData(cls)
def handle_pi(self, data):
"""Handle a processing instruction.
:param data: The text of the instruction.
"""
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
"""A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
found in the Python standard library.
"""
is_xml = False
picklable = True
NAME = HTMLPARSER
@ -221,36 +303,88 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
"""Constructor.
:param parser_args: Positional arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param parser_kwargs: Keyword arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param kwargs: Keyword arguments for the superclass constructor.
"""
# Some keyword arguments will be pulled out of kwargs and placed
# into parser_kwargs.
extra_parser_kwargs = dict()
for arg in ('on_duplicate_attribute',):
if arg in kwargs:
value = kwargs.pop(arg)
extra_parser_kwargs[arg] = value
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
"""
if isinstance(markup, str):
# Parse Unicode as-is.
yield (markup, None, None, False)
return
# Ask UnicodeDammit to sniff the most likely encoding.
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
dammit = UnicodeDammit(
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=True,
exclude_encodings=exclude_encodings
)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
"""Run some incoming markup through some parsing process,
populating the `BeautifulSoup` object in self.soup.
"""
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup

View file

@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
# See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
:param soup: A `BeautifulSoup`.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document.
This might be useful later on when creating CSS selectors.
:param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces:
@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value
def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
"""Find the default parser for the given encoding.
:param encoding: A string.
:return: Either a parser object or a class, which
will be instantiated with default arguments.
"""
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
"""Instantiate an appropriate parser for the given encoding.
:param encoding: A string.
:return: A parser object such as an `etree.XMLParser`.
"""
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
parser = parser(
target=self, strip_cdata=False, recover=True, encoding=encoding
)
return parser
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
"""
:yield: A series of 4-tuples.
"""Run any preliminary steps necessary to make incoming markup
acceptable to the parser.
lxml really wants to get a bytestring and convert it to
Unicode itself. So instead of using UnicodeDammit to convert
the bytestring to Unicode using different encodings, this
implementation uses EncodingDetector to iterate over the
encodings, and tell lxml to try to parse the document as each
one in turn.
:param markup: Some markup -- hopefully a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for parsing the document.
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
"""
# Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
try_encodings = [user_specified_encoding, document_declared_encoding]
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
markup, known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings, is_html=is_html,
exclude_encodings=exclude_encodings
)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)

View file

@ -1,4 +0,0 @@
import requests
data = requests.get("https://www.crummy.com/").content
from bs4 import _s
data = [x for x in _s(data).block_text()]

File diff suppressed because it is too large Load diff

View file

@ -20,9 +20,13 @@ import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print("Diagnostic running on Beautiful Soup %s" % __version__)
print("Python version %s" % sys.version)
"""Diagnostic suite for isolating common problems.
:param data: A string containing markup that needs to be explained.
:return: None; diagnostics are printed to standard output.
"""
print(("Diagnostic running on Beautiful Soup %s" % __version__))
print(("Python version %s" % sys.version))
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@ -39,65 +43,76 @@ def diagnose(data):
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
except ImportError as e:
print (
print(
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
try:
import html5lib
print("Found html5lib version %s" % html5lib.__version__)
print(("Found html5lib version %s" % html5lib.__version__))
except ImportError as e:
print (
print(
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()
print("")
for parser in basic_parsers:
print("Trying to parse your markup with %s" % parser)
print(("Trying to parse your markup with %s" % parser))
success = False
try:
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
print("%s could not parse the markup." % parser)
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print("Here's what %s did with the markup:" % parser)
print(soup.prettify())
print(("Here's what %s did with the markup:" % parser))
print((soup.prettify()))
print("-" * 80)
print(("-" * 80))
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
Soup code is running.
Soup code is running. You can use this to determine whether
an lxml-specific problem is in Beautiful Soup's lxml tree builders
or in lxml itself.
:param data: Some markup.
:param html: If True, markup will be parsed with lxml's HTML parser.
if False, lxml's XML parser will be used.
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
"""Subclass of HTMLParser that announces parse events, without doing
anything else.
You can use this to get a picture of how html.parser sees a given
document. The easiest way to do this is to call `htmlparser_trace`.
"""
def _p(self, s):
print(s)
@ -134,6 +149,8 @@ def htmlparser_trace(data):
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
:param data: Some markup.
"""
parser = AnnouncingParser()
parser.feed(data)
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
data = rdoc(num_elements)
print("Generated a large invalid HTML document (%d bytes)." % len(data))
print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
b = time.time()
success = True
except Exception as e:
print("%s could not parse the markup." % parser)
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print("Raw lxml parsed the markup in %.2fs." % (b-a))
print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
def profile(num_elements=100000, parser="lxml"):
"""Use Python's profiler on a randomly generated document."""
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
stats.sort_stats("cumulative")
stats.print_stats('_html5lib|bs4', 50)
# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
diagnose(sys.stdin.read())

File diff suppressed because it is too large Load diff

View file

@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
Formatters are passed in as the `formatter` argument to methods
like `PageElement.encode`. Most people won't need to think about
formatters, and most people who need to think about them can pass
in one of these predefined strings as `formatter` rather than
making a new Formatter object:
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
* 'html5' - HTML entity substitution for HTML5 documents, as
well as some optimizations in the way tags are rendered.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
For XML documents:
* 'html' - Entity substitution for XHTML documents.
* 'minimal' - Only make the substitutions necessary to guarantee
valid XML. (default)
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
empty_attributes_are_booleans=False,
):
"""
"""Constructor.
:param void_element_close_prefix: By default, represent void
elements as <tag/> rather than <tag>
:param language: This should be Formatter.XML if you are formatting
XML markup and Formatter.HTML if you are formatting HTML markup.
:param entity_substitution: A function to call to replace special
characters with XML/HTML entities. For examples, see
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
:param void_element_close_prefix: By default, void elements
are represented as <tag/> (XML rules) rather than <tag>
(HTML rules). To get <tag>, pass in the empty string.
:param cdata_containing_tags: The list of tags that are defined
as containing CDATA in this dialect. For example, in HTML,
<script> and <style> tags are defined as containing CDATA,
and their contents should not be formatted.
:param blank_attributes_are_booleans: Render attributes whose value
is the empty string as HTML-style boolean attributes.
(Attributes whose value is None are always rendered this way.)
"""
self.language = language
self.entity_substitution = entity_substitution
@ -39,9 +76,17 @@ class Formatter(EntitySubstitution):
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
self.empty_attributes_are_booleans=empty_attributes_are_booleans
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution."""
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
text.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
if not self.entity_substitution:
return ns
from .element import NavigableString
@ -54,21 +99,41 @@ class Formatter(EntitySubstitution):
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute."""
"""Process the value of an attribute.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
return sorted(tag.attrs.items())
"""Reorder a tag's attributes however you want.
By default, attributes are sorted alphabetically. This makes
behavior consistent between Python 2 and Python 3, and preserves
backwards compatibility with older versions of Beautiful Soup.
If `empty_boolean_attributes` is True, then attributes whose
values are set to the empty string will be treated as boolean
attributes.
"""
if tag.attrs is None:
return []
return sorted(
(k, (None if self.empty_attributes_are_booleans and v == '' else v))
for k, v in list(tag.attrs.items())
)
class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
@ -80,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix = None
void_element_close_prefix=None,
empty_attributes_are_booleans=True,
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml

View file

@ -8,6 +8,7 @@ import pickle
import copy
import functools
import unittest
import warnings
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
@ -15,7 +16,10 @@ from bs4.element import (
Comment,
ContentMetaAttributeValue,
Doctype,
PYTHON_SPECIFIC_ENCODINGS,
SoupStrainer,
Script,
Stylesheet,
Tag
)
@ -83,8 +87,22 @@ class SoupTest(unittest.TestCase):
if compare_parsed_to is None:
compare_parsed_to = to_parse
# Verify that the documents come out the same.
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
# Also run some checks on the BeautifulSoup object itself:
# Verify that every tag that was opened was eventually closed.
# There are no tags in the open tag counter.
assert all(v==0 for v in list(obj.open_tag_counter.values()))
# The only tag in the tag stack is the one for the root
# document.
self.assertEqual(
[obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
)
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
@ -211,7 +229,41 @@ class SoupTest(unittest.TestCase):
return child
class HTMLTreeBuilderSmokeTest(object):
class TreeBuilderSmokeTest(object):
# Tests that are common to HTML and XML tree builders.
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\n<![\xff\xfe\xfe\xcd\x00',
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
# https://bugs.python.org/issue34480
#
#b'<![n\x00'
]
for markup in bad_markup:
with warnings.catch_warnings(record=False):
soup = self.soup(markup)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
"""A basic test of a treebuilder's competence.
@ -233,6 +285,22 @@ class HTMLTreeBuilderSmokeTest(object):
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
def test_special_string_containers(self):
soup = self.soup(
"<style>Some CSS</style><script>Some Javascript</script>"
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
soup = self.soup(
"<style><!--Some CSS--></style>"
)
assert isinstance(soup.style.string, Stylesheet)
# The contents of the style tag resemble an HTML comment, but
# it's not treated as a comment.
self.assertEqual("<!--Some CSS-->", soup.style.string)
assert isinstance(soup.style.string, Stylesheet)
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
doctype_str
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_mixed_case_doctype(self):
# A lowercase or mixed-case doctype becomes a Doctype.
for doctype_fragment in ("doctype", "DocType"):
doctype_str, soup = self._document_with_doctype(
"html", doctype_fragment
)
# Make sure a Doctype object was created and that the DOCTYPE
# is uppercase.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, "html")
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
b"<!DOCTYPE html>"
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
@ -532,7 +624,7 @@ Hello, world!
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@ -594,7 +686,7 @@ Hello, world!
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
@ -779,11 +871,44 @@ Hello, world!
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_python_specific_encodings_not_used_in_charset(self):
# You can encode an HTML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document. Instead, the document will appear to
# have no encoding.
for markup in [
b'<meta charset="utf8"></head>'
b'<meta id="encoding" charset="utf-8" />'
]:
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'meta charset=""' in encoded
assert encoding.encode("ascii") not in encoded
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_closing_tag_with_no_opening_tag(self):
# Without BeautifulSoup.open_tag_counter, the </span> tag will
# cause _popToTag to be called over and over again as we look
# for a <span> tag that wasn't there. The result is that 'text2'
# will show up outside the body of the document.
soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
self.assertEqual(
"<body><div><p>text1</p>text2</div></body>", soup.body.decode()
)
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
@ -791,7 +916,7 @@ Hello, world!
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(object):
class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_python_specific_encodings_not_used_in_xml_declaration(self):
# You can encode an XML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document.
markup = b"""<?xml version="1.0"?>\n<foo/>"""
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'<?xml version="1.0"?>' in encoded
assert encoding.encode("ascii") not in encoded
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

View file

@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_special_string_containers(self):
# The html5lib tree builder doesn't support this standard feature,
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
def test_html5_attributes(self):
# The html5lib TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
#
# This is a copy of the same test from
# HTMLParserTreeBuilderSmokeTest. It's not in the superclass
# because the lxml HTML TreeBuilder _doesn't_ work this way.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)

View file

@ -3,6 +3,7 @@ trees."""
from pdb import set_trace
import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_on_duplicate_attribute(self):
# The html.parser tree builder has a variety of ways of
# handling a tag that contains the same attribute multiple times.
markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
# If you don't provide any particular value for
# on_duplicate_attribute, later values replace earlier values.
soup = self.soup(markup)
self.assertEqual("url3", soup.a['href'])
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
# You can also get this behavior explicitly.
def assert_attribute(on_duplicate_attribute, expected):
soup = self.soup(
markup, on_duplicate_attribute=on_duplicate_attribute
)
self.assertEqual(expected, soup.a['href'])
# Verify that non-duplicate attributes are treated normally.
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
assert_attribute(None, "url3")
assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
# You can ignore subsequent values in favor of the first.
assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
# And you can pass in a callable that does whatever you want.
def accumulate(attrs, key, value):
if not isinstance(attrs[key], list):
attrs[key] = [attrs[key]]
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
def test_html5_attributes(self):
# The html.parser TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
parser.error("don't crash")
with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)

View file

@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into

View file

@ -3,6 +3,7 @@
from pdb import set_trace
import logging
import os
import unittest
import sys
import tempfile
@ -10,6 +11,8 @@ import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
GuessedAtParserWarning,
MarkupResemblesLocatorWarning,
)
from bs4.builder import (
TreeBuilder,
@ -29,7 +32,6 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
EncodingDetector,
)
from bs4.testing import (
default_builder,
@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
def test_alternate_string_containers(self):
# Test the ability to customize the string containers for
# different types of tags.
class PString(NavigableString):
pass
class BString(NavigableString):
pass
soup = self.soup(
"<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
string_containers = {
'b': BString,
'p': PString,
}
)
# The string before the <p> tag is a regular NavigableString.
assert isinstance(soup.div.contents[0], NavigableString)
# The string inside the <p> tag, but not inside the <i> tag,
# is a PString.
assert isinstance(soup.p.contents[0], PString)
# Every string inside the <b> tag is a BString, even the one that
# was also inside an <i> tag.
for s in soup.b.strings:
assert isinstance(s, BString)
# Now that parsing was complete, the string_container_stack
# (where this information was kept) has been cleared out.
self.assertEqual([], soup.string_container_stack)
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
self.assertTrue(v)
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
return w
raise Exception("%s warning not found in %r" % cls, warnings)
def _assert_no_parser_specified(self, w):
warning = self._assert_warning(w, GuessedAtParserWarning)
message = str(warning.message)
self.assertTrue(
message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
soup = BeautifulSoup("<a><b></b></a>")
self._assert_no_parser_specified(w)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
soup = BeautifulSoup("<a><b></b></a>", "html")
self._assert_no_parser_specified(w)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html.parser")
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
class TestWarnings(SoupTest):
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
msg = str(w[0].message)
self.assertTrue("looks like a filename" in msg)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a filename" in str(warning.message))
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual(0, len(w))
self.assertEqual([], w)
def test_directory_warning(self):
try:
filename = tempfile.mkdtemp()
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a directory" in str(warning.message))
finally:
os.rmdir(filename)
# The directory no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual([], w)
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
# Be aware this isn't the only warning that can be raised during
# execution..
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup("http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_bytes_and_space(self):
# Here the markup contains something besides a URL, so no warning
# is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them
# to named entities.
('&models;', '\u22a7'),
('&Nfr;', '\U0001d511'),
('&ngeqq;', '\u2267\u0338'),
('&not;', '\xac'),
('&Not;', '\u2aec'),
# We _could_ convert | to &verbarr;, but we don't, because
# | is an ASCII character.
('|' '|'),
# Similarly for the fj ligature, which we could convert to
# &fjlig;, but we don't.
("fj", "fj"),
# We do convert _these_ ASCII characters to HTML entities,
# because that's required to generate valid HTML.
('&gt;', '>'),
('&lt;', '<'),
('&amp;', '&'),
):
template = '3 %s 4'
raw = template % u
with_entities = template % entity
self.assertEqual(self.sub.substitute_html(raw), with_entities)
def test_html5_entity_with_variation_selector(self):
# Some HTML5 entities correspond either to a single-character
# Unicode sequence _or_ to the same character plus U+FE00,
# VARIATION SELECTOR 1. We can handle this.
data = "fjords \u2294 penguins"
markup = "fjords &sqcup; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
data = "fjords \u2294\ufe00 penguins"
markup = "fjords &sqcups; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through fix_embedded_windows_1252, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
def test_find_declared_encoding(self):
# Test our ability to find a declared encoding inside an
# XML or HTML document.
#
# Even if the document comes in as Unicode, it may be
# interesting to know what encoding was claimed
# originally.
html_unicode = '<html><head><meta charset="utf-8"></head></html>'
html_bytes = html_unicode.encode("ascii")
xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>'
xml_bytes = xml_unicode.encode("ascii")
m = EncodingDetector.find_declared_encoding
self.assertEqual(None, m(html_unicode, is_html=False))
self.assertEqual("utf-8", m(html_unicode, is_html=True))
self.assertEqual("utf-8", m(html_bytes, is_html=True))
self.assertEqual("iso-8859-1", m(xml_unicode))
self.assertEqual("iso-8859-1", m(xml_bytes))
# Normally, only the first few kilobytes of a document are checked for
# an encoding.
spacer = b' ' * 5000
self.assertEqual(None, m(spacer + html_bytes))
self.assertEqual(None, m(spacer + xml_bytes))
# But you can tell find_declared_encoding to search an entire
# HTML document.
self.assertEqual(
"utf-8",
m(spacer + html_bytes, is_html=True, search_entire_document=True)
)
# The XML encoding declaration has to be the very first thing
# in the document. We'll allow whitespace before the document
# starts, but nothing else.
self.assertEqual(
"iso-8859-1",
m(xml_bytes, search_entire_document=True)
)
self.assertEqual(
None, m(b'a' + xml_bytes, search_entire_document=True)
)
class TestNamedspacedAttribute(SoupTest):
def test_name_may_be_none_or_missing(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
a = NamespacedAttribute("xmlns", "")
self.assertEqual(a, "xmlns")
a = NamespacedAttribute("xmlns")
self.assertEqual(a, "xmlns")
def test_namespace_may_be_none_or_missing(self):
a = NamespacedAttribute(None, "tag")
self.assertEqual(a, "tag")
a = NamespacedAttribute("", "tag")
self.assertEqual(a, "tag")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)

View file

@ -27,13 +27,17 @@ from bs4.element import (
Doctype,
Formatter,
NavigableString,
Script,
SoupStrainer,
Stylesheet,
Tag,
TemplateString,
)
from bs4.testing import (
SoupTest,
skipIf,
)
from soupsieve import SelectorSyntaxError
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
soup.a.extend(l)
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
def test_extend_with_another_tags_contents(self):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
d2.extend(d1)
self.assertEqual('<div id="d1"></div>', d1.decode())
self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
def test_move_tag_to_beginning_of_parent(self):
data = "<a><b></b><c></c><d></d></a>"
soup = self.soup(data)
@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
self.assertEqual(no.next_element, "no")
self.assertEqual(no.next_sibling, " business")
def test_replace_with_errors(self):
# Can't replace a tag that's not part of a tree.
a_tag = Tag(name="a")
self.assertRaises(ValueError, a_tag.replace_with, "won't work")
# Can't replace a tag with its parent.
a_tag = self.soup("<a><b></b></a>").a
self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
# Or with a list that includes its parent.
self.assertRaises(ValueError, a_tag.b.replace_with,
"string1", a_tag, "string2")
def test_replace_with_multiple(self):
data = "<a><b></b><c></c></a>"
soup = self.soup(data)
d_tag = soup.new_tag("d")
d_tag.string = "Text In D Tag"
e_tag = soup.new_tag("e")
f_tag = soup.new_tag("f")
a_string = "Random Text"
soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
self.assertEqual(
"<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
soup.decode()
)
assert soup.b.next_element == d_tag
assert d_tag.string.next_element==e_tag
assert e_tag.next_element.string == a_string
assert e_tag.next_element.next_element == f_tag
def test_replace_first_child(self):
data = "<a><b></b><c></c></a>"
soup = self.soup(data)
@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
a.clear(decompose=True)
self.assertEqual(0, len(em.contents))
def test_decompose(self):
# Test PageElement.decompose() and PageElement.decomposed
soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
p1, p2 = soup.find_all('p')
a = p1.a
text = p1.em.string
for i in [p1, p2, a, text]:
self.assertEqual(False, i.decomposed)
# This sets p1 and everything beneath it to decomposed.
p1.decompose()
for i in [p1, a, text]:
self.assertEqual(True, i.decomposed)
# p2 is unaffected.
self.assertEqual(False, p2.decomposed)
def test_string_set(self):
"""Tag.string = 'string'"""
soup = self.soup("<a></a> <b><c></c></b>")
@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
def test_get_text_ignores_comments(self):
def test_get_text_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
def test_all_strings_ignores_comments(self):
soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
self.assertEqual(soup.get_text(), "foobar")
def test_all_strings_ignores_special_string_containers(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
def test_string_methods_inside_special_string_container_tags(self):
# Strings inside tags like <script> are generally ignored by
# methods like get_text, because they're not what humans
# consider 'text'. But if you call get_text on the <script>
# tag itself, those strings _are_ considered to be 'text',
# because there's nothing else you might be looking for.
style = self.soup("<div>a<style>Some CSS</style></div>")
template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
self.assertEqual(style.div.get_text(), "a")
self.assertEqual(list(style.div.strings), ["a"])
self.assertEqual(style.div.style.get_text(), "Some CSS")
self.assertEqual(list(style.div.style.strings),
['Some CSS'])
# The comment is not picked up here. That's because it was
# parsed into a Comment object, which is not considered
# interesting by template.strings.
self.assertEqual(template.div.get_text(), "a")
self.assertEqual(list(template.div.strings), ["a"])
self.assertEqual(template.div.template.get_text(), "Templated text.")
self.assertEqual(list(template.div.template.strings),
["Templated ", "text", "."])
# The comment is included here, because it didn't get parsed
# into a Comment object--it's part of the Script string.
self.assertEqual(script.div.get_text(), "a")
self.assertEqual(list(script.div.strings), ["a"])
self.assertEqual(script.div.script.get_text(),
"<!--a comment-->Some text")
self.assertEqual(list(script.div.script.strings),
['<!--a comment-->Some text'])
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
class TestFormatter(SoupTest):
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
self.assertEqual(formatter.called_with, soup.p)
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
# None of the current builders turn CDATA sections into CData
# objects, but you can create them manually.
soup = self.soup("")
cdata = CData("foo")
soup.insert(1, cdata)
self.assertEqual(str(soup), "<![CDATA[foo]]>")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
def test_cdata_is_never_formatted(self):
"""Text inside a CData object is passed into the formatter.
But the return value is ignored.
"""
self.count = 0
def increment(*args):
self.count += 1
return "BITTER FAILURE"
soup = self.soup("")
cdata = CData("<><><>")
soup.insert(1, cdata)
self.assertEqual(
b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
self.assertEqual(1, self.count)
def test_doctype_ends_in_newline(self):
# Unlike other NavigableString subclasses, a DOCTYPE always ends
# in a newline.
doctype = Doctype("foo")
soup = self.soup("")
soup.insert(1, doctype)
self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
def test_declaration(self):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
class TestSoupSelector(TreeTest):
HTML = """
@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
SyntaxError, self.soup.select, "a:nth-of-type(a)")
SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self):
# Try to select first paragraph
@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
self.assertRaises(SyntaxError, self.soup.select, ',x, y')
self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])