Update beautifulsoup4-4.10.0

commit ab8fa4d5b3 (parent b581460b51)
JonnyWong16 2021-10-14 20:46:06 -07:00
No known key found for this signature in database; GPG key ID: B1F1F9807184697A

16 changed files with 4599 additions and 743 deletions

bs4/__init__.py

@@ -1,6 +1,5 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
+"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
+
 http://www.crummy.com/software/BeautifulSoup/
 
 Beautiful Soup uses a pluggable XML or HTML parser to parse a
@@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
 provides methods and Pythonic idioms that make it easy to navigate,
 search, and modify the parse tree.
 
-Beautiful Soup works with Python 2.7 and up. It works better if lxml
+Beautiful Soup works with Python 3.5 and up. It works better if lxml
 and/or html5lib is installed.
 
 For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.1"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.10.0"
+__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"
 
 __all__ = ['BeautifulSoup']
 
+from collections import Counter
 import os
 import re
 import sys
 import traceback
 import warnings
 
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 2.
+if sys.version_info.major < 3:
+    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
+
 from .builder import builder_registry, ParserRejectedMarkup
 from .dammit import UnicodeDammit
 from .element import (
@@ -42,28 +46,49 @@ from .element import (
     NavigableString,
     PageElement,
     ProcessingInstruction,
+    PYTHON_SPECIFIC_ENCODINGS,
     ResultSet,
+    Script,
+    Stylesheet,
     SoupStrainer,
     Tag,
+    TemplateString,
     )
 
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+# Define some custom warnings.
+class GuessedAtParserWarning(UserWarning):
+    """The warning issued when BeautifulSoup has to guess what parser to
+    use -- probably because no parser was specified in the constructor.
+    """
+
+class MarkupResemblesLocatorWarning(UserWarning):
+    """The warning issued when BeautifulSoup is given 'markup' that
+    actually looks like a resource locator -- a URL or a path to a file
+    on disk.
+    """
 
 class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
+    """A data structure representing a parsed HTML or XML document.
 
-    These methods will be called by the parser:
-      reset()
-      feed(markup)
+    Most of the methods you'll call on a BeautifulSoup object are inherited from
+    PageElement or Tag.
+
+    Internally, this class defines the basic interface called by the
+    tree builders when converting an HTML/XML document into a data
+    structure. The interface abstracts away the differences between
+    parsers. To write a new tree builder, you'll need to understand
+    these methods as a whole.
+
+    These methods will be called by the BeautifulSoup constructor:
+      * reset()
+      * feed(markup)
 
     The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass) # Ends the current data node
+      * handle_starttag(name, attrs) # See note about return value
+      * handle_endtag(name)
+      * handle_data(data) # Appends to the current data node
+      * endData(containerClass) # Ends the current data node
 
     No matter how complicated the underlying parser is, you should be
     able to build a tree using 'start tag' events, 'end tag' events,
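Both new warning classes are exported from the top-level bs4 package in 4.10.0, so downstream code can filter them by class instead of matching message text. A minimal sketch against the updated library (the markup literal is made up):

    import warnings
    from bs4 import BeautifulSoup, GuessedAtParserWarning

    # Silence only the guessed-parser warning...
    warnings.filterwarnings('ignore', category=GuessedAtParserWarning)
    soup = BeautifulSoup('<p>hello</p>')                  # would otherwise warn
    # ...or avoid the guess entirely by naming a parser.
    soup = BeautifulSoup('<p>hello</p>', 'html.parser')   # never warns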
@@ -73,68 +98,75 @@ class BeautifulSoup(Tag):
     like HTML's <br> tag), call handle_starttag and then
     handle_endtag.
     """
+
+    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
+    # a Tag with a .name. This name makes it clear the BeautifulSoup
+    # object isn't a real markup tag.
     ROOT_TAG_NAME = '[document]'
 
     # If the end-user gives no indication which tree builder they
     # want, look for one with these features.
     DEFAULT_BUILDER_FEATURES = ['html', 'fast']
 
+    # A string containing all ASCII whitespace characters, used in
+    # endData() to detect data chunks that seem 'empty'.
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
 
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  element_classes=None, **kwargs):
         """Constructor.
 
         :param markup: A string or a file-like object representing
          markup to be parsed.
 
-        :param features: Desirable features of the parser to be used. This
-         may be the name of a specific parser ("lxml", "lxml-xml",
-         "html.parser", or "html5lib") or it may be the type of markup
-         to be used ("html", "html5", "xml"). It's recommended that you
-         name a specific parser, so that Beautiful Soup gives you the
-         same results across platforms and virtual environments.
+        :param features: Desirable features of the parser to be
+         used. This may be the name of a specific parser ("lxml",
+         "lxml-xml", "html.parser", or "html5lib") or it may be the
+         type of markup to be used ("html", "html5", "xml"). It's
+         recommended that you name a specific parser, so that
+         Beautiful Soup gives you the same results across platforms
+         and virtual environments.
 
         :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.
 
        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.
 
        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.
 
        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.
 
        :param element_classes: A dictionary mapping BeautifulSoup
-        classes like Tag and NavigableString to other classes you'd
+        classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
-        built. This is useful for using subclasses to modify the
-        default behavior of Tag or NavigableString.
+        built. This is useful for subclassing Tag or NavigableString
+        to modify default behavior.
 
        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4; they will result in a warning and then be ignored.
+        Beautiful Soup 4; they will result in a warning and then be
+        ignored.
 
-        Apart from this, any keyword arguments passed into the BeautifulSoup
-        constructor are propagated to the TreeBuilder constructor. This
-        makes it possible to configure a TreeBuilder beyond saying
-        which one to use.
+        Apart from this, any keyword arguments passed into the
+        BeautifulSoup constructor are propagated to the TreeBuilder
+        constructor. This makes it possible to configure a
+        TreeBuilder by passing in arguments, not just by saying which
+        one to use.
        """
         if 'convertEntities' in kwargs:
             del kwargs['convertEntities']
             warnings.warn(
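The element_classes hook described in this docstring takes effect at parse time. A small sketch, assuming bs4 4.10.0; MyString is a hypothetical subclass, not part of this commit:

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class MyString(NavigableString):
        """Hypothetical stand-in instantiated instead of NavigableString."""

    soup = BeautifulSoup(
        '<p>text</p>', 'html.parser',
        element_classes={NavigableString: MyString},
    )
    assert type(soup.p.string) is MyString  # strings now come back as MyString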
@@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
             if not original_builder and not (
                 original_features == builder.NAME or
                 original_features in builder.ALTERNATE_NAMES
-            ):
+            ) and markup:
+                # The user did not tell us which TreeBuilder to use,
+                # and we had to guess. Issue a warning.
                 if builder.is_xml:
                     markup_type = "XML"
                 else:
@@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
                     parser=builder.NAME,
                     markup_type=markup_type
                 )
-                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+                warnings.warn(
+                    self.NO_PARSER_SPECIFIED_WARNING % values,
+                    GuessedAtParserWarning, stacklevel=2
+                )
         else:
             if kwargs:
                 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
@@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
         else:
             possible_filename = markup
         is_file = False
+        is_directory = False
         try:
             is_file = os.path.exists(possible_filename)
+            if is_file:
+                is_directory = os.path.isdir(possible_filename)
         except Exception as e:
             # This is almost certainly a problem involving
             # characters not valid in filenames on this
             # system. Just let it go.
             pass
-        if is_file:
-            if isinstance(markup, str):
-                markup = markup.encode("utf8")
+        if is_directory:
+            warnings.warn(
+                '"%s" looks like a directory name, not markup. You may'
+                ' want to open a file found in this directory and pass'
+                ' the filehandle into Beautiful Soup.' % (
+                    self._decode_markup(markup)
+                ),
+                MarkupResemblesLocatorWarning
+            )
+        elif is_file:
             warnings.warn(
                 '"%s" looks like a filename, not markup. You should'
                 ' probably open this file and pass the filehandle into'
-                ' Beautiful Soup.' % markup)
+                ' Beautiful Soup.' % self._decode_markup(markup),
+                MarkupResemblesLocatorWarning
+            )
         self._check_markup_is_url(markup)
 
         rejections = []
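The warnings above steer callers toward the supported pattern: open the file (or a file inside the directory) yourself and hand the filehandle to the constructor. Sketch, with a hypothetical path:

    from bs4 import BeautifulSoup

    # Passing 'index.html' as markup would trigger MarkupResemblesLocatorWarning.
    with open('index.html') as fp:
        soup = BeautifulSoup(fp, 'html.parser')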
@@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
+        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
         copy = type(self)(
             self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
         )
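As the new __copy__ docstring notes, a copy round-trips through UTF-8 and is re-parsed, so the result is a fully independent tree. Sketch, assuming 4.10.0 behavior:

    import copy
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<p>hi</p>', 'html.parser')
    clone = copy.copy(soup)      # encode to UTF-8, parse again
    assert clone == soup         # equal by structure...
    assert clone is not soup     # ...but a separate object tree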
@@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
         d['builder'] = None
         return d
 
-    @staticmethod
-    def _check_markup_is_url(markup):
-        """
-        Check if markup looks like it's actually a url and raise a warning
-        if so. Markup can be unicode or str (py2) / bytes (py3).
+    @classmethod
+    def _decode_markup(cls, markup):
+        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
+
+        TODO: warnings.warn had this problem back in 2010 but it might not
+        anymore.
+        """
+        if isinstance(markup, bytes):
+            decoded = markup.decode('utf-8', 'replace')
+        else:
+            decoded = markup
+        return decoded
+
+    @classmethod
+    def _check_markup_is_url(cls, markup):
+        """Error-handling method to raise a warning if incoming markup looks
+        like a URL.
+
+        :param markup: A string.
         """
         if isinstance(markup, bytes):
             space = b' '
@@ -364,18 +428,20 @@ class BeautifulSoup(Tag):
         if any(markup.startswith(prefix) for prefix in cant_start_with):
             if not space in markup:
-                if isinstance(markup, bytes):
-                    decoded_markup = markup.decode('utf-8', 'replace')
-                else:
-                    decoded_markup = markup
                 warnings.warn(
                     '"%s" looks like a URL. Beautiful Soup is not an'
                     ' HTTP client. You should probably use an HTTP client like'
                     ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % decoded_markup
+                    ' that document to Beautiful Soup.' % cls._decode_markup(
+                        markup
+                    ),
+                    MarkupResemblesLocatorWarning
                 )
 
     def _feed(self):
+        """Internal method that parses previously set markup, creating a large
+        number of Tag and NavigableString objects.
+        """
         # Convert the document to Unicode.
         self.builder.reset()
@@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
             self.popTag()
 
     def reset(self):
+        """Reset this object to a state as though it had never parsed any
+        markup.
+        """
         Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
         self.hidden = 1
         self.builder.reset()
         self.current_data = []
         self.currentTag = None
         self.tagStack = []
+        self.open_tag_counter = Counter()
         self.preserve_whitespace_tag_stack = []
+        self.string_container_stack = []
         self.pushTag(self)
 
     def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                 sourceline=None, sourcepos=None, **kwattrs):
-        """Create a new tag associated with this soup."""
+        """Create a new Tag associated with this BeautifulSoup object.
+
+        :param name: The name of the new Tag.
+        :param namespace: The URI of the new Tag's XML namespace, if any.
+        :param prefix: The prefix for the new Tag's XML namespace, if any.
+        :param attrs: A dictionary of this Tag's attribute values; can
+            be used instead of `kwattrs` for attributes like 'class'
+            that are reserved words in Python.
+        :param sourceline: The line number where this tag was
+            (purportedly) found in its source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was (purportedly) found.
+        :param kwattrs: Keyword arguments for the new Tag's attribute values.
+        """
         kwattrs.update(attrs)
         return self.element_classes.get(Tag, Tag)(
             None, self.builder, name, namespace, nsprefix, kwattrs,
             sourceline=sourceline, sourcepos=sourcepos
         )
 
+    def string_container(self, base_class=None):
+        container = base_class or NavigableString
+
+        # There may be a general override of NavigableString.
+        container = self.element_classes.get(
+            container, container
+        )
+
+        # On top of that, we may be inside a tag that needs a special
+        # container class.
+        if self.string_container_stack and container is NavigableString:
+            container = self.builder.string_containers.get(
+                self.string_container_stack[-1].name, container
+            )
+        return container
+
     def new_string(self, s, subclass=None):
-        """Create a new NavigableString associated with this soup."""
-        subclass = subclass or self.element_classes.get(
-            NavigableString, NavigableString
-        )
-        return subclass(s)
-
-    def insert_before(self, successor):
+        """Create a new NavigableString associated with this BeautifulSoup
+        object.
+        """
+        container = self.string_container(subclass)
+        return container(s)
+
+    def insert_before(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
         raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
 
-    def insert_after(self, successor):
+    def insert_after(self, *args):
+        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
+        it because there is nothing before or after it in the parse tree.
+        """
         raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
 
     def popTag(self):
+        """Internal method called by _popToTag when a tag is closed."""
         tag = self.tagStack.pop()
+        if tag.name in self.open_tag_counter:
+            self.open_tag_counter[tag.name] -= 1
         if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
             self.preserve_whitespace_tag_stack.pop()
-        #print "Pop", tag.name
+        if self.string_container_stack and tag == self.string_container_stack[-1]:
+            self.string_container_stack.pop()
+        #print("Pop", tag.name)
         if self.tagStack:
             self.currentTag = self.tagStack[-1]
         return self.currentTag
 
     def pushTag(self, tag):
-        #print "Push", tag.name
+        """Internal method called by handle_starttag when a tag is opened."""
+        #print("Push", tag.name)
         if self.currentTag is not None:
             self.currentTag.contents.append(tag)
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]
+        if tag.name != self.ROOT_TAG_NAME:
+            self.open_tag_counter[tag.name] += 1
         if tag.name in self.builder.preserve_whitespace_tags:
             self.preserve_whitespace_tag_stack.append(tag)
+        if tag.name in self.builder.string_containers:
+            self.string_container_stack.append(tag)
 
     def endData(self, containerClass=None):
-
-        # Default container is NavigableString.
-        containerClass = containerClass or NavigableString
-
-        # The user may want us to instantiate some alias for the
-        # container class.
-        containerClass = self.element_classes.get(
-            containerClass, containerClass
-        )
-
+        """Method called by the TreeBuilder when the end of a data segment
+        occurs.
+        """
         if self.current_data:
             current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
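The combined effect of string_container_stack, the builder's string_containers map, and string_container() is that text inside <script>, <style>, and <template> is now instantiated as a dedicated NavigableString subclass. Sketch against 4.10.0:

    from bs4 import BeautifulSoup
    from bs4.element import Script, Stylesheet

    soup = BeautifulSoup('<script>alert(1)</script><style>p {}</style>',
                         'html.parser')
    assert isinstance(soup.script.string, Script)      # not a plain NavigableString
    assert isinstance(soup.style.string, Stylesheet)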
@@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
                     not self.parse_only.search(current_data)):
                 return
 
+        containerClass = self.string_container(containerClass)
         o = containerClass(current_data)
         self.object_was_parsed(o)
 
     def object_was_parsed(self, o, parent=None, most_recent_element=None):
-        """Add an object to the parse tree."""
+        """Method called by the TreeBuilder to integrate an object into the parse tree."""
         if parent is None:
             parent = self.currentTag
         if most_recent_element is not None:
@@ -545,10 +656,19 @@ class BeautifulSoup(Tag):
     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: It this is false, pops the tag stack up
+          to but *not* including the most recent instqance of the
+          given tag.
+        """
+        #print("Popping to %s" % name)
         if name == self.ROOT_TAG_NAME:
             # The BeautifulSoup object itself can never be popped.
             return
@@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
         stack_size = len(self.tagStack)
         for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
             t = self.tagStack[i]
             if (name == t.name and nsprefix == t.prefix):
                 if inclusivePop:
@@ -568,15 +690,22 @@ class BeautifulSoup(Tag):
     def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                         sourcepos=None):
-        """Push a start tag on to the stack.
+        """Called by the tree builder when a new tag is encountered.
 
-        If this method returns None, the tag was rejected by the
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
         SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """
-        # print "Start tag %s: %s" % (name, attrs)
+        # print("Start tag %s: %s" % (name, attrs))
         self.endData()
 
         if (self.parse_only and len(self.tagStack) <= 1
@@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
             return tag
 
     def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        """
+        #print("End tag: " + name)
         self.endData()
         self._popToTag(name, nsprefix)
 
     def handle_data(self, data):
+        """Called by the tree builder when a chunk of textual data is encountered."""
         self.current_data.append(data)
 
     def decode(self, pretty_print=False,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
+        """Returns a string or Unicode representation of the parse tree
+            as an HTML or XML document.
+
+        :param pretty_print: If this is True, indentation will be used to
+            make the document more readable.
+        :param eventual_encoding: The encoding of the final document.
+            If this is None, the document will be a Unicode string.
+        """
         if self.is_xml:
             # Print the XML declaration
             encoding_part = ''
+            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+                # This is a special Python encoding; it can't actually
+                # go into an XML document because it means nothing
+                # outside of Python.
+                eventual_encoding = None
             if eventual_encoding != None:
                 encoding_part = ' encoding="%s"' % eventual_encoding
             prefix = '<?xml version="1.0"%s?>\n' % encoding_part
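The PYTHON_SPECIFIC_ENCODINGS guard only affects the XML declaration. A quick illustration of eventual_encoding, assuming the lxml-backed "xml" feature is available:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<doc/>', 'xml')
    soup.decode()                         # '<?xml version="1.0" encoding="utf-8"?>...'
    soup.decode(eventual_encoding=None)   # '<?xml version="1.0"?>...' (Unicode output)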
@@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
         return prefix + super(BeautifulSoup, self).decode(
             indent_level, eventual_encoding, formatter)
 
-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
 _s = BeautifulSoup
 _soup = BeautifulSoup
@@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):
 
 class StopParsing(Exception):
+    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
     pass
 
 class FeatureNotFound(ValueError):
+    """Exception raised by the BeautifulSoup constructor if no parser with the
+    requested features is found.
+    """
     pass
 
 
-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))

bs4/builder/__init__.py

@@ -7,8 +7,11 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
     nonwhitespace_re
     )
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -27,18 +30,33 @@ HTML_5 = 'html5'
 
 class TreeBuilderRegistry(object):
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
 
     def __init__(self):
         self.builders_for_feature = defaultdict(list)
         self.builders = []
 
     def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of Treebuilder. its .features
+           attribute should list its features.
+        """
         for feature in treebuilder_class.features:
             self.builders_for_feature[feature].insert(0, treebuilder_class)
         self.builders.insert(0, treebuilder_class)
 
     def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+           provided, the most recently registered TreeBuilder subclass
+           will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+           registered subclass with all the requested features.
+        """
         if len(self.builders) == 0:
             # There are no builders at all.
             return None
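Lookup usage, for reference; both calls below go through the public registry object this module exports:

    from bs4.builder import builder_registry

    builder_class = builder_registry.lookup('html', 'fast')  # best HTML builder, or None
    lxml_class = builder_registry.lookup('lxml')             # None unless lxml is installed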
@@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()
 
 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""
 
     NAME = "[Unknown tree builder]"
     ALTERNATE_NAMES = []
@@ -96,7 +114,12 @@ class TreeBuilder(object):
     # comma-separated list of CDATA, rather than a single CDATA.
     DEFAULT_CDATA_LIST_ATTRIBUTES = {}
 
+    # Whitespace should be preserved inside these tags.
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
 
+    # The textual contents of tags with these names should be
+    # instantiated with some class other than NavigableString.
+    DEFAULT_STRING_CONTAINERS = {}
+
     USE_DEFAULT = object()
@@ -105,30 +128,39 @@ class TreeBuilder(object):
 
     def __init__(self, multi_valued_attributes=USE_DEFAULT,
                  preserve_whitespace_tags=USE_DEFAULT,
-                 store_line_numbers=USE_DEFAULT):
+                 store_line_numbers=USE_DEFAULT,
+                 string_containers=USE_DEFAULT,
+    ):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
          TreeBuilder will not turn any values for attributes like
-         'class' into lists. Setting this do a dictionary will
+         'class' into lists. Setting this to a dictionary will
          customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
          for an example.
 
         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.
 
        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
-        will have 
+        are immune from pretty-printing; their contents will always be
+        output as-is.
+
+       :param string_containers: A dictionary mapping tag names to
+        the classes that should be instantiated to contain the textual
+        contents of those tags. The default is to use NavigableString
+        for every tag, no matter what the name. You can override the
+        default by changing DEFAULT_STRING_CONTAINERS.
 
        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
@@ -139,15 +171,25 @@ class TreeBuilder(object):
             self.preserve_whitespace_tags = preserve_whitespace_tags
         if store_line_numbers == self.USE_DEFAULT:
             store_line_numbers = self.TRACKS_LINE_NUMBERS
         self.store_line_numbers = store_line_numbers
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
 
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
         """
         self.soup = soup
 
     def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
         pass
 
     def can_be_empty_element(self, tag_name):
@@ -159,23 +201,57 @@ class TreeBuilder(object):
         For instance: an HTMLBuilder does not consider a <p> tag to be
         an empty-element tag (it's not in
         HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
+        will be presented as "<p></p>", not "<p/>" or "<p>".
 
         The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
         be left alone.
+
+        :param tag_name: The name of a markup tag.
         """
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
 
     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+         Each 4-tuple represents a strategy for converting the
+         document to Unicode and parsing it. Each strategy will be tried
+         in turn.
+
+         By default, the only strategy is to parse the markup
+         as-is. See `LXMLTreeBuilderForXML` and
+         `HTMLParserTreeBuilder` for implementations that take into
+         account the quirks of particular parsers.
+        """
         yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
@@ -188,16 +264,36 @@ class TreeBuilder(object):
         results against other HTML fragments.
 
         This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
         """
         return fragment
 
     def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         return False
 
     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.
 
-        Modifies its input in place.
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
+
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+           Any appropriate attribute values will be modified in place.
+        """
         if not attrs:
             return attrs
@@ -225,7 +321,11 @@ class TreeBuilder(object):
         return attrs
 
 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+    """
 
     def feed(self, markup):
         raise NotImplementedError()
@@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):
     def startElement(self, name, attrs):
         attrs = dict((key[1], value) for key, value in list(attrs.items()))
-        #print "Start %s, %r" % (name, attrs)
+        #print("Start %s, %r" % (name, attrs))
         self.soup.handle_starttag(name, attrs)
 
     def endElement(self, name):
-        #print "End %s" % name
+        #print("End %s" % name)
         self.soup.handle_endtag(name)
 
     def startElementNS(self, nsTuple, nodeName, attrs):
@@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder):
     # but it may do so eventually, and this information is available if
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
 
+    # The HTML standard defines an unusual content model for these tags.
+    # We represent this by using a string class other than NavigableString
+    # inside these tags.
+    #
+    # I made this list by going through the HTML spec
+    # (https://html.spec.whatwg.org/#metadata-content) and looking for
+    # "metadata content" elements that can contain strings.
+    #
+    # TODO: Arguably <noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }
+
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
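Because the BeautifulSoup constructor forwards unrecognized keyword arguments to the TreeBuilder, this mapping can also be overridden per-soup via string_containers. Sketch, assuming 4.10.0; ParagraphString is hypothetical:

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class ParagraphString(NavigableString):
        """Hypothetical container for text found inside <p> tags."""

    soup = BeautifulSoup('<p>hello</p>', 'html.parser',
                         string_containers={'p': ParagraphString})
    assert type(soup.p.string) is ParagraphString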
@@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
 
     def set_up_substitutions(self, tag):
+        """Replace the declared encoding in a <meta> tag with a placeholder,
+        to be substituted when the tag is output to a string.
+
+        An HTML document may come in to Beautiful Soup as one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         # We are only interested in <meta> tags
         if tag.name != 'meta':
             return False
@@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):
 
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
-    # I'm fairly sure this is not the best way to do this.
-    this_module = sys.modules['bs4.builder']
+    this_module = sys.modules[__name__]
     for name in module.__all__:
         obj = getattr(module, name)
@@ -363,6 +488,9 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
+    """An Exception to be raised when the underlying parser simply
+    refuses to parse the given markup.
+    """
     def __init__(self, message_or_exception):
         """Explain why the parser rejected the given markup, either
         with a textual explanation or another exception.
@@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last result.
+# want to use HTMLParser as a last resort.
 from . import _htmlparser
 register_treebuilders_from(_htmlparser)
 try:

bs4/builder/_html5lib.py

@@ -39,7 +39,18 @@ except ImportError as e:
     new_html5lib = True
 
 class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
+    """Use html5lib to build a tree.
+
+    Note that this TreeBuilder does not support some features common
+    to HTML TreeBuilders. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    * This TreeBuilder doesn't use different subclasses of NavigableString
+      based on the name of the tag in which the string was found.
+
+    * You can't use a SoupStrainer to parse only part of a document.
+    """
 
     NAME = "html5lib"
@@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                 "", "html.parser", store_line_numbers=store_line_numbers,
                 **kwargs
             )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
         # This will be set later to an html5lib.html5parser.HTMLParser
@@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
-
         if attributes is not None and len(attributes) > 0:
-
             converted_attributes = []
             for name, value in list(attributes.items()):
                 if isinstance(name, tuple):
@@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):
     def reparentChildren(self, new_parent):
         """Move all of this tag's children into another tag."""
-        # print "MOVE", self.element.contents
-        # print "FROM", self.element
-        # print "TO", new_parent.element
+        # print("MOVE", self.element.contents)
+        # print("FROM", self.element)
+        # print("TO", new_parent.element)
 
         element = self.element
         new_parent_element = new_parent.element
@@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
         element.contents = []
         element.next_element = final_next_element
 
-        # print "DONE WITH MOVE"
-        # print "FROM", self.element
-        # print "TO", new_parent_element
+        # print("DONE WITH MOVE")
+        # print("FROM", self.element)
+        # print("TO", new_parent_element)
 
     def cloneNode(self):
         tag = self.soup.new_tag(self.element.name, self.namespace)

bs4/builder/_htmlparser.py

@@ -8,7 +8,7 @@ __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from future.moves.html.parser import HTMLParser
+from html.parser import HTMLParser
 
 try:
     from html.parser import HTMLParseError
@@ -53,8 +53,30 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
+    """A subclass of the Python standard library's HTMLParser class, which
+    listens for HTMLParser events and translates them into calls
+    to Beautiful Soup's tree construction API.
+    """
+
+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
+
     def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
         HTMLParser.__init__(self, *args, **kwargs)
 
         # Keep a list of empty-element tags that were encountered
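The on_duplicate_attribute setting travels from the BeautifulSoup constructor through HTMLParserTreeBuilder's kwargs plumbing (see further down this file). Minimal sketch:

    from bs4 import BeautifulSoup

    markup = '<a class="first" class="second"></a>'
    soup = BeautifulSoup(markup, 'html.parser',
                         on_duplicate_attribute='ignore')
    soup.a['class']   # ['first'] -- the earliest value was kept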
@@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.already_closed_empty_element = []
 
     def error(self, msg):
-        """In Python 3, HTMLParser subclasses must implement error(), although this
-        requirement doesn't appear to be documented.
+        """In Python 3, HTMLParser subclasses must implement error(), although
+        this requirement doesn't appear to be documented.
 
-        In Python 2, HTMLParser implements error() as raising an exception.
+        In Python 2, HTMLParser implements error() by raising an exception,
+        which we don't want to do.
 
-        In any event, this method is called only on very strange markup and our best strategy
-        is to pretend it didn't happen and keep going.
+        In any event, this method is called only on very strange
+        markup and our best strategy is to pretend it didn't happen
+        and keep going.
         """
         warnings.warn(msg)
 
     def handle_startendtag(self, name, attrs):
-        # This is only called when the markup looks like
-        # <tag/>.
+        """Handle an incoming empty-element tag.
+
+        This is only called when the markup looks like <tag/>.
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        """
         # is_startend() tells handle_starttag not to close the tag
         # just because its name matches a known empty-element tag. We
         # know that this is an empty-element tag and we want to call
@@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.handle_endtag(name)
 
     def handle_starttag(self, name, attrs, handle_empty_element=True):
+        """Handle an opening tag, e.g. '<tag>'
+
+        :param name: Name of the tag.
+        :param attrs: Dictionary of the tag's attributes.
+        :param handle_empty_element: True if this tag is known to be
+            an empty-element tag (i.e. there is not expected to be any
+            closing tag).
+        """
         # XXX namespace
         attr_dict = {}
         for key, value in attrs:
@@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # for consistency with the other tree builders.
             if value is None:
                 value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
             attrvalue = '""'
-        #print "START", name
+        #print("START", name)
         sourceline, sourcepos = self.getpos()
         tag = self.soup.handle_starttag(
             name, None, None, attr_dict, sourceline=sourceline,
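For the callable branch above, the callable receives the attribute dictionary built so far, the duplicated name, and the newest value. A sketch that accumulates every value into a list:

    from bs4 import BeautifulSoup

    def accumulate(attrs, key, value):
        # Turn repeated attributes into a list of all values seen.
        existing = attrs[key]
        if not isinstance(existing, list):
            existing = [existing]
        existing.append(value)
        attrs[key] = existing

    soup = BeautifulSoup('<a href="a" href="b"></a>', 'html.parser',
                         on_duplicate_attribute=accumulate)
    soup.a['href']    # ['a', 'b']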
@@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.already_closed_empty_element.append(name)
 
     def handle_endtag(self, name, check_already_closed=True):
-        #print "END", name
+        """Handle a closing tag, e.g. '</tag>'
+
+        :param name: A tag name.
+        :param check_already_closed: True if this tag is expected to
+           be the closing portion of an empty-element tag,
+           e.g. '<tag></tag>'.
+        """
+        #print("END", name)
         if check_already_closed and name in self.already_closed_empty_element:
             # This is a redundant end tag for an empty-element tag.
             # We've already called handle_endtag() for it, so just
             # check it off the list.
-            # print "ALREADY CLOSED", name
+            #print("ALREADY CLOSED", name)
             self.already_closed_empty_element.remove(name)
         else:
             self.soup.handle_endtag(name)
 
     def handle_data(self, data):
+        """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
 
     def handle_charref(self, name):
+        """Handle a numeric character reference by converting it to the
+        corresponding Unicode character and treating it as textual
+        data.
+
+        :param name: Character number, possibly in hexadecimal.
+        """
         # XXX workaround for a bug in HTMLParser. Remove this once
         # it's fixed in all supported versions.
         # http://bugs.python.org/issue13633
@@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.handle_data(data)
 
     def handle_entityref(self, name):
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character(s) and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
         if character is not None:
             data = character
@@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
             self.handle_data(data)
 
     def handle_comment(self, data):
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(Comment)
 
     def handle_decl(self, data):
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
         self.soup.endData()
-        if data.startswith("DOCTYPE "):
-            data = data[len("DOCTYPE "):]
-        elif data == 'DOCTYPE':
-            # i.e. "<!DOCTYPE>"
-            data = ''
+        data = data[len("DOCTYPE "):]
         self.soup.handle_data(data)
         self.soup.endData(Doctype)
 
     def unknown_decl(self, data):
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
         if data.upper().startswith('CDATA['):
             cls = CData
             data = data[len('CDATA['):]
@@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
         self.soup.endData(cls)
 
     def handle_pi(self, data):
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
         self.soup.endData()
         self.soup.handle_data(data)
         self.soup.endData(ProcessingInstruction)
 
 
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
+    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
+    found in the Python standard library.
+    """
+
     is_xml = False
     picklable = True
     NAME = HTMLPARSER
@ -221,36 +303,88 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the # The html.parser knows which line number and position in the
# original file is the source of an element. # original file is the source of an element.
TRACKS_LINE_NUMBERS = True TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
"""Constructor.
:param parser_args: Positional arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param parser_kwargs: Keyword arguments to pass into
the BeautifulSoupHTMLParser constructor, once it's
invoked.
:param kwargs: Keyword arguments for the superclass constructor.
"""
# Some keyword arguments will be pulled out of kwargs and placed
# into parser_kwargs.
extra_parser_kwargs = dict()
for arg in ('on_duplicate_attribute',):
if arg in kwargs:
value = kwargs.pop(arg)
extra_parser_kwargs[arg] = value
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding """Run any preliminary steps necessary to make incoming markup
declared within markup, whether any characters had to be acceptable to the parser.
replaced with REPLACEMENT CHARACTER).
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
""" """
if isinstance(markup, str):
# Parse Unicode as-is.
yield (markup, None, None, False)
return
# Ask UnicodeDammit to sniff the most likely encoding.
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
dammit = UnicodeDammit(
markup,
known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings,
is_html=True,
exclude_encodings=exclude_encodings
)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
"""Run some incoming markup through some parsing process,
populating the `BeautifulSoup` object in self.soup.
"""
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
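As an illustration of the encoding-priority change above (not part of the diff itself): the user-specified encoding is now handed to UnicodeDammit as a "known definite" encoding, while the document's own claim becomes a lower-priority "user" encoding. A minimal sketch; the sample bytes and encodings are mine:

    from bs4.dammit import UnicodeDammit

    data = "Sacr\xe9 bleu!".encode("latin-1")
    dammit = UnicodeDammit(
        data,
        known_definite_encodings=["latin-1"],  # tried first, as a definite encoding
        user_encodings=["utf-8"],              # tried later, at lower priority
        is_html=False,
    )
    print(dammit.unicode_markup)     # Sacré bleu!
    print(dammit.original_encoding)  # latin-1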
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
# See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
:param soup: A `BeautifulSoup`.
""" """
super(LXMLTreeBuilderForXML, self).initialize_soup(soup) super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS) self._register_namespaces(self.DEFAULT_NSMAPS)
@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document. while parsing the document.
This might be useful later on when creating CSS selectors. This might be useful later on when creating CSS selectors.
:param mapping: A dictionary mapping namespace prefixes to URIs.
""" """
for key, value in list(mapping.items()): for key, value in list(mapping.items()):
if key and key not in self.soup._namespaces: if key and key not in self.soup._namespaces:
@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value self.soup._namespaces[key] = value
def default_parser(self, encoding): def default_parser(self, encoding):
"""Find the default parser for the given encoding.
:param encoding: A string.
:return: Either a parser object or a class, which
will be instantiated with default arguments.
"""
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
"""Instantiate an appropriate parser for the given encoding.
:param encoding: A string.
:return: A parser object such as an `etree.XMLParser`.
"""
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(
target=self, strip_cdata=False, recover=True, encoding=encoding
)
return parser
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
@@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
""" """Run any preliminary steps necessary to make incoming markup
:yield: A series of 4-tuples. acceptable to the parser.
lxml really wants to get a bytestring and convert it to
Unicode itself. So instead of using UnicodeDammit to convert
the bytestring to Unicode using different encodings, this
implementation uses EncodingDetector to iterate over the
encodings, and tell lxml to try to parse the document as each
one in turn.
:param markup: Some markup -- hopefully a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
in this encoding.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
:yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for converting the
document to Unicode and parsing it. Each strategy will be tried
in turn.
""" """
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
@@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
# This was provided by the end-user; treat it as a known
# definite encoding per the algorithm laid out in the HTML5
# spec. (See the EncodingDetector class for details.)
known_definite_encodings = [user_specified_encoding]
# This was found in the document; treat it as a slightly lower-priority
# user encoding.
user_encodings = [document_declared_encoding]
detector = EncodingDetector(
markup, known_definite_encodings=known_definite_encodings,
user_encodings=user_encodings, is_html=is_html,
exclude_encodings=exclude_encodings
)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
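Not part of the commit, but a sketch of how the detector drives this loop: candidate encodings come out in priority order (known definite encodings first, then lower-priority candidates such as the document's declared encoding), and lxml is asked to parse under each one until a parse succeeds. The sample markup here is mine:

    from bs4.dammit import EncodingDetector

    markup = b'<html><head><meta charset="euc-jp"></head></html>'
    detector = EncodingDetector(
        markup,
        known_definite_encodings=["utf-8"],  # e.g. a user-specified encoding
        is_html=True,
    )
    for encoding in detector.encodings:
        print(encoding)  # 'utf-8' first, with the declared 'euc-jp' among the candidates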
@@ -1,4 +0,0 @@
import requests
data = requests.get("https://www.crummy.com/").content
from bs4 import _s
data = [x for x in _s(data).block_text()]
File diff suppressed because it is too large
@@ -20,9 +20,13 @@ import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems.
print("Diagnostic running on Beautiful Soup %s" % __version__)
print("Python version %s" % sys.version) :param data: A string containing markup that needs to be explained.
:return: None; diagnostics are printed to standard output.
"""
print(("Diagnostic running on Beautiful Soup %s" % __version__))
print(("Python version %s" % sys.version))
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@@ -39,65 +43,76 @@ def diagnose(data):
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
except ImportError as e:
print(
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
try:
import html5lib
print(("Found html5lib version %s" % html5lib.__version__))
except ImportError as e:
print(
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif data.startswith("http:") or data.startswith("https:"):
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
else:
try:
if os.path.exists(data):
print(('"%s" looks like a filename. Reading data from the file.' % data))
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print("")
for parser in basic_parsers:
print(("Trying to parse your markup with %s" % parser))
success = False
try:
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print(("Here's what %s did with the markup:" % parser))
print((soup.prettify()))
print(("-" * 80))
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
Soup code is running. You can use this to determine whether
an lxml-specific problem is in Beautiful Soup's lxml tree builders
or in lxml itself.
:param data: Some markup.
:param html: If True, markup will be parsed with lxml's HTML parser.
if False, lxml's XML parser will be used.
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print(("%s, %4s, %s" % (event, element.tag, element.text))) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Subclass of HTMLParser that announces parse events, without doing
anything else.
You can use this to get a picture of how html.parser sees a given
document. The easiest way to do this is to call `htmlparser_trace`.
"""
def _p(self, s):
print(s)
@@ -134,6 +149,8 @@ def htmlparser_trace(data):
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
:param data: Some markup.
""" """
parser = AnnouncingParser() parser = AnnouncingParser()
parser.feed(data) parser.feed(data)
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print("Comparative parser benchmark on Beautiful Soup %s" % __version__) print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
data = rdoc(num_elements) data = rdoc(num_elements)
print("Generated a large invalid HTML document (%d bytes)." % len(data)) print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
@@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
b = time.time()
success = True
except Exception as e:
print(("%s could not parse the markup." % parser))
traceback.print_exc()
if success:
print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
def profile(num_elements=100000, parser="lxml"):
"""Use Python's profiler on a randomly generated document."""
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
@@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
stats.sort_stats("cumulative")
stats.print_stats('_html5lib|bs4', 50)
# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
diagnose(sys.stdin.read())
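For illustration (not part of the diff), typical usage of the diagnostic entry point from application code:

    from bs4.diagnose import diagnose

    # Prints the Beautiful Soup and Python versions, then shows how each
    # installed parser handles the markup.
    diagnose("<html><body><p>Unclosed paragraph")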
File diff suppressed because it is too large
@@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
Formatters are passed in as the `formatter` argument to methods
like `PageElement.encode`. Most people won't need to think about
formatters, and most people who need to think about them can pass
in one of these predefined strings as `formatter` rather than
making a new Formatter object:
For HTML documents:
* 'html' - HTML entity substitution for generic HTML documents. (default)
* 'html5' - HTML entity substitution for HTML5 documents, as
well as some optimizations in the way tags are rendered.
* 'minimal' - Only make the substitutions necessary to guarantee
valid HTML.
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
For XML documents:
* 'html' - Entity substitution for XHTML documents.
* 'minimal' - Only make the substitutions necessary to guarantee
valid XML. (default)
* None - Do not perform any substitution. This will be faster
but may result in invalid markup.
""" """
# Registries of XML and HTML formatters. # Registries of XML and HTML formatters.
XML_FORMATTERS = {} XML_FORMATTERS = {}
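An illustrative sketch (not part of the commit) of passing these predefined formatter names to an output method:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>A &amp; B</p>", "html.parser")
    print(soup.p.encode(formatter="minimal"))  # b'<p>A &amp; B</p>' - just enough for valid HTML
    print(soup.p.encode(formatter=None))       # b'<p>A & B</p>' - no substitution at all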
@@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
empty_attributes_are_booleans=False,
):
"""Constructor.
:param language: This should be Formatter.XML if you are formatting
XML markup and Formatter.HTML if you are formatting HTML markup.
:param entity_substitution: A function to call to replace special
characters with XML/HTML entities. For examples, see
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
:param void_element_close_prefix: By default, void elements
are represented as <tag/> (XML rules) rather than <tag>
(HTML rules). To get <tag>, pass in the empty string.
:param cdata_containing_tags: The list of tags that are defined
as containing CDATA in this dialect. For example, in HTML,
<script> and <style> tags are defined as containing CDATA,
and their contents should not be formatted.
:param empty_attributes_are_booleans: Render attributes whose value
is the empty string as HTML-style boolean attributes.
(Attributes whose value is None are always rendered this way.)
""" """
self.language = language self.language = language
self.entity_substitution = entity_substitution self.entity_substitution = entity_substitution
@@ -39,9 +76,17 @@ class Formatter(EntitySubstitution):
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
self.empty_attributes_are_booleans=empty_attributes_are_booleans
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution.
This may be a string encountered in an attribute value or as
text.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
if not self.entity_substitution:
return ns
from .element import NavigableString
@@ -54,21 +99,41 @@ class Formatter(EntitySubstitution):
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute.
:param ns: A string.
:return: A string with certain characters replaced by named
or numeric entities.
"""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want.
By default, attributes are sorted alphabetically. This makes
behavior consistent between Python 2 and Python 3, and preserves
backwards compatibility with older versions of Beautiful Soup.
If `empty_attributes_are_booleans` is True, then attributes whose
values are set to the empty string will be treated as boolean
attributes.
"""
if tag.attrs is None:
return []
return sorted(
(k, (None if self.empty_attributes_are_booleans and v == '' else v))
for k, v in list(tag.attrs.items())
)
class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
@@ -80,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix=None,
empty_attributes_are_booleans=True,
) )
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml entity_substitution=EntitySubstitution.substitute_xml
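A sketch (not from the commit) of the new empty_attributes_are_booleans option as wired into the 'html5' formatter:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<option selected=""></option>', 'html.parser')
    # The 'html5' formatter renders empty-string attributes as booleans...
    print(soup.option.decode(formatter="html5"))  # <option selected></option>
    # ...while the default 'html' formatter keeps the empty string.
    print(soup.option.decode(formatter="html"))   # <option selected=""></option>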
@@ -8,6 +8,7 @@ import pickle
import copy
import functools
import unittest
import warnings
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
@@ -15,7 +16,10 @@ from bs4.element import (
Comment,
ContentMetaAttributeValue,
Doctype,
PYTHON_SPECIFIC_ENCODINGS,
SoupStrainer,
Script,
Stylesheet,
Tag
)
@@ -83,8 +87,22 @@ class SoupTest(unittest.TestCase):
if compare_parsed_to is None:
compare_parsed_to = to_parse
# Verify that the documents come out the same.
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
# Also run some checks on the BeautifulSoup object itself:
# Verify that every tag that was opened was eventually closed.
# There are no tags in the open tag counter.
assert all(v==0 for v in list(obj.open_tag_counter.values()))
# The only tag in the tag stack is the one for the root
# document.
self.assertEqual(
[obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
)
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
@@ -211,7 +229,41 @@ class SoupTest(unittest.TestCase):
return child
class TreeBuilderSmokeTest(object):
# Tests that are common to HTML and XML tree builders.
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\n<![\xff\xfe\xfe\xcd\x00',
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
# https://bugs.python.org/issue34480
#
#b'<![n\x00'
]
for markup in bad_markup:
with warnings.catch_warnings(record=False):
soup = self.soup(markup)
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
"""A basic test of a treebuilder's competence. """A basic test of a treebuilder's competence.
@ -233,6 +285,22 @@ class HTMLTreeBuilderSmokeTest(object):
new_tag = soup.new_tag(name) new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element) self.assertEqual(True, new_tag.is_empty_element)
def test_special_string_containers(self):
soup = self.soup(
"<style>Some CSS</style><script>Some Javascript</script>"
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
soup = self.soup(
"<style><!--Some CSS--></style>"
)
assert isinstance(soup.style.string, Stylesheet)
# The contents of the style tag resemble an HTML comment, but
# it's not treated as a comment.
self.assertEqual("<!--Some CSS-->", soup.style.string)
assert isinstance(soup.style.string, Stylesheet)
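Outside the test suite, the special string containers look like this (illustrative, not part of the diff):

    from bs4 import BeautifulSoup
    from bs4.element import Script, Stylesheet

    soup = BeautifulSoup(
        "<style>p {color: red}</style><script>alert('hi')</script>",
        "html.parser",
    )
    assert isinstance(soup.style.string, Stylesheet)  # still a NavigableString subclass
    assert isinstance(soup.script.string, Script)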
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
@@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
doctype_str
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_mixed_case_doctype(self):
# A lowercase or mixed-case doctype becomes a Doctype.
for doctype_fragment in ("doctype", "DocType"):
doctype_str, soup = self._document_with_doctype(
"html", doctype_fragment
)
# Make sure a Doctype object was created and that the DOCTYPE
# is uppercase.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, "html")
self.assertEqual(
soup.encode("utf8")[:len(doctype_str)],
b"<!DOCTYPE html>"
)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
@@ -532,7 +624,7 @@ Hello, world!
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -594,7 +686,7 @@ Hello, world!
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
@@ -779,11 +871,44 @@ Hello, world!
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_python_specific_encodings_not_used_in_charset(self):
# You can encode an HTML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document. Instead, the document will appear to
# have no encoding.
for markup in [
b'<meta charset="utf8"></head>'
b'<meta id="encoding" charset="utf-8" />'
]:
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'meta charset=""' in encoded
assert encoding.encode("ascii") not in encoded
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
def test_closing_tag_with_no_opening_tag(self):
# Without BeautifulSoup.open_tag_counter, the </span> tag will
# cause _popToTag to be called over and over again as we look
# for a <span> tag that wasn't there. The result is that 'text2'
# will show up outside the body of the document.
soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
self.assertEqual(
"<body><div><p>text1</p>text2</div></body>", soup.body.decode()
)
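An illustrative sketch of the behavior this test locks in (not part of the commit), using html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<body><div><p>text1</p></span>text2</div></body>", "html.parser")
    # The stray </span> no longer pushes 'text2' outside the enclosing <div>.
    print(soup.body.decode())  # <body><div><p>text1</p>text2</div></body>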
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""
@@ -791,7 +916,7 @@ Hello, world!
self.linkage_validator(soup)
class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
@@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_python_specific_encodings_not_used_in_xml_declaration(self):
# You can encode an XML document using a Python-specific
# encoding, but that encoding won't be mentioned _inside_ the
# resulting document.
markup = b"""<?xml version="1.0"?>\n<foo/>"""
soup = self.soup(markup)
for encoding in PYTHON_SPECIFIC_ENCODINGS:
if encoding in (
'idna', 'mbcs', 'oem', 'undefined',
'string_escape', 'string-escape'
):
# For one reason or another, these will raise an
# exception if we actually try to use them, so don't
# bother.
continue
encoded = soup.encode(encoding)
assert b'<?xml version="1.0"?>' in encoded
assert encoding.encode("ascii") not in encoded
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
@@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_nested_namespaces(self):
doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
@@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup, store_line_numbers=False)
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_special_string_containers(self):
# The html5lib tree builder doesn't support this standard feature,
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
def test_html5_attributes(self):
# The html5lib TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
#
# This is a copy of the same test from
# HTMLParserTreeBuilderSmokeTest. It's not in the superclass
# because the lxml HTML TreeBuilder _doesn't_ work this way.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)
@ -3,6 +3,7 @@ trees."""
from pdb import set_trace from pdb import set_trace
import pickle import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
def test_on_duplicate_attribute(self):
# The html.parser tree builder has a variety of ways of
# handling a tag that contains the same attribute multiple times.
markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
# If you don't provide any particular value for
# on_duplicate_attribute, later values replace earlier values.
soup = self.soup(markup)
self.assertEqual("url3", soup.a['href'])
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
# You can also get this behavior explicitly.
def assert_attribute(on_duplicate_attribute, expected):
soup = self.soup(
markup, on_duplicate_attribute=on_duplicate_attribute
)
self.assertEqual(expected, soup.a['href'])
# Verify that non-duplicate attributes are treated normally.
self.assertEqual(["cls"], soup.a['class'])
self.assertEqual("id", soup.a['id'])
assert_attribute(None, "url3")
assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
# You can ignore subsequent values in favor of the first.
assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
# And you can pass in a callable that does whatever you want.
def accumulate(attrs, key, value):
if not isinstance(attrs[key], list):
attrs[key] = [attrs[key]]
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
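For illustration (not part of the diff), the new constructor argument in ordinary use:

    from bs4 import BeautifulSoup

    markup = '<a href="url1" href="url2">'
    # Default behavior: the last value wins.
    print(BeautifulSoup(markup, "html.parser").a["href"])  # url2
    # 'ignore': keep the first value seen.
    soup = BeautifulSoup(markup, "html.parser", on_duplicate_attribute="ignore")
    print(soup.a["href"])  # url1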
def test_html5_attributes(self):
# The html.parser TreeBuilder can convert any entity named in
# the HTML5 spec to a sequence of Unicode characters, and
# convert those Unicode characters to a (potentially
# different) named entity on the way out.
for input_element, output_unicode, output_element in (
("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
('&models;', '\u22a7', b'&models;'),
('&Nfr;', '\U0001d511', b'&Nfr;'),
('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
('&not;', '\xac', b'&not;'),
('&Not;', '\u2aec', b'&Not;'),
('&quot;', '"', b'"'),
('&there4;', '\u2234', b'&there4;'),
('&Therefore;', '\u2234', b'&there4;'),
('&therefore;', '\u2234', b'&there4;'),
("&fjlig;", 'fj', b'fj'),
("&sqcup;", '\u2294', b'&sqcup;'),
("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
("&apos;", "'", b"'"),
("&verbar;", "|", b"|"),
):
markup = '<div>%s</div>' % input_element
div = self.soup(markup).div
without_element = div.encode()
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
self.assertEqual(without_element, expect)
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
self.assertEqual(with_element, expect)
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
""" """
parser = BeautifulSoupHTMLParser() parser = BeautifulSoupHTMLParser()
parser.error("don't crash") with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
@@ -3,6 +3,7 @@
from pdb import set_trace
import logging
import os
import unittest
import sys
import tempfile
@@ -10,6 +11,8 @@ import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
GuessedAtParserWarning,
MarkupResemblesLocatorWarning,
)
from bs4.builder import (
TreeBuilder,
@@ -29,7 +32,6 @@ import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
)
from bs4.testing import (
default_builder,
@@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
self.store_line_numbers = False
self.cdata_list_attributes = []
self.preserve_whitespace_tags = []
self.string_containers = {}
def initialize_soup(self, soup):
pass
def feed(self, markup):
@@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)
def test_alternate_string_containers(self):
# Test the ability to customize the string containers for
# different types of tags.
class PString(NavigableString):
pass
class BString(NavigableString):
pass
soup = self.soup(
"<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
string_containers = {
'b': BString,
'p': PString,
}
)
# The string before the <p> tag is a regular NavigableString.
assert isinstance(soup.div.contents[0], NavigableString)
# The string inside the <p> tag, but not inside the <i> tag,
# is a PString.
assert isinstance(soup.p.contents[0], PString)
# Every string inside the <b> tag is a BString, even the one that
# was also inside an <i> tag.
for s in soup.b.strings:
assert isinstance(s, BString)
# Now that parsing was complete, the string_container_stack
# (where this information was kept) has been cleared out.
self.assertEqual([], soup.string_container_stack)
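A standalone sketch of the same feature (illustrative only; the EmString class is hypothetical):

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class EmString(NavigableString):
        """Hypothetical container for strings found inside <em> tags."""

    soup = BeautifulSoup(
        "<p>plain <em>emphasized</em></p>", "html.parser",
        string_containers={"em": EmString},
    )
    assert type(soup.em.string) is EmString
    # Strings outside <em> remain plain NavigableStrings.
    assert type(soup.p.contents[0]) is NavigableString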
class TestWarnings(SoupTest):
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
return w
raise Exception("%s warning not found in %r" % cls, warnings)
def _assert_no_parser_specified(self, w):
warning = self._assert_warning(w, GuessedAtParserWarning)
message = str(warning.message)
self.assertTrue(
message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>")
self._assert_no_parser_specified(w)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html")
self._assert_no_parser_specified(w)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
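Illustrative sketch (not part of the commit) of catching the new warning class in application code:

    import warnings
    from bs4 import BeautifulSoup, GuessedAtParserWarning

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        BeautifulSoup("<a><b></b></a>")  # no parser specified
    assert any(isinstance(w.message, GuessedAtParserWarning) for w in caught)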
def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a filename" in str(warning.message))
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual([], w)
def test_directory_warning(self):
try:
filename = tempfile.mkdtemp()
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
self.assertTrue("looks like a directory" in str(warning.message))
finally:
os.rmdir(filename)
# The directory no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual([], w)
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup("http://www.crummyunicode.com/")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
self.assertTrue("looks like a URL" in str(warning.message))
def test_url_warning_with_bytes_and_space(self):
# Here the markup contains something besides a URL, so no warning
# is issued.
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
@@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them
# to named entities.
('&models;', '\u22a7'),
('&Nfr;', '\U0001d511'),
('&ngeqq;', '\u2267\u0338'),
('&not;', '\xac'),
('&Not;', '\u2aec'),
# We _could_ convert | to &verbarr;, but we don't, because
# | is an ASCII character.
('|', '|'),
# Similarly for the fj ligature, which we could convert to
# &fjlig;, but we don't.
("fj", "fj"),
# We do convert _these_ ASCII characters to HTML entities,
# because that's required to generate valid HTML.
('&gt;', '>'),
('&lt;', '<'),
('&amp;', '&'),
):
template = '3 %s 4'
raw = template % u
with_entities = template % entity
self.assertEqual(self.sub.substitute_html(raw), with_entities)
def test_html5_entity_with_variation_selector(self):
# Some HTML5 entities correspond either to a single-character
# Unicode sequence _or_ to the same character plus U+FE00,
# VARIATION SELECTOR 1. We can handle this.
data = "fjords \u2294 penguins"
markup = "fjords &sqcup; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
data = "fjords \u2294\ufe00 penguins"
markup = "fjords &sqcups; penguins"
self.assertEqual(self.sub.substitute_html(data), markup)
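The behavior under test, sketched as ordinary API use (not part of the diff):

    from bs4.dammit import EntitySubstitution

    # Multi-character sequences map to a single named entity where one exists.
    print(EntitySubstitution.substitute_html("fjords \u2294 penguins"))
    # fjords &sqcup; penguins
    print(EntitySubstitution.substitute_html("fjords \u2294\ufe00 penguins"))
    # fjords &sqcups; penguins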
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
@@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
        # Unicode, Dammit would decide the whole document is Windows-1252
        # and decode it into mojibake ("â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ").
        # But if we run it through UnicodeDammit.detwingle, it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
def test_find_declared_encoding(self):
# Test our ability to find a declared encoding inside an
# XML or HTML document.
#
# Even if the document comes in as Unicode, it may be
# interesting to know what encoding was claimed
# originally.
html_unicode = '<html><head><meta charset="utf-8"></head></html>'
html_bytes = html_unicode.encode("ascii")
        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
xml_bytes = xml_unicode.encode("ascii")
m = EncodingDetector.find_declared_encoding
self.assertEqual(None, m(html_unicode, is_html=False))
self.assertEqual("utf-8", m(html_unicode, is_html=True))
self.assertEqual("utf-8", m(html_bytes, is_html=True))
self.assertEqual("iso-8859-1", m(xml_unicode))
self.assertEqual("iso-8859-1", m(xml_bytes))
# Normally, only the first few kilobytes of a document are checked for
# an encoding.
spacer = b' ' * 5000
self.assertEqual(None, m(spacer + html_bytes))
self.assertEqual(None, m(spacer + xml_bytes))
# But you can tell find_declared_encoding to search an entire
# HTML document.
self.assertEqual(
"utf-8",
m(spacer + html_bytes, is_html=True, search_entire_document=True)
)
# The XML encoding declaration has to be the very first thing
# in the document. We'll allow whitespace before the document
# starts, but nothing else.
self.assertEqual(
"iso-8859-1",
m(xml_bytes, search_entire_document=True)
)
self.assertEqual(
None, m(b'a' + xml_bytes, search_entire_document=True)
)
class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns", "")
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        self.assertEqual(a, "tag")

        a = NamespacedAttribute("", "tag")
        self.assertEqual(a, "tag")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)


@@ -27,13 +27,17 @@ from bs4.element import (
    Doctype,
    Formatter,
    NavigableString,
    Script,
    SoupStrainer,
    Stylesheet,
    Tag,
    TemplateString,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)
from soupsieve import SelectorSyntaxError

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
        soup.a.extend(l)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

def test_extend_with_another_tags_contents(self):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
d2.extend(d1)
self.assertEqual('<div id="d1"></div>', d1.decode())
self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)

@@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

def test_replace_with_errors(self):
# Can't replace a tag that's not part of a tree.
a_tag = Tag(name="a")
self.assertRaises(ValueError, a_tag.replace_with, "won't work")
# Can't replace a tag with its parent.
a_tag = self.soup("<a><b></b></a>").a
self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
# Or with a list that includes its parent.
self.assertRaises(ValueError, a_tag.b.replace_with,
"string1", a_tag, "string2")
def test_replace_with_multiple(self):
data = "<a><b></b><c></c></a>"
soup = self.soup(data)
d_tag = soup.new_tag("d")
d_tag.string = "Text In D Tag"
e_tag = soup.new_tag("e")
f_tag = soup.new_tag("f")
a_string = "Random Text"
soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
self.assertEqual(
"<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
soup.decode()
)
        assert soup.b.next_element == d_tag
        assert d_tag.string.next_element == e_tag
        assert e_tag.next_element.string == a_string
        assert e_tag.next_element.next_element == f_tag
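As a standalone sketch of the multi-argument form tested above (`replace_with()` accepts several replacements as of this release):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<a><b></b><c></c></a>", "html.parser")
d_tag = soup.new_tag("d")
soup.c.replace_with(d_tag, "trailing text")   # <c> leaves the tree entirely
print(soup)   # <a><b></b><d></d>trailing text</a>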
    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)

@@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

def test_decompose(self):
# Test PageElement.decompose() and PageElement.decomposed
soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
p1, p2 = soup.find_all('p')
a = p1.a
text = p1.em.string
for i in [p1, p2, a, text]:
self.assertEqual(False, i.decomposed)
# This sets p1 and everything beneath it to decomposed.
p1.decompose()
for i in [p1, a, text]:
self.assertEqual(True, i.decomposed)
# p2 is unaffected.
self.assertEqual(False, p2.decomposed)
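A condensed sketch of the `decomposed` property checked above:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p><a>gone</a></p><p>kept</p>", "html.parser")
p1, p2 = soup.find_all("p")
p1.decompose()         # destroys p1 and everything beneath it
print(p1.decomposed)   # True
print(p2.decomposed)   # False -- siblings are unaffected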
    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")

@@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

@@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(soup.get_text(), "foobar")

    def test_all_strings_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
def test_string_methods_inside_special_string_container_tags(self):
# Strings inside tags like <script> are generally ignored by
# methods like get_text, because they're not what humans
# consider 'text'. But if you call get_text on the <script>
# tag itself, those strings _are_ considered to be 'text',
# because there's nothing else you might be looking for.
style = self.soup("<div>a<style>Some CSS</style></div>")
template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
self.assertEqual(style.div.get_text(), "a")
self.assertEqual(list(style.div.strings), ["a"])
self.assertEqual(style.div.style.get_text(), "Some CSS")
self.assertEqual(list(style.div.style.strings),
['Some CSS'])
# The comment is not picked up here. That's because it was
# parsed into a Comment object, which is not considered
# interesting by template.strings.
self.assertEqual(template.div.get_text(), "a")
self.assertEqual(list(template.div.strings), ["a"])
self.assertEqual(template.div.template.get_text(), "Templated text.")
self.assertEqual(list(template.div.template.strings),
["Templated ", "text", "."])
# The comment is included here, because it didn't get parsed
# into a Comment object--it's part of the Script string.
self.assertEqual(script.div.get_text(), "a")
self.assertEqual(list(script.div.strings), ["a"])
self.assertEqual(script.div.script.get_text(),
"<!--a comment-->Some text")
self.assertEqual(list(script.div.script.strings),
['<!--a comment-->Some text'])
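The rule the comments above describe, as a two-line sketch: container text is invisible to an ancestor's `get_text()`, but visible when you ask the container itself.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div>a<style>b {color: red}</style></div>", "html.parser")
print(soup.div.get_text())         # 'a'
print(soup.div.style.get_text())   # 'b {color: red}'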
class TestCDAtaListAttributes(SoupTest):
    """Testing cdata-list attributes like 'class'.

@@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
        else:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))

class TestFormatter(SoupTest):
def test_sort_attributes(self):
# Test the ability to override Formatter.attributes() to,
# e.g., disable the normal sorting of attributes.
class UnsortedFormatter(Formatter):
def attributes(self, tag):
self.called_with = tag
for k, v in sorted(tag.attrs.items()):
if k == 'ignore':
continue
yield k,v
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
formatter = UnsortedFormatter()
decoded = soup.decode(formatter=formatter)
# attributes() was called on the <p> tag. It filtered out one
# attribute and sorted the other two.
self.assertEqual(formatter.called_with, soup.p)
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
# None of the current builders turn CDATA sections into CData
# objects, but you can create them manually.
soup = self.soup("")
cdata = CData("foo")
soup.insert(1, cdata)
self.assertEqual(str(soup), "<![CDATA[foo]]>")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
def test_cdata_is_never_formatted(self):
"""Text inside a CData object is passed into the formatter.
But the return value is ignored.
"""
self.count = 0
def increment(*args):
self.count += 1
return "BITTER FAILURE"
soup = self.soup("")
cdata = CData("<><><>")
soup.insert(1, cdata)
self.assertEqual(
b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
self.assertEqual(1, self.count)
def test_doctype_ends_in_newline(self):
# Unlike other NavigableString subclasses, a DOCTYPE always ends
# in a newline.
doctype = Doctype("foo")
soup = self.soup("")
soup.insert(1, doctype)
self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
def test_declaration(self):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
class TestSoupSelector(TreeTest):

    HTML = """

@@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
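These assertions track an upstream change: soupsieve now raises its own `SelectorSyntaxError` (a `SyntaxError` subclass, so older except clauses still match). A sketch of catching it:

from bs4 import BeautifulSoup
from soupsieve import SelectorSyntaxError

soup = BeautifulSoup("<p></p>", "html.parser")
try:
    soup.select("h1 >")   # dangling combinator
except SelectorSyntaxError as exc:
    print("invalid selector:", exc)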
@@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
            SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph

@@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

@@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
        self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
        self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb']) self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])