mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 21:21:15 -07:00
Update beautifulsoup4-4.10.0
This commit is contained in:
parent
b581460b51
commit
ab8fa4d5b3
16 changed files with 4599 additions and 743 deletions
|
@ -1,6 +1,5 @@
|
||||||
"""Beautiful Soup
|
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
|
||||||
Elixir and Tonic
|
|
||||||
"The Screen-Scraper's Friend"
|
|
||||||
http://www.crummy.com/software/BeautifulSoup/
|
http://www.crummy.com/software/BeautifulSoup/
|
||||||
|
|
||||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
|
@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||||
provides methods and Pythonic idioms that make it easy to navigate,
|
provides methods and Pythonic idioms that make it easy to navigate,
|
||||||
search, and modify the parse tree.
|
search, and modify the parse tree.
|
||||||
|
|
||||||
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
Beautiful Soup works with Python 3.5 and up. It works better if lxml
|
||||||
and/or html5lib is installed.
|
and/or html5lib is installed.
|
||||||
|
|
||||||
For more than you ever wanted to know about Beautiful Soup, see the
|
For more than you ever wanted to know about Beautiful Soup, see the
|
||||||
documentation:
|
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.8.1"
|
__version__ = "4.10.0"
|
||||||
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
|
||||||
# Use of this source code is governed by the MIT license.
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
# The very first thing we do is give a useful error if someone is
|
||||||
|
# running this code under Python 2.
|
||||||
|
if sys.version_info.major < 3:
|
||||||
|
raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
|
||||||
|
|
||||||
from .builder import builder_registry, ParserRejectedMarkup
|
from .builder import builder_registry, ParserRejectedMarkup
|
||||||
from .dammit import UnicodeDammit
|
from .dammit import UnicodeDammit
|
||||||
from .element import (
|
from .element import (
|
||||||
|
@ -42,28 +46,49 @@ from .element import (
|
||||||
NavigableString,
|
NavigableString,
|
||||||
PageElement,
|
PageElement,
|
||||||
ProcessingInstruction,
|
ProcessingInstruction,
|
||||||
|
PYTHON_SPECIFIC_ENCODINGS,
|
||||||
ResultSet,
|
ResultSet,
|
||||||
|
Script,
|
||||||
|
Stylesheet,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
Tag,
|
Tag,
|
||||||
|
TemplateString,
|
||||||
)
|
)
|
||||||
|
|
||||||
# The very first thing we do is give a useful error if someone is
|
# Define some custom warnings.
|
||||||
# running this code under Python 3 without converting it.
|
class GuessedAtParserWarning(UserWarning):
|
||||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
"""The warning issued when BeautifulSoup has to guess what parser to
|
||||||
|
use -- probably because no parser was specified in the constructor.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class MarkupResemblesLocatorWarning(UserWarning):
|
||||||
|
"""The warning issued when BeautifulSoup is given 'markup' that
|
||||||
|
actually looks like a resource locator -- a URL or a path to a file
|
||||||
|
on disk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class BeautifulSoup(Tag):
|
class BeautifulSoup(Tag):
|
||||||
"""
|
"""A data structure representing a parsed HTML or XML document.
|
||||||
This class defines the basic interface called by the tree builders.
|
|
||||||
|
|
||||||
These methods will be called by the parser:
|
Most of the methods you'll call on a BeautifulSoup object are inherited from
|
||||||
reset()
|
PageElement or Tag.
|
||||||
feed(markup)
|
|
||||||
|
Internally, this class defines the basic interface called by the
|
||||||
|
tree builders when converting an HTML/XML document into a data
|
||||||
|
structure. The interface abstracts away the differences between
|
||||||
|
parsers. To write a new tree builder, you'll need to understand
|
||||||
|
these methods as a whole.
|
||||||
|
|
||||||
|
These methods will be called by the BeautifulSoup constructor:
|
||||||
|
* reset()
|
||||||
|
* feed(markup)
|
||||||
|
|
||||||
The tree builder may call these methods from its feed() implementation:
|
The tree builder may call these methods from its feed() implementation:
|
||||||
handle_starttag(name, attrs) # See note about return value
|
* handle_starttag(name, attrs) # See note about return value
|
||||||
handle_endtag(name)
|
* handle_endtag(name)
|
||||||
handle_data(data) # Appends to the current data node
|
* handle_data(data) # Appends to the current data node
|
||||||
endData(containerClass) # Ends the current data node
|
* endData(containerClass) # Ends the current data node
|
||||||
|
|
||||||
No matter how complicated the underlying parser is, you should be
|
No matter how complicated the underlying parser is, you should be
|
||||||
able to build a tree using 'start tag' events, 'end tag' events,
|
able to build a tree using 'start tag' events, 'end tag' events,
|
||||||
|
@ -73,12 +98,18 @@ class BeautifulSoup(Tag):
|
||||||
like HTML's <br> tag), call handle_starttag and then
|
like HTML's <br> tag), call handle_starttag and then
|
||||||
handle_endtag.
|
handle_endtag.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Since BeautifulSoup subclasses Tag, it's possible to treat it as
|
||||||
|
# a Tag with a .name. This name makes it clear the BeautifulSoup
|
||||||
|
# object isn't a real markup tag.
|
||||||
ROOT_TAG_NAME = '[document]'
|
ROOT_TAG_NAME = '[document]'
|
||||||
|
|
||||||
# If the end-user gives no indication which tree builder they
|
# If the end-user gives no indication which tree builder they
|
||||||
# want, look for one with these features.
|
# want, look for one with these features.
|
||||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||||
|
|
||||||
|
# A string containing all ASCII whitespace characters, used in
|
||||||
|
# endData() to detect data chunks that seem 'empty'.
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
@ -91,12 +122,13 @@ class BeautifulSoup(Tag):
|
||||||
:param markup: A string or a file-like object representing
|
:param markup: A string or a file-like object representing
|
||||||
markup to be parsed.
|
markup to be parsed.
|
||||||
|
|
||||||
:param features: Desirable features of the parser to be used. This
|
:param features: Desirable features of the parser to be
|
||||||
may be the name of a specific parser ("lxml", "lxml-xml",
|
used. This may be the name of a specific parser ("lxml",
|
||||||
"html.parser", or "html5lib") or it may be the type of markup
|
"lxml-xml", "html.parser", or "html5lib") or it may be the
|
||||||
to be used ("html", "html5", "xml"). It's recommended that you
|
type of markup to be used ("html", "html5", "xml"). It's
|
||||||
name a specific parser, so that Beautiful Soup gives you the
|
recommended that you name a specific parser, so that
|
||||||
same results across platforms and virtual environments.
|
Beautiful Soup gives you the same results across platforms
|
||||||
|
and virtual environments.
|
||||||
|
|
||||||
:param builder: A TreeBuilder subclass to instantiate (or
|
:param builder: A TreeBuilder subclass to instantiate (or
|
||||||
instance to use) instead of looking one up based on
|
instance to use) instead of looking one up based on
|
||||||
|
@ -118,23 +150,23 @@ class BeautifulSoup(Tag):
|
||||||
wrong.
|
wrong.
|
||||||
|
|
||||||
:param element_classes: A dictionary mapping BeautifulSoup
|
:param element_classes: A dictionary mapping BeautifulSoup
|
||||||
classes like Tag and NavigableString to other classes you'd
|
classes like Tag and NavigableString to other classes you'd
|
||||||
like to be instantiated instead as the parse tree is
|
like to be instantiated instead as the parse tree is
|
||||||
built. This is useful for using subclasses to modify the
|
built. This is useful for subclassing Tag or NavigableString
|
||||||
default behavior of Tag or NavigableString.
|
to modify default behavior.
|
||||||
|
|
||||||
:param kwargs: For backwards compatibility purposes, the
|
:param kwargs: For backwards compatibility purposes, the
|
||||||
constructor accepts certain keyword arguments used in
|
constructor accepts certain keyword arguments used in
|
||||||
Beautiful Soup 3. None of these arguments do anything in
|
Beautiful Soup 3. None of these arguments do anything in
|
||||||
Beautiful Soup 4; they will result in a warning and then be ignored.
|
Beautiful Soup 4; they will result in a warning and then be
|
||||||
|
ignored.
|
||||||
Apart from this, any keyword arguments passed into the BeautifulSoup
|
|
||||||
constructor are propagated to the TreeBuilder constructor. This
|
|
||||||
makes it possible to configure a TreeBuilder beyond saying
|
|
||||||
which one to use.
|
|
||||||
|
|
||||||
|
Apart from this, any keyword arguments passed into the
|
||||||
|
BeautifulSoup constructor are propagated to the TreeBuilder
|
||||||
|
constructor. This makes it possible to configure a
|
||||||
|
TreeBuilder by passing in arguments, not just by saying which
|
||||||
|
one to use.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if 'convertEntities' in kwargs:
|
if 'convertEntities' in kwargs:
|
||||||
del kwargs['convertEntities']
|
del kwargs['convertEntities']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
|
@ -223,7 +255,9 @@ class BeautifulSoup(Tag):
|
||||||
if not original_builder and not (
|
if not original_builder and not (
|
||||||
original_features == builder.NAME or
|
original_features == builder.NAME or
|
||||||
original_features in builder.ALTERNATE_NAMES
|
original_features in builder.ALTERNATE_NAMES
|
||||||
):
|
) and markup:
|
||||||
|
# The user did not tell us which TreeBuilder to use,
|
||||||
|
# and we had to guess. Issue a warning.
|
||||||
if builder.is_xml:
|
if builder.is_xml:
|
||||||
markup_type = "XML"
|
markup_type = "XML"
|
||||||
else:
|
else:
|
||||||
|
@ -257,7 +291,10 @@ class BeautifulSoup(Tag):
|
||||||
parser=builder.NAME,
|
parser=builder.NAME,
|
||||||
markup_type=markup_type
|
markup_type=markup_type
|
||||||
)
|
)
|
||||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
warnings.warn(
|
||||||
|
self.NO_PARSER_SPECIFIED_WARNING % values,
|
||||||
|
GuessedAtParserWarning, stacklevel=2
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if kwargs:
|
if kwargs:
|
||||||
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||||
|
@ -286,20 +323,32 @@ class BeautifulSoup(Tag):
|
||||||
else:
|
else:
|
||||||
possible_filename = markup
|
possible_filename = markup
|
||||||
is_file = False
|
is_file = False
|
||||||
|
is_directory = False
|
||||||
try:
|
try:
|
||||||
is_file = os.path.exists(possible_filename)
|
is_file = os.path.exists(possible_filename)
|
||||||
|
if is_file:
|
||||||
|
is_directory = os.path.isdir(possible_filename)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# This is almost certainly a problem involving
|
# This is almost certainly a problem involving
|
||||||
# characters not valid in filenames on this
|
# characters not valid in filenames on this
|
||||||
# system. Just let it go.
|
# system. Just let it go.
|
||||||
pass
|
pass
|
||||||
if is_file:
|
if is_directory:
|
||||||
if isinstance(markup, str):
|
warnings.warn(
|
||||||
markup = markup.encode("utf8")
|
'"%s" looks like a directory name, not markup. You may'
|
||||||
|
' want to open a file found in this directory and pass'
|
||||||
|
' the filehandle into Beautiful Soup.' % (
|
||||||
|
self._decode_markup(markup)
|
||||||
|
),
|
||||||
|
MarkupResemblesLocatorWarning
|
||||||
|
)
|
||||||
|
elif is_file:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should'
|
'"%s" looks like a filename, not markup. You should'
|
||||||
' probably open this file and pass the filehandle into'
|
' probably open this file and pass the filehandle into'
|
||||||
' Beautiful Soup.' % markup)
|
' Beautiful Soup.' % self._decode_markup(markup),
|
||||||
|
MarkupResemblesLocatorWarning
|
||||||
|
)
|
||||||
self._check_markup_is_url(markup)
|
self._check_markup_is_url(markup)
|
||||||
|
|
||||||
rejections = []
|
rejections = []
|
||||||
|
@ -329,6 +378,7 @@ class BeautifulSoup(Tag):
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
def __copy__(self):
|
def __copy__(self):
|
||||||
|
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
|
||||||
copy = type(self)(
|
copy = type(self)(
|
||||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||||
)
|
)
|
||||||
|
@ -347,11 +397,25 @@ class BeautifulSoup(Tag):
|
||||||
d['builder'] = None
|
d['builder'] = None
|
||||||
return d
|
return d
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def _check_markup_is_url(markup):
|
def _decode_markup(cls, markup):
|
||||||
|
"""Ensure `markup` is a Unicode string so it's safe to send into warnings.warn.
|
||||||
|
|
||||||
|
TODO: warnings.warn had this problem back in 2010 but it might not
|
||||||
|
anymore.
|
||||||
"""
|
"""
|
||||||
Check if markup looks like it's actually a url and raise a warning
|
if isinstance(markup, bytes):
|
||||||
if so. Markup can be unicode or str (py2) / bytes (py3).
|
decoded = markup.decode('utf-8', 'replace')
|
||||||
|
else:
|
||||||
|
decoded = markup
|
||||||
|
return decoded
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _check_markup_is_url(cls, markup):
|
||||||
|
"""Error-handling method to raise a warning if incoming markup looks
|
||||||
|
like a URL.
|
||||||
|
|
||||||
|
:param markup: A string.
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, bytes):
|
if isinstance(markup, bytes):
|
||||||
space = b' '
|
space = b' '
|
||||||
|
@ -364,18 +428,20 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||||
if not space in markup:
|
if not space in markup:
|
||||||
if isinstance(markup, bytes):
|
|
||||||
decoded_markup = markup.decode('utf-8', 'replace')
|
|
||||||
else:
|
|
||||||
decoded_markup = markup
|
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a URL. Beautiful Soup is not an'
|
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||||
' HTTP client. You should probably use an HTTP client like'
|
' HTTP client. You should probably use an HTTP client like'
|
||||||
' requests to get the document behind the URL, and feed'
|
' requests to get the document behind the URL, and feed'
|
||||||
' that document to Beautiful Soup.' % decoded_markup
|
' that document to Beautiful Soup.' % cls._decode_markup(
|
||||||
|
markup
|
||||||
|
),
|
||||||
|
MarkupResemblesLocatorWarning
|
||||||
)
|
)
|
||||||
|
|
||||||
def _feed(self):
|
def _feed(self):
|
||||||
|
"""Internal method that parses previously set markup, creating a large
|
||||||
|
number of Tag and NavigableString objects.
|
||||||
|
"""
|
||||||
# Convert the document to Unicode.
|
# Convert the document to Unicode.
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
|
|
||||||
|
@ -386,66 +452,110 @@ class BeautifulSoup(Tag):
|
||||||
self.popTag()
|
self.popTag()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
|
"""Reset this object to a state as though it had never parsed any
|
||||||
|
markup.
|
||||||
|
"""
|
||||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||||
self.hidden = 1
|
self.hidden = 1
|
||||||
self.builder.reset()
|
self.builder.reset()
|
||||||
self.current_data = []
|
self.current_data = []
|
||||||
self.currentTag = None
|
self.currentTag = None
|
||||||
self.tagStack = []
|
self.tagStack = []
|
||||||
|
self.open_tag_counter = Counter()
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
|
self.string_container_stack = []
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||||
sourceline=None, sourcepos=None, **kwattrs):
|
sourceline=None, sourcepos=None, **kwattrs):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new Tag associated with this BeautifulSoup object.
|
||||||
|
|
||||||
|
:param name: The name of the new Tag.
|
||||||
|
:param namespace: The URI of the new Tag's XML namespace, if any.
|
||||||
|
:param nsprefix: The prefix for the new Tag's XML namespace, if any.
|
||||||
|
:param attrs: A dictionary of this Tag's attribute values; can
|
||||||
|
be used instead of `kwattrs` for attributes like 'class'
|
||||||
|
that are reserved words in Python.
|
||||||
|
:param sourceline: The line number where this tag was
|
||||||
|
(purportedly) found in its source document.
|
||||||
|
:param sourcepos: The character position within `sourceline` where this
|
||||||
|
tag was (purportedly) found.
|
||||||
|
:param kwattrs: Keyword arguments for the new Tag's attribute values.
|
||||||
|
|
||||||
|
"""
|
||||||
kwattrs.update(attrs)
|
kwattrs.update(attrs)
|
||||||
return self.element_classes.get(Tag, Tag)(
|
return self.element_classes.get(Tag, Tag)(
|
||||||
None, self.builder, name, namespace, nsprefix, kwattrs,
|
None, self.builder, name, namespace, nsprefix, kwattrs,
|
||||||
sourceline=sourceline, sourcepos=sourcepos
|
sourceline=sourceline, sourcepos=sourcepos
|
||||||
)
|
)
|
||||||
|
|
||||||
def new_string(self, s, subclass=None):
|
def string_container(self, base_class=None):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
container = base_class or NavigableString
|
||||||
subclass = subclass or self.element_classes.get(
|
|
||||||
NavigableString, NavigableString
|
|
||||||
)
|
|
||||||
return subclass(s)
|
|
||||||
|
|
||||||
def insert_before(self, successor):
|
# There may be a general override of NavigableString.
|
||||||
|
container = self.element_classes.get(
|
||||||
|
container, container
|
||||||
|
)
|
||||||
|
|
||||||
|
# On top of that, we may be inside a tag that needs a special
|
||||||
|
# container class.
|
||||||
|
if self.string_container_stack and container is NavigableString:
|
||||||
|
container = self.builder.string_containers.get(
|
||||||
|
self.string_container_stack[-1].name, container
|
||||||
|
)
|
||||||
|
return container
|
||||||
|
|
||||||
|
def new_string(self, s, subclass=None):
|
||||||
|
"""Create a new NavigableString associated with this BeautifulSoup
|
||||||
|
object.
|
||||||
|
"""
|
||||||
|
container = self.string_container(subclass)
|
||||||
|
return container(s)
|
||||||
|
|
||||||
|
def insert_before(self, *args):
|
||||||
|
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
|
||||||
|
it because there is nothing before or after it in the parse tree.
|
||||||
|
"""
|
||||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||||
|
|
||||||
def insert_after(self, successor):
|
def insert_after(self, *args):
|
||||||
|
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
|
||||||
|
it because there is nothing before or after it in the parse tree.
|
||||||
|
"""
|
||||||
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||||
|
|
||||||
def popTag(self):
|
def popTag(self):
|
||||||
|
"""Internal method called by _popToTag when a tag is closed."""
|
||||||
tag = self.tagStack.pop()
|
tag = self.tagStack.pop()
|
||||||
|
if tag.name in self.open_tag_counter:
|
||||||
|
self.open_tag_counter[tag.name] -= 1
|
||||||
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
||||||
self.preserve_whitespace_tag_stack.pop()
|
self.preserve_whitespace_tag_stack.pop()
|
||||||
#print "Pop", tag.name
|
if self.string_container_stack and tag == self.string_container_stack[-1]:
|
||||||
|
self.string_container_stack.pop()
|
||||||
|
#print("Pop", tag.name)
|
||||||
if self.tagStack:
|
if self.tagStack:
|
||||||
self.currentTag = self.tagStack[-1]
|
self.currentTag = self.tagStack[-1]
|
||||||
return self.currentTag
|
return self.currentTag
|
||||||
|
|
||||||
def pushTag(self, tag):
|
def pushTag(self, tag):
|
||||||
#print "Push", tag.name
|
"""Internal method called by handle_starttag when a tag is opened."""
|
||||||
|
#print("Push", tag.name)
|
||||||
if self.currentTag is not None:
|
if self.currentTag is not None:
|
||||||
self.currentTag.contents.append(tag)
|
self.currentTag.contents.append(tag)
|
||||||
self.tagStack.append(tag)
|
self.tagStack.append(tag)
|
||||||
self.currentTag = self.tagStack[-1]
|
self.currentTag = self.tagStack[-1]
|
||||||
|
if tag.name != self.ROOT_TAG_NAME:
|
||||||
|
self.open_tag_counter[tag.name] += 1
|
||||||
if tag.name in self.builder.preserve_whitespace_tags:
|
if tag.name in self.builder.preserve_whitespace_tags:
|
||||||
self.preserve_whitespace_tag_stack.append(tag)
|
self.preserve_whitespace_tag_stack.append(tag)
|
||||||
|
if tag.name in self.builder.string_containers:
|
||||||
|
self.string_container_stack.append(tag)
|
||||||
|
|
||||||
def endData(self, containerClass=None):
|
def endData(self, containerClass=None):
|
||||||
|
"""Method called by the TreeBuilder when the end of a data segment
|
||||||
# Default container is NavigableString.
|
occurs.
|
||||||
containerClass = containerClass or NavigableString
|
"""
|
||||||
|
|
||||||
# The user may want us to instantiate some alias for the
|
|
||||||
# container class.
|
|
||||||
containerClass = self.element_classes.get(
|
|
||||||
containerClass, containerClass
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.current_data:
|
if self.current_data:
|
||||||
current_data = ''.join(self.current_data)
|
current_data = ''.join(self.current_data)
|
||||||
# If whitespace is not preserved, and this string contains
|
# If whitespace is not preserved, and this string contains
|
||||||
|
@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
|
||||||
not self.parse_only.search(current_data)):
|
not self.parse_only.search(current_data)):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
containerClass = self.string_container(containerClass)
|
||||||
o = containerClass(current_data)
|
o = containerClass(current_data)
|
||||||
self.object_was_parsed(o)
|
self.object_was_parsed(o)
|
||||||
|
|
||||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Method called by the TreeBuilder to integrate an object into the parse tree."""
|
||||||
if parent is None:
|
if parent is None:
|
||||||
parent = self.currentTag
|
parent = self.currentTag
|
||||||
if most_recent_element is not None:
|
if most_recent_element is not None:
|
||||||
|
@ -545,10 +656,19 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
instance of the given tag. If inclusivePop is false, pops the tag
|
instance of the given tag.
|
||||||
stack up to but *not* including the most recent instqance of
|
|
||||||
the given tag."""
|
If there are no open tags with the given name, nothing will be
|
||||||
#print "Popping to %s" % name
|
popped.
|
||||||
|
|
||||||
|
:param name: Pop up to the most recent tag with this name.
|
||||||
|
:param nsprefix: The namespace prefix that goes with `name`.
|
||||||
|
:param inclusivePop: If this is false, pops the tag stack up
|
||||||
|
to but *not* including the most recent instance of the
|
||||||
|
given tag.
|
||||||
|
|
||||||
|
"""
|
||||||
|
#print("Popping to %s" % name)
|
||||||
if name == self.ROOT_TAG_NAME:
|
if name == self.ROOT_TAG_NAME:
|
||||||
# The BeautifulSoup object itself can never be popped.
|
# The BeautifulSoup object itself can never be popped.
|
||||||
return
|
return
|
||||||
|
@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
stack_size = len(self.tagStack)
|
stack_size = len(self.tagStack)
|
||||||
for i in range(stack_size - 1, 0, -1):
|
for i in range(stack_size - 1, 0, -1):
|
||||||
|
if not self.open_tag_counter.get(name):
|
||||||
|
break
|
||||||
t = self.tagStack[i]
|
t = self.tagStack[i]
|
||||||
if (name == t.name and nsprefix == t.prefix):
|
if (name == t.name and nsprefix == t.prefix):
|
||||||
if inclusivePop:
|
if inclusivePop:
|
||||||
|
@ -568,15 +690,22 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
||||||
sourcepos=None):
|
sourcepos=None):
|
||||||
"""Push a start tag on to the stack.
|
"""Called by the tree builder when a new tag is encountered.
|
||||||
|
|
||||||
If this method returns None, the tag was rejected by the
|
:param name: Name of the tag.
|
||||||
|
:param nsprefix: Namespace prefix for the tag.
|
||||||
|
:param attrs: A dictionary of attribute values.
|
||||||
|
:param sourceline: The line number where this tag was found in its
|
||||||
|
source document.
|
||||||
|
:param sourcepos: The character position within `sourceline` where this
|
||||||
|
tag was found.
|
||||||
|
|
||||||
|
If this method returns None, the tag was rejected by an active
|
||||||
SoupStrainer. You should proceed as if the tag had not occurred
|
SoupStrainer. You should proceed as if the tag had not occurred
|
||||||
in the document. For instance, if this was a self-closing tag,
|
in the document. For instance, if this was a self-closing tag,
|
||||||
don't call handle_endtag.
|
don't call handle_endtag.
|
||||||
"""
|
"""
|
||||||
|
# print("Start tag %s: %s" % (name, attrs))
|
||||||
# print "Start tag %s: %s" % (name, attrs)
|
|
||||||
self.endData()
|
self.endData()
|
||||||
|
|
||||||
if (self.parse_only and len(self.tagStack) <= 1
|
if (self.parse_only and len(self.tagStack) <= 1
|
||||||
|
@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
def handle_endtag(self, name, nsprefix=None):
|
def handle_endtag(self, name, nsprefix=None):
|
||||||
#print "End tag: " + name
|
"""Called by the tree builder when an ending tag is encountered.
|
||||||
|
|
||||||
|
:param name: Name of the tag.
|
||||||
|
:param nsprefix: Namespace prefix for the tag.
|
||||||
|
"""
|
||||||
|
#print("End tag: " + name)
|
||||||
self.endData()
|
self.endData()
|
||||||
self._popToTag(name, nsprefix)
|
self._popToTag(name, nsprefix)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
"""Called by the tree builder when a chunk of textual data is encountered."""
|
||||||
self.current_data.append(data)
|
self.current_data.append(data)
|
||||||
|
|
||||||
def decode(self, pretty_print=False,
|
def decode(self, pretty_print=False,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal"):
|
||||||
"""Returns a string or Unicode representation of this document.
|
"""Returns a string or Unicode representation of the parse tree
|
||||||
To get Unicode, pass None for encoding."""
|
as an HTML or XML document.
|
||||||
|
|
||||||
|
:param pretty_print: If this is True, indentation will be used to
|
||||||
|
make the document more readable.
|
||||||
|
:param eventual_encoding: The encoding of the final document.
|
||||||
|
If this is None, the document will be a Unicode string.
|
||||||
|
"""
|
||||||
if self.is_xml:
|
if self.is_xml:
|
||||||
# Print the XML declaration
|
# Print the XML declaration
|
||||||
encoding_part = ''
|
encoding_part = ''
|
||||||
|
if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
|
||||||
|
# This is a special Python encoding; it can't actually
|
||||||
|
# go into an XML document because it means nothing
|
||||||
|
# outside of Python.
|
||||||
|
eventual_encoding = None
|
||||||
if eventual_encoding != None:
|
if eventual_encoding != None:
|
||||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||||
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||||
|
@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
|
||||||
return prefix + super(BeautifulSoup, self).decode(
|
return prefix + super(BeautifulSoup, self).decode(
|
||||||
indent_level, eventual_encoding, formatter)
|
indent_level, eventual_encoding, formatter)
|
||||||
|
|
||||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
||||||
_s = BeautifulSoup
|
_s = BeautifulSoup
|
||||||
_soup = BeautifulSoup
|
_soup = BeautifulSoup
|
||||||
|
|
||||||
|
@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):
|
||||||
|
|
||||||
|
|
||||||
class StopParsing(Exception):
|
class StopParsing(Exception):
|
||||||
|
"""Exception raised by a TreeBuilder if it's unable to continue parsing."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class FeatureNotFound(ValueError):
|
class FeatureNotFound(ValueError):
|
||||||
|
"""Exception raised by the BeautifulSoup constructor if no parser with the
|
||||||
|
requested features is found.
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
#By default, act as an HTML pretty-printer.
|
#If this file is run as a script, act as an HTML pretty-printer.
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
soup = BeautifulSoup(sys.stdin)
|
soup = BeautifulSoup(sys.stdin)
|
||||||
print(soup.prettify())
|
print((soup.prettify()))
|
||||||
|
|
|
@ -7,8 +7,11 @@ import sys
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
CharsetMetaAttributeValue,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
|
Stylesheet,
|
||||||
|
Script,
|
||||||
|
TemplateString,
|
||||||
nonwhitespace_re
|
nonwhitespace_re
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTMLTreeBuilder',
|
'HTMLTreeBuilder',
|
||||||
|
@ -27,18 +30,33 @@ HTML_5 = 'html5'
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilderRegistry(object):
|
class TreeBuilderRegistry(object):
|
||||||
|
"""A way of looking up TreeBuilder subclasses by their name or by desired
|
||||||
|
features.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.builders_for_feature = defaultdict(list)
|
self.builders_for_feature = defaultdict(list)
|
||||||
self.builders = []
|
self.builders = []
|
||||||
|
|
||||||
def register(self, treebuilder_class):
|
def register(self, treebuilder_class):
|
||||||
"""Register a treebuilder based on its advertised features."""
|
"""Register a treebuilder based on its advertised features.
|
||||||
|
|
||||||
|
:param treebuilder_class: A subclass of Treebuilder. its .features
|
||||||
|
attribute should list its features.
|
||||||
|
"""
|
||||||
for feature in treebuilder_class.features:
|
for feature in treebuilder_class.features:
|
||||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||||
self.builders.insert(0, treebuilder_class)
|
self.builders.insert(0, treebuilder_class)
|
||||||
|
|
||||||
def lookup(self, *features):
|
def lookup(self, *features):
|
||||||
|
"""Look up a TreeBuilder subclass with the desired features.
|
||||||
|
|
||||||
|
:param features: A list of features to look for. If none are
|
||||||
|
provided, the most recently registered TreeBuilder subclass
|
||||||
|
will be used.
|
||||||
|
:return: A TreeBuilder subclass, or None if there's no
|
||||||
|
registered subclass with all the requested features.
|
||||||
|
"""
|
||||||
if len(self.builders) == 0:
|
if len(self.builders) == 0:
|
||||||
# There are no builders at all.
|
# There are no builders at all.
|
||||||
return None
|
return None
|
||||||
|
@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
|
||||||
builder_registry = TreeBuilderRegistry()
|
builder_registry = TreeBuilderRegistry()
|
||||||
|
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Turn a document into a Beautiful Soup object tree."""
|
"""Turn a textual document into a Beautiful Soup object tree."""
|
||||||
|
|
||||||
NAME = "[Unknown tree builder]"
|
NAME = "[Unknown tree builder]"
|
||||||
ALTERNATE_NAMES = []
|
ALTERNATE_NAMES = []
|
||||||
|
@ -96,8 +114,13 @@ class TreeBuilder(object):
|
||||||
# comma-separated list of CDATA, rather than a single CDATA.
|
# comma-separated list of CDATA, rather than a single CDATA.
|
||||||
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||||
|
|
||||||
|
# Whitespace should be preserved inside these tags.
|
||||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||||
|
|
||||||
|
# The textual contents of tags with these names should be
|
||||||
|
# instantiated with some class other than NavigableString.
|
||||||
|
DEFAULT_STRING_CONTAINERS = {}
|
||||||
|
|
||||||
USE_DEFAULT = object()
|
USE_DEFAULT = object()
|
||||||
|
|
||||||
# Most parsers don't keep track of line numbers.
|
# Most parsers don't keep track of line numbers.
|
||||||
|
@ -105,12 +128,14 @@ class TreeBuilder(object):
|
||||||
|
|
||||||
def __init__(self, multi_valued_attributes=USE_DEFAULT,
|
def __init__(self, multi_valued_attributes=USE_DEFAULT,
|
||||||
preserve_whitespace_tags=USE_DEFAULT,
|
preserve_whitespace_tags=USE_DEFAULT,
|
||||||
store_line_numbers=USE_DEFAULT):
|
store_line_numbers=USE_DEFAULT,
|
||||||
|
string_containers=USE_DEFAULT,
|
||||||
|
):
|
||||||
"""Constructor.
|
"""Constructor.
|
||||||
|
|
||||||
:param multi_valued_attributes: If this is set to None, the
|
:param multi_valued_attributes: If this is set to None, the
|
||||||
TreeBuilder will not turn any values for attributes like
|
TreeBuilder will not turn any values for attributes like
|
||||||
'class' into lists. Setting this do a dictionary will
|
'class' into lists. Setting this to a dictionary will
|
||||||
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
for an example.
|
for an example.
|
||||||
|
|
||||||
|
@ -120,7 +145,14 @@ class TreeBuilder(object):
|
||||||
|
|
||||||
:param preserve_whitespace_tags: A list of tags to treat
|
:param preserve_whitespace_tags: A list of tags to treat
|
||||||
the way <pre> tags are treated in HTML. Tags in this list
|
the way <pre> tags are treated in HTML. Tags in this list
|
||||||
will have
|
are immune from pretty-printing; their contents will always be
|
||||||
|
output as-is.
|
||||||
|
|
||||||
|
:param string_containers: A dictionary mapping tag names to
|
||||||
|
the classes that should be instantiated to contain the textual
|
||||||
|
contents of those tags. The default is to use NavigableString
|
||||||
|
for every tag, no matter what the name. You can override the
|
||||||
|
default by changing DEFAULT_STRING_CONTAINERS.
|
||||||
|
|
||||||
:param store_line_numbers: If the parser keeps track of the
|
:param store_line_numbers: If the parser keeps track of the
|
||||||
line numbers and positions of the original markup, that
|
line numbers and positions of the original markup, that
|
||||||
|
@ -140,14 +172,24 @@ class TreeBuilder(object):
|
||||||
if store_line_numbers == self.USE_DEFAULT:
|
if store_line_numbers == self.USE_DEFAULT:
|
||||||
store_line_numbers = self.TRACKS_LINE_NUMBERS
|
store_line_numbers = self.TRACKS_LINE_NUMBERS
|
||||||
self.store_line_numbers = store_line_numbers
|
self.store_line_numbers = store_line_numbers
|
||||||
|
if string_containers == self.USE_DEFAULT:
|
||||||
|
string_containers = self.DEFAULT_STRING_CONTAINERS
|
||||||
|
self.string_containers = string_containers
|
||||||
|
|
||||||
def initialize_soup(self, soup):
|
def initialize_soup(self, soup):
|
||||||
"""The BeautifulSoup object has been initialized and is now
|
"""The BeautifulSoup object has been initialized and is now
|
||||||
being associated with the TreeBuilder.
|
being associated with the TreeBuilder.
|
||||||
|
|
||||||
|
:param soup: A BeautifulSoup object.
|
||||||
"""
|
"""
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
|
"""Do any work necessary to reset the underlying parser
|
||||||
|
for a new document.
|
||||||
|
|
||||||
|
By default, this does nothing.
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def can_be_empty_element(self, tag_name):
|
def can_be_empty_element(self, tag_name):
|
||||||
|
@ -159,23 +201,57 @@ class TreeBuilder(object):
|
||||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||||
an empty-element tag (it's not in
|
an empty-element tag (it's not in
|
||||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||||
will be presented as "<p></p>", not "<p />".
|
will be presented as "<p></p>", not "<p/>" or "<p>".
|
||||||
|
|
||||||
The default implementation has no opinion about which tags are
|
The default implementation has no opinion about which tags are
|
||||||
empty-element tags, so a tag will be presented as an
|
empty-element tags, so a tag will be presented as an
|
||||||
empty-element tag if and only if it has no contents.
|
empty-element tag if and only if it has no children.
|
||||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
"<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
|
||||||
be left alone.
|
be left alone.
|
||||||
|
|
||||||
|
:param tag_name: The name of a markup tag.
|
||||||
"""
|
"""
|
||||||
if self.empty_element_tags is None:
|
if self.empty_element_tags is None:
|
||||||
return True
|
return True
|
||||||
return tag_name in self.empty_element_tags
|
return tag_name in self.empty_element_tags
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
|
"""Run some incoming markup through some parsing process,
|
||||||
|
populating the `BeautifulSoup` object in self.soup.
|
||||||
|
|
||||||
|
This method is not implemented in TreeBuilder; it must be
|
||||||
|
implemented in subclasses.
|
||||||
|
|
||||||
|
:return: None.
|
||||||
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None, exclude_encodings=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
|
"""Run any preliminary steps necessary to make incoming markup
|
||||||
|
acceptable to the parser.
|
||||||
|
|
||||||
|
:param markup: Some markup -- probably a bytestring.
|
||||||
|
:param user_specified_encoding: The user asked to try this encoding.
|
||||||
|
:param document_declared_encoding: The markup itself claims to be
|
||||||
|
in this encoding. NOTE: This argument is not used by the
|
||||||
|
calling code and can probably be removed.
|
||||||
|
:param exclude_encodings: The user asked _not_ to try any of
|
||||||
|
these encodings.
|
||||||
|
|
||||||
|
:yield: A series of 4-tuples:
|
||||||
|
(markup, encoding, declared encoding,
|
||||||
|
has undergone character replacement)
|
||||||
|
|
||||||
|
Each 4-tuple represents a strategy for converting the
|
||||||
|
document to Unicode and parsing it. Each strategy will be tried
|
||||||
|
in turn.
|
||||||
|
|
||||||
|
By default, the only strategy is to parse the markup
|
||||||
|
as-is. See `LXMLTreeBuilderForXML` and
|
||||||
|
`HTMLParserTreeBuilder` for implementations that take into
|
||||||
|
account the quirks of particular parsers.
|
||||||
|
"""
|
||||||
yield markup, None, None, False
|
yield markup, None, None, False
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
|
@ -188,16 +264,36 @@ class TreeBuilder(object):
|
||||||
results against other HTML fragments.
|
results against other HTML fragments.
|
||||||
|
|
||||||
This method should not be used outside of tests.
|
This method should not be used outside of tests.
|
||||||
|
|
||||||
|
:param fragment: A string -- fragment of HTML.
|
||||||
|
:return: A string -- a full HTML document.
|
||||||
"""
|
"""
|
||||||
return fragment
|
return fragment
|
||||||
|
|
||||||
def set_up_substitutions(self, tag):
|
def set_up_substitutions(self, tag):
|
||||||
|
"""Set up any substitutions that will need to be performed on
|
||||||
|
a `Tag` when it's output as a string.
|
||||||
|
|
||||||
|
By default, this does nothing. See `HTMLTreeBuilder` for a
|
||||||
|
case where this is used.
|
||||||
|
|
||||||
|
:param tag: A `Tag`
|
||||||
|
:return: Whether or not a substitution was performed.
|
||||||
|
"""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
"""When an attribute value is associated with a tag that can
|
||||||
|
have multiple values for that attribute, convert the string
|
||||||
|
value to a list of strings.
|
||||||
|
|
||||||
Modifies its input in place.
|
Basically, replaces class="foo bar" with class=["foo", "bar"]
|
||||||
|
|
||||||
|
NOTE: This method modifies its input in place.
|
||||||
|
|
||||||
|
:param tag_name: The name of a tag.
|
||||||
|
:param attrs: A dictionary containing the tag's attributes.
|
||||||
|
Any appropriate attribute values will be modified in place.
|
||||||
"""
|
"""
|
||||||
if not attrs:
|
if not attrs:
|
||||||
return attrs
|
return attrs
|
||||||
|
@ -225,7 +321,11 @@ class TreeBuilder(object):
|
||||||
return attrs
|
return attrs
|
||||||
|
|
||||||
class SAXTreeBuilder(TreeBuilder):
|
class SAXTreeBuilder(TreeBuilder):
|
||||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
"""A Beautiful Soup treebuilder that listens for SAX events.
|
||||||
|
|
||||||
|
This is not currently used for anything, but it demonstrates
|
||||||
|
how a simple TreeBuilder would work.
|
||||||
|
"""
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder):
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||||
#print "Start %s, %r" % (name, attrs)
|
#print("Start %s, %r" % (name, attrs))
|
||||||
self.soup.handle_starttag(name, attrs)
|
self.soup.handle_starttag(name, attrs)
|
||||||
|
|
||||||
def endElement(self, name):
|
def endElement(self, name):
|
||||||
#print "End %s" % name
|
#print("End %s" % name)
|
||||||
self.soup.handle_endtag(name)
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||||
|
@ -290,6 +390,22 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
# you need to use it.
|
# you need to use it.
|
||||||
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||||
|
|
||||||
|
# The HTML standard defines an unusual content model for these tags.
|
||||||
|
# We represent this by using a string class other than NavigableString
|
||||||
|
# inside these tags.
|
||||||
|
#
|
||||||
|
# I made this list by going through the HTML spec
|
||||||
|
# (https://html.spec.whatwg.org/#metadata-content) and looking for
|
||||||
|
# "metadata content" elements that can contain strings.
|
||||||
|
#
|
||||||
|
# TODO: Arguably <noscript> could go here but it seems
|
||||||
|
# qualitatively different from the other tags.
|
||||||
|
DEFAULT_STRING_CONTAINERS = {
|
||||||
|
'style': Stylesheet,
|
||||||
|
'script': Script,
|
||||||
|
'template': TemplateString,
|
||||||
|
}
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
# class="foo bar" means that the 'class' attribute has two values,
|
# class="foo bar" means that the 'class' attribute has two values,
|
||||||
|
@ -317,6 +433,16 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||||
|
|
||||||
def set_up_substitutions(self, tag):
|
def set_up_substitutions(self, tag):
|
||||||
|
"""Replace the declared encoding in a <meta> tag with a placeholder,
|
||||||
|
to be substituted when the tag is output to a string.
|
||||||
|
|
||||||
|
An HTML document may come in to Beautiful Soup as one
|
||||||
|
encoding, but exit in a different encoding, and the <meta> tag
|
||||||
|
needs to be changed to reflect this.
|
||||||
|
|
||||||
|
:param tag: A `Tag`
|
||||||
|
:return: Whether or not a substitution was performed.
|
||||||
|
"""
|
||||||
# We are only interested in <meta> tags
|
# We are only interested in <meta> tags
|
||||||
if tag.name != 'meta':
|
if tag.name != 'meta':
|
||||||
return False
|
return False
|
||||||
|
@ -351,8 +477,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
|
|
||||||
def register_treebuilders_from(module):
|
def register_treebuilders_from(module):
|
||||||
"""Copy TreeBuilders from the given module into this module."""
|
"""Copy TreeBuilders from the given module into this module."""
|
||||||
# I'm fairly sure this is not the best way to do this.
|
this_module = sys.modules[__name__]
|
||||||
this_module = sys.modules['bs4.builder']
|
|
||||||
for name in module.__all__:
|
for name in module.__all__:
|
||||||
obj = getattr(module, name)
|
obj = getattr(module, name)
|
||||||
|
|
||||||
|
@ -363,6 +488,9 @@ def register_treebuilders_from(module):
|
||||||
this_module.builder_registry.register(obj)
|
this_module.builder_registry.register(obj)
|
||||||
|
|
||||||
class ParserRejectedMarkup(Exception):
|
class ParserRejectedMarkup(Exception):
|
||||||
|
"""An Exception to be raised when the underlying parser simply
|
||||||
|
refuses to parse the given markup.
|
||||||
|
"""
|
||||||
def __init__(self, message_or_exception):
|
def __init__(self, message_or_exception):
|
||||||
"""Explain why the parser rejected the given markup, either
|
"""Explain why the parser rejected the given markup, either
|
||||||
with a textual explanation or another exception.
|
with a textual explanation or another exception.
|
||||||
|
@ -375,7 +503,7 @@ class ParserRejectedMarkup(Exception):
|
||||||
# Builders are registered in reverse order of priority, so that custom
|
# Builders are registered in reverse order of priority, so that custom
|
||||||
# builder registrations will take precedence. In general, we want lxml
|
# builder registrations will take precedence. In general, we want lxml
|
||||||
# to take precedence over html5lib, because it's faster. And we only
|
# to take precedence over html5lib, because it's faster. And we only
|
||||||
# want to use HTMLParser as a last result.
|
# want to use HTMLParser as a last resort.
|
||||||
from . import _htmlparser
|
from . import _htmlparser
|
||||||
register_treebuilders_from(_htmlparser)
|
register_treebuilders_from(_htmlparser)
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -39,7 +39,18 @@ except ImportError as e:
|
||||||
new_html5lib = True
|
new_html5lib = True
|
||||||
|
|
||||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
"""Use html5lib to build a tree."""
|
"""Use html5lib to build a tree.
|
||||||
|
|
||||||
|
Note that this TreeBuilder does not support some features common
|
||||||
|
to HTML TreeBuilders. Some of these features could theoretically
|
||||||
|
be implemented, but at the very least it's quite difficult,
|
||||||
|
because html5lib moves the parse tree around as it's being built.
|
||||||
|
|
||||||
|
* This TreeBuilder doesn't use different subclasses of NavigableString
|
||||||
|
based on the name of the tag in which the string was found.
|
||||||
|
|
||||||
|
* You can't use a SoupStrainer to parse only part of a document.
|
||||||
|
"""
|
||||||
|
|
||||||
NAME = "html5lib"
|
NAME = "html5lib"
|
||||||
|
|
||||||
|
@ -116,6 +127,9 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
"", "html.parser", store_line_numbers=store_line_numbers,
|
"", "html.parser", store_line_numbers=store_line_numbers,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
# TODO: What are **kwargs exactly? Should they be passed in
|
||||||
|
# here in addition to/instead of being passed to the BeautifulSoup
|
||||||
|
# constructor?
|
||||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||||
|
|
||||||
# This will be set later to an html5lib.html5parser.HTMLParser
|
# This will be set later to an html5lib.html5parser.HTMLParser
|
||||||
|
@ -316,9 +330,7 @@ class Element(treebuilder_base.Node):
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
|
||||||
if attributes is not None and len(attributes) > 0:
|
if attributes is not None and len(attributes) > 0:
|
||||||
|
|
||||||
converted_attributes = []
|
converted_attributes = []
|
||||||
for name, value in list(attributes.items()):
|
for name, value in list(attributes.items()):
|
||||||
if isinstance(name, tuple):
|
if isinstance(name, tuple):
|
||||||
|
@ -363,9 +375,9 @@ class Element(treebuilder_base.Node):
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
def reparentChildren(self, new_parent):
|
||||||
"""Move all of this tag's children into another tag."""
|
"""Move all of this tag's children into another tag."""
|
||||||
# print "MOVE", self.element.contents
|
# print("MOVE", self.element.contents)
|
||||||
# print "FROM", self.element
|
# print("FROM", self.element)
|
||||||
# print "TO", new_parent.element
|
# print("TO", new_parent.element)
|
||||||
|
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
|
@ -423,9 +435,9 @@ class Element(treebuilder_base.Node):
|
||||||
element.contents = []
|
element.contents = []
|
||||||
element.next_element = final_next_element
|
element.next_element = final_next_element
|
||||||
|
|
||||||
# print "DONE WITH MOVE"
|
# print("DONE WITH MOVE")
|
||||||
# print "FROM", self.element
|
# print("FROM", self.element)
|
||||||
# print "TO", new_parent_element
|
# print("TO", new_parent_element)
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||||
|
|
|
@ -8,7 +8,7 @@ __all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from future.moves.html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from html.parser import HTMLParseError
|
from html.parser import HTMLParseError
|
||||||
|
@ -53,8 +53,30 @@ from bs4.builder import (
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
|
"""A subclass of the Python standard library's HTMLParser class, which
|
||||||
|
listens for HTMLParser events and translates them into calls
|
||||||
|
to Beautiful Soup's tree construction API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Strategies for handling duplicate attributes
|
||||||
|
IGNORE = 'ignore'
|
||||||
|
REPLACE = 'replace'
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
:param on_duplicate_attribute: A strategy for what to do if a
|
||||||
|
tag includes the same attribute more than once. Accepted
|
||||||
|
values are: REPLACE (replace earlier values with later
|
||||||
|
ones, the default), IGNORE (keep the earliest value
|
||||||
|
encountered), or a callable. A callable must take three
|
||||||
|
arguments: the dictionary of attributes already processed,
|
||||||
|
the name of the duplicate attribute, and the most recent value
|
||||||
|
encountered.
|
||||||
|
"""
|
||||||
|
self.on_duplicate_attribute = kwargs.pop(
|
||||||
|
'on_duplicate_attribute', self.REPLACE
|
||||||
|
)
|
||||||
HTMLParser.__init__(self, *args, **kwargs)
|
HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
# Keep a list of empty-element tags that were encountered
|
# Keep a list of empty-element tags that were encountered
|
||||||
|
@ -67,20 +89,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.already_closed_empty_element = []
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
def error(self, msg):
|
def error(self, msg):
|
||||||
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
"""In Python 3, HTMLParser subclasses must implement error(), although
|
||||||
requirement doesn't appear to be documented.
|
this requirement doesn't appear to be documented.
|
||||||
|
|
||||||
In Python 2, HTMLParser implements error() as raising an exception.
|
In Python 2, HTMLParser implements error() by raising an exception,
|
||||||
|
which we don't want to do.
|
||||||
|
|
||||||
In any event, this method is called only on very strange markup and our best strategy
|
In any event, this method is called only on very strange
|
||||||
is to pretend it didn't happen and keep going.
|
markup and our best strategy is to pretend it didn't happen
|
||||||
|
and keep going.
|
||||||
"""
|
"""
|
||||||
warnings.warn(msg)
|
warnings.warn(msg)
|
||||||
|
|
||||||
def handle_startendtag(self, name, attrs):
|
def handle_startendtag(self, name, attrs):
|
||||||
# This is only called when the markup looks like
|
"""Handle an incoming empty-element tag.
|
||||||
# <tag/>.
|
|
||||||
|
|
||||||
|
This is only called when the markup looks like <tag/>.
|
||||||
|
|
||||||
|
:param name: Name of the tag.
|
||||||
|
:param attrs: Dictionary of the tag's attributes.
|
||||||
|
"""
|
||||||
# is_startend() tells handle_starttag not to close the tag
|
# is_startend() tells handle_starttag not to close the tag
|
||||||
# just because its name matches a known empty-element tag. We
|
# just because its name matches a known empty-element tag. We
|
||||||
# know that this is an empty-element tag and we want to call
|
# know that this is an empty-element tag and we want to call
|
||||||
|
@ -89,6 +117,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.handle_endtag(name)
|
self.handle_endtag(name)
|
||||||
|
|
||||||
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||||
|
"""Handle an opening tag, e.g. '<tag>'
|
||||||
|
|
||||||
|
:param name: Name of the tag.
|
||||||
|
:param attrs: Dictionary of the tag's attributes.
|
||||||
|
:param handle_empty_element: True if this tag is known to be
|
||||||
|
an empty-element tag (i.e. there is not expected to be any
|
||||||
|
closing tag).
|
||||||
|
"""
|
||||||
# XXX namespace
|
# XXX namespace
|
||||||
attr_dict = {}
|
attr_dict = {}
|
||||||
for key, value in attrs:
|
for key, value in attrs:
|
||||||
|
@ -96,9 +132,21 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
# for consistency with the other tree builders.
|
# for consistency with the other tree builders.
|
||||||
if value is None:
|
if value is None:
|
||||||
value = ''
|
value = ''
|
||||||
|
if key in attr_dict:
|
||||||
|
# A single attribute shows up multiple times in this
|
||||||
|
# tag. How to handle it depends on the
|
||||||
|
# on_duplicate_attribute setting.
|
||||||
|
on_dupe = self.on_duplicate_attribute
|
||||||
|
if on_dupe == self.IGNORE:
|
||||||
|
pass
|
||||||
|
elif on_dupe in (None, self.REPLACE):
|
||||||
|
attr_dict[key] = value
|
||||||
|
else:
|
||||||
|
on_dupe(attr_dict, key, value)
|
||||||
|
else:
|
||||||
attr_dict[key] = value
|
attr_dict[key] = value
|
||||||
attrvalue = '""'
|
attrvalue = '""'
|
||||||
#print "START", name
|
#print("START", name)
|
||||||
sourceline, sourcepos = self.getpos()
|
sourceline, sourcepos = self.getpos()
|
||||||
tag = self.soup.handle_starttag(
|
tag = self.soup.handle_starttag(
|
||||||
name, None, None, attr_dict, sourceline=sourceline,
|
name, None, None, attr_dict, sourceline=sourceline,
|
||||||
|
@ -121,20 +169,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.already_closed_empty_element.append(name)
|
self.already_closed_empty_element.append(name)
|
||||||
|
|
||||||
def handle_endtag(self, name, check_already_closed=True):
|
def handle_endtag(self, name, check_already_closed=True):
|
||||||
#print "END", name
|
"""Handle a closing tag, e.g. '</tag>'
|
||||||
|
|
||||||
|
:param name: A tag name.
|
||||||
|
:param check_already_closed: True if this tag is expected to
|
||||||
|
be the closing portion of an empty-element tag,
|
||||||
|
e.g. '<tag></tag>'.
|
||||||
|
"""
|
||||||
|
#print("END", name)
|
||||||
if check_already_closed and name in self.already_closed_empty_element:
|
if check_already_closed and name in self.already_closed_empty_element:
|
||||||
# This is a redundant end tag for an empty-element tag.
|
# This is a redundant end tag for an empty-element tag.
|
||||||
# We've already called handle_endtag() for it, so just
|
# We've already called handle_endtag() for it, so just
|
||||||
# check it off the list.
|
# check it off the list.
|
||||||
# print "ALREADY CLOSED", name
|
#print("ALREADY CLOSED", name)
|
||||||
self.already_closed_empty_element.remove(name)
|
self.already_closed_empty_element.remove(name)
|
||||||
else:
|
else:
|
||||||
self.soup.handle_endtag(name)
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
"""Handle some textual data that shows up between tags."""
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
|
"""Handle a numeric character reference by converting it to the
|
||||||
|
corresponding Unicode character and treating it as textual
|
||||||
|
data.
|
||||||
|
|
||||||
|
:param name: Character number, possibly in hexadecimal.
|
||||||
|
"""
|
||||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||||
# it's fixed in all supported versions.
|
# it's fixed in all supported versions.
|
||||||
# http://bugs.python.org/issue13633
|
# http://bugs.python.org/issue13633
|
||||||
|
@ -168,6 +230,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
"""Handle a named entity reference by converting it to the
|
||||||
|
corresponding Unicode character(s) and treating it as textual
|
||||||
|
data.
|
||||||
|
|
||||||
|
:param name: Name of the entity reference.
|
||||||
|
"""
|
||||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||||
if character is not None:
|
if character is not None:
|
||||||
data = character
|
data = character
|
||||||
|
@ -181,21 +249,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
|
"""Handle an HTML comment.
|
||||||
|
|
||||||
|
:param data: The text of the comment.
|
||||||
|
"""
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(Comment)
|
self.soup.endData(Comment)
|
||||||
|
|
||||||
def handle_decl(self, data):
|
def handle_decl(self, data):
|
||||||
|
"""Handle a DOCTYPE declaration.
|
||||||
|
|
||||||
|
:param data: The text of the declaration.
|
||||||
|
"""
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
if data.startswith("DOCTYPE "):
|
|
||||||
data = data[len("DOCTYPE "):]
|
data = data[len("DOCTYPE "):]
|
||||||
elif data == 'DOCTYPE':
|
|
||||||
# i.e. "<!DOCTYPE>"
|
|
||||||
data = ''
|
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(Doctype)
|
self.soup.endData(Doctype)
|
||||||
|
|
||||||
def unknown_decl(self, data):
|
def unknown_decl(self, data):
|
||||||
|
"""Handle a declaration of unknown type -- probably a CDATA block.
|
||||||
|
|
||||||
|
:param data: The text of the declaration.
|
||||||
|
"""
|
||||||
if data.upper().startswith('CDATA['):
|
if data.upper().startswith('CDATA['):
|
||||||
cls = CData
|
cls = CData
|
||||||
data = data[len('CDATA['):]
|
data = data[len('CDATA['):]
|
||||||
|
@ -206,13 +282,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.soup.endData(cls)
|
self.soup.endData(cls)
|
||||||
|
|
||||||
def handle_pi(self, data):
|
def handle_pi(self, data):
|
||||||
|
"""Handle a processing instruction.
|
||||||
|
|
||||||
|
:param data: The text of the instruction.
|
||||||
|
"""
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(ProcessingInstruction)
|
self.soup.endData(ProcessingInstruction)
|
||||||
|
|
||||||
|
|
||||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
"""A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
|
||||||
|
found in the Python standard library.
|
||||||
|
"""
|
||||||
is_xml = False
|
is_xml = False
|
||||||
picklable = True
|
picklable = True
|
||||||
NAME = HTMLPARSER
|
NAME = HTMLPARSER
|
||||||
|
@ -223,9 +305,27 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
TRACKS_LINE_NUMBERS = True
|
TRACKS_LINE_NUMBERS = True
|
||||||
|
|
||||||
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
:param parser_args: Positional arguments to pass into
|
||||||
|
the BeautifulSoupHTMLParser constructor, once it's
|
||||||
|
invoked.
|
||||||
|
:param parser_kwargs: Keyword arguments to pass into
|
||||||
|
the BeautifulSoupHTMLParser constructor, once it's
|
||||||
|
invoked.
|
||||||
|
:param kwargs: Keyword arguments for the superclass constructor.
|
||||||
|
"""
|
||||||
|
# Some keyword arguments will be pulled out of kwargs and placed
|
||||||
|
# into parser_kwargs.
|
||||||
|
extra_parser_kwargs = dict()
|
||||||
|
for arg in ('on_duplicate_attribute',):
|
||||||
|
if arg in kwargs:
|
||||||
|
value = kwargs.pop(arg)
|
||||||
|
extra_parser_kwargs[arg] = value
|
||||||
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||||
parser_args = parser_args or []
|
parser_args = parser_args or []
|
||||||
parser_kwargs = parser_kwargs or {}
|
parser_kwargs = parser_kwargs or {}
|
||||||
|
parser_kwargs.update(extra_parser_kwargs)
|
||||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||||
parser_kwargs['strict'] = False
|
parser_kwargs['strict'] = False
|
||||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
|
@ -234,23 +334,57 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None, exclude_encodings=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
"""
|
|
||||||
:return: A 4-tuple (markup, original encoding, encoding
|
"""Run any preliminary steps necessary to make incoming markup
|
||||||
declared within markup, whether any characters had to be
|
acceptable to the parser.
|
||||||
replaced with REPLACEMENT CHARACTER).
|
|
||||||
|
:param markup: Some markup -- probably a bytestring.
|
||||||
|
:param user_specified_encoding: The user asked to try this encoding.
|
||||||
|
:param document_declared_encoding: The markup itself claims to be
|
||||||
|
in this encoding.
|
||||||
|
:param exclude_encodings: The user asked _not_ to try any of
|
||||||
|
these encodings.
|
||||||
|
|
||||||
|
:yield: A series of 4-tuples:
|
||||||
|
(markup, encoding, declared encoding,
|
||||||
|
has undergone character replacement)
|
||||||
|
|
||||||
|
Each 4-tuple represents a strategy for converting the
|
||||||
|
document to Unicode and parsing it. Each strategy will be tried
|
||||||
|
in turn.
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, str):
|
if isinstance(markup, str):
|
||||||
|
# Parse Unicode as-is.
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Ask UnicodeDammit to sniff the most likely encoding.
|
||||||
|
|
||||||
|
# This was provided by the end-user; treat it as a known
|
||||||
|
# definite encoding per the algorithm laid out in the HTML5
|
||||||
|
# spec. (See the EncodingDetector class for details.)
|
||||||
|
known_definite_encodings = [user_specified_encoding]
|
||||||
|
|
||||||
|
# This was found in the document; treat it as a slightly lower-priority
|
||||||
|
# user encoding.
|
||||||
|
user_encodings = [document_declared_encoding]
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
dammit = UnicodeDammit(
|
||||||
exclude_encodings=exclude_encodings)
|
markup,
|
||||||
|
known_definite_encodings=known_definite_encodings,
|
||||||
|
user_encodings=user_encodings,
|
||||||
|
is_html=True,
|
||||||
|
exclude_encodings=exclude_encodings
|
||||||
|
)
|
||||||
yield (dammit.markup, dammit.original_encoding,
|
yield (dammit.markup, dammit.original_encoding,
|
||||||
dammit.declared_html_encoding,
|
dammit.declared_html_encoding,
|
||||||
dammit.contains_replacement_characters)
|
dammit.contains_replacement_characters)
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
|
"""Run some incoming markup through some parsing process,
|
||||||
|
populating the `BeautifulSoup` object in self.soup.
|
||||||
|
"""
|
||||||
args, kwargs = self.parser_args
|
args, kwargs = self.parser_args
|
||||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
|
|
|
@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# But instead we build an XMLParser or HTMLParser object to serve
|
# But instead we build an XMLParser or HTMLParser object to serve
|
||||||
# as the target of parse messages, and those messages don't include
|
# as the target of parse messages, and those messages don't include
|
||||||
# line numbers.
|
# line numbers.
|
||||||
|
# See: https://bugs.launchpad.net/lxml/+bug/1846906
|
||||||
|
|
||||||
def initialize_soup(self, soup):
|
def initialize_soup(self, soup):
|
||||||
"""Let the BeautifulSoup object know about the standard namespace
|
"""Let the BeautifulSoup object know about the standard namespace
|
||||||
mapping.
|
mapping.
|
||||||
|
|
||||||
|
:param soup: A `BeautifulSoup`.
|
||||||
"""
|
"""
|
||||||
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||||
self._register_namespaces(self.DEFAULT_NSMAPS)
|
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||||
|
@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
while parsing the document.
|
while parsing the document.
|
||||||
|
|
||||||
This might be useful later on when creating CSS selectors.
|
This might be useful later on when creating CSS selectors.
|
||||||
|
|
||||||
|
:param mapping: A dictionary mapping namespace prefixes to URIs.
|
||||||
"""
|
"""
|
||||||
for key, value in list(mapping.items()):
|
for key, value in list(mapping.items()):
|
||||||
if key and key not in self.soup._namespaces:
|
if key and key not in self.soup._namespaces:
|
||||||
|
@ -84,20 +89,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
self.soup._namespaces[key] = value
|
self.soup._namespaces[key] = value
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
# This can either return a parser object or a class, which
|
"""Find the default parser for the given encoding.
|
||||||
# will be instantiated with default arguments.
|
|
||||||
|
:param encoding: A string.
|
||||||
|
:return: Either a parser object or a class, which
|
||||||
|
will be instantiated with default arguments.
|
||||||
|
"""
|
||||||
if self._default_parser is not None:
|
if self._default_parser is not None:
|
||||||
return self._default_parser
|
return self._default_parser
|
||||||
return etree.XMLParser(
|
return etree.XMLParser(
|
||||||
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
||||||
|
|
||||||
def parser_for(self, encoding):
|
def parser_for(self, encoding):
|
||||||
|
"""Instantiate an appropriate parser for the given encoding.
|
||||||
|
|
||||||
|
:param encoding: A string.
|
||||||
|
:return: A parser object such as an `etree.XMLParser`.
|
||||||
|
"""
|
||||||
# Use the default parser.
|
# Use the default parser.
|
||||||
parser = self.default_parser(encoding)
|
parser = self.default_parser(encoding)
|
||||||
|
|
||||||
if isinstance(parser, Callable):
|
if isinstance(parser, Callable):
|
||||||
# Instantiate the parser with default arguments
|
# Instantiate the parser with default arguments
|
||||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
parser = parser(
|
||||||
|
target=self, strip_cdata=False, recover=True, encoding=encoding
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
||||||
|
@ -122,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
exclude_encodings=None,
|
exclude_encodings=None,
|
||||||
document_declared_encoding=None):
|
document_declared_encoding=None):
|
||||||
"""
|
"""Run any preliminary steps necessary to make incoming markup
|
||||||
:yield: A series of 4-tuples.
|
acceptable to the parser.
|
||||||
|
|
||||||
|
lxml really wants to get a bytestring and convert it to
|
||||||
|
Unicode itself. So instead of using UnicodeDammit to convert
|
||||||
|
the bytestring to Unicode using different encodings, this
|
||||||
|
implementation uses EncodingDetector to iterate over the
|
||||||
|
encodings, and tell lxml to try to parse the document as each
|
||||||
|
one in turn.
|
||||||
|
|
||||||
|
:param markup: Some markup -- hopefully a bytestring.
|
||||||
|
:param user_specified_encoding: The user asked to try this encoding.
|
||||||
|
:param document_declared_encoding: The markup itself claims to be
|
||||||
|
in this encoding.
|
||||||
|
:param exclude_encodings: The user asked _not_ to try any of
|
||||||
|
these encodings.
|
||||||
|
|
||||||
|
:yield: A series of 4-tuples:
|
||||||
(markup, encoding, declared encoding,
|
(markup, encoding, declared encoding,
|
||||||
has undergone character replacement)
|
has undergone character replacement)
|
||||||
|
|
||||||
Each 4-tuple represents a strategy for parsing the document.
|
Each 4-tuple represents a strategy for converting the
|
||||||
|
document to Unicode and parsing it. Each strategy will be tried
|
||||||
|
in turn.
|
||||||
"""
|
"""
|
||||||
# Instead of using UnicodeDammit to convert the bytestring to
|
|
||||||
# Unicode using different encodings, use EncodingDetector to
|
|
||||||
# iterate over the encodings, and tell lxml to try to parse
|
|
||||||
# the document as each one in turn.
|
|
||||||
is_html = not self.is_xml
|
is_html = not self.is_xml
|
||||||
if is_html:
|
if is_html:
|
||||||
self.processing_instruction_class = ProcessingInstruction
|
self.processing_instruction_class = ProcessingInstruction
|
||||||
|
@ -150,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
yield (markup.encode("utf8"), "utf8",
|
yield (markup.encode("utf8"), "utf8",
|
||||||
document_declared_encoding, False)
|
document_declared_encoding, False)
|
||||||
|
|
||||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
# This was provided by the end-user; treat it as a known
|
||||||
|
# definite encoding per the algorithm laid out in the HTML5
|
||||||
|
# spec. (See the EncodingDetector class for details.)
|
||||||
|
known_definite_encodings = [user_specified_encoding]
|
||||||
|
|
||||||
|
# This was found in the document; treat it as a slightly lower-priority
|
||||||
|
# user encoding.
|
||||||
|
user_encodings = [document_declared_encoding]
|
||||||
detector = EncodingDetector(
|
detector = EncodingDetector(
|
||||||
markup, try_encodings, is_html, exclude_encodings)
|
markup, known_definite_encodings=known_definite_encodings,
|
||||||
|
user_encodings=user_encodings, is_html=is_html,
|
||||||
|
exclude_encodings=exclude_encodings
|
||||||
|
)
|
||||||
for encoding in detector.encodings:
|
for encoding in detector.encodings:
|
||||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
import requests
|
|
||||||
data = requests.get("https://www.crummy.com/").content
|
|
||||||
from bs4 import _s
|
|
||||||
data = [x for x in _s(data).block_text()]
|
|
2566
lib/bs4/dammit.py
2566
lib/bs4/dammit.py
File diff suppressed because it is too large
Load diff
|
@ -20,9 +20,13 @@ import sys
|
||||||
import cProfile
|
import cProfile
|
||||||
|
|
||||||
def diagnose(data):
|
def diagnose(data):
|
||||||
"""Diagnostic suite for isolating common problems."""
|
"""Diagnostic suite for isolating common problems.
|
||||||
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
|
||||||
print("Python version %s" % sys.version)
|
:param data: A string containing markup that needs to be explained.
|
||||||
|
:return: None; diagnostics are printed to standard output.
|
||||||
|
"""
|
||||||
|
print(("Diagnostic running on Beautiful Soup %s" % __version__))
|
||||||
|
print(("Python version %s" % sys.version))
|
||||||
|
|
||||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||||
for name in basic_parsers:
|
for name in basic_parsers:
|
||||||
|
@ -39,65 +43,76 @@ def diagnose(data):
|
||||||
basic_parsers.append("lxml-xml")
|
basic_parsers.append("lxml-xml")
|
||||||
try:
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
print (
|
print(
|
||||||
"lxml is not installed or couldn't be imported.")
|
"lxml is not installed or couldn't be imported.")
|
||||||
|
|
||||||
|
|
||||||
if 'html5lib' in basic_parsers:
|
if 'html5lib' in basic_parsers:
|
||||||
try:
|
try:
|
||||||
import html5lib
|
import html5lib
|
||||||
print("Found html5lib version %s" % html5lib.__version__)
|
print(("Found html5lib version %s" % html5lib.__version__))
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
print (
|
print(
|
||||||
"html5lib is not installed or couldn't be imported.")
|
"html5lib is not installed or couldn't be imported.")
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
|
||||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
if os.path.exists(data):
|
if os.path.exists(data):
|
||||||
print('"%s" looks like a filename. Reading data from the file.' % data)
|
print(('"%s" looks like a filename. Reading data from the file.' % data))
|
||||||
with open(data) as fp:
|
with open(data) as fp:
|
||||||
data = fp.read()
|
data = fp.read()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# This can happen on some platforms when the 'filename' is
|
# This can happen on some platforms when the 'filename' is
|
||||||
# too long. Assume it's data and not a filename.
|
# too long. Assume it's data and not a filename.
|
||||||
pass
|
pass
|
||||||
print()
|
print("")
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print("Trying to parse your markup with %s" % parser)
|
print(("Trying to parse your markup with %s" % parser))
|
||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(data, features=parser)
|
soup = BeautifulSoup(data, features=parser)
|
||||||
success = True
|
success = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("%s could not parse the markup." % parser)
|
print(("%s could not parse the markup." % parser))
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print("Here's what %s did with the markup:" % parser)
|
print(("Here's what %s did with the markup:" % parser))
|
||||||
print(soup.prettify())
|
print((soup.prettify()))
|
||||||
|
|
||||||
print("-" * 80)
|
print(("-" * 80))
|
||||||
|
|
||||||
def lxml_trace(data, html=True, **kwargs):
|
def lxml_trace(data, html=True, **kwargs):
|
||||||
"""Print out the lxml events that occur during parsing.
|
"""Print out the lxml events that occur during parsing.
|
||||||
|
|
||||||
This lets you see how lxml parses a document when no Beautiful
|
This lets you see how lxml parses a document when no Beautiful
|
||||||
Soup code is running.
|
Soup code is running. You can use this to determine whether
|
||||||
|
an lxml-specific problem is in Beautiful Soup's lxml tree builders
|
||||||
|
or in lxml itself.
|
||||||
|
|
||||||
|
:param data: Some markup.
|
||||||
|
:param html: If True, markup will be parsed with lxml's HTML parser.
|
||||||
|
If False, lxml's XML parser will be used.
|
||||||
"""
|
"""
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||||
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||||
|
|
||||||
class AnnouncingParser(HTMLParser):
|
class AnnouncingParser(HTMLParser):
|
||||||
"""Announces HTMLParser parse events, without doing anything else."""
|
"""Subclass of HTMLParser that announces parse events, without doing
|
||||||
|
anything else.
|
||||||
|
|
||||||
|
You can use this to get a picture of how html.parser sees a given
|
||||||
|
document. The easiest way to do this is to call `htmlparser_trace`.
|
||||||
|
"""
|
||||||
|
|
||||||
def _p(self, s):
|
def _p(self, s):
|
||||||
print(s)
|
print(s)
|
||||||
|
@ -134,6 +149,8 @@ def htmlparser_trace(data):
|
||||||
|
|
||||||
This lets you see how HTMLParser parses a document when no
|
This lets you see how HTMLParser parses a document when no
|
||||||
Beautiful Soup code is running.
|
Beautiful Soup code is running.
|
||||||
|
|
||||||
|
:param data: Some markup.
|
||||||
"""
|
"""
|
||||||
parser = AnnouncingParser()
|
parser = AnnouncingParser()
|
||||||
parser.feed(data)
|
parser.feed(data)
|
||||||
|
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
|
||||||
|
|
||||||
def benchmark_parsers(num_elements=100000):
|
def benchmark_parsers(num_elements=100000):
|
||||||
"""Very basic head-to-head performance benchmark."""
|
"""Very basic head-to-head performance benchmark."""
|
||||||
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
|
||||||
data = rdoc(num_elements)
|
data = rdoc(num_elements)
|
||||||
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
|
||||||
|
|
||||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||||
success = False
|
success = False
|
||||||
|
@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
|
||||||
b = time.time()
|
b = time.time()
|
||||||
success = True
|
success = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("%s could not parse the markup." % parser)
|
print(("%s could not parse the markup." % parser))
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
a = time.time()
|
a = time.time()
|
||||||
etree.HTML(data)
|
etree.HTML(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
parser = html5lib.HTMLParser()
|
parser = html5lib.HTMLParser()
|
||||||
a = time.time()
|
a = time.time()
|
||||||
parser.parse(data)
|
parser.parse(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
|
||||||
|
|
||||||
def profile(num_elements=100000, parser="lxml"):
|
def profile(num_elements=100000, parser="lxml"):
|
||||||
|
"""Use Python's profiler on a randomly generated document."""
|
||||||
filehandle = tempfile.NamedTemporaryFile()
|
filehandle = tempfile.NamedTemporaryFile()
|
||||||
filename = filehandle.name
|
filename = filehandle.name
|
||||||
|
|
||||||
|
@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
|
||||||
stats.sort_stats("cumulative")
|
stats.sort_stats("cumulative")
|
||||||
stats.print_stats('_html5lib|bs4', 50)
|
stats.print_stats('_html5lib|bs4', 50)
|
||||||
|
|
||||||
|
# If this file is run as a script, standard input is diagnosed.
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
diagnose(sys.stdin.read())
|
diagnose(sys.stdin.read())
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
|
||||||
|
|
||||||
Some parts of this strategy come from the distinction between
|
Some parts of this strategy come from the distinction between
|
||||||
HTML4, HTML5, and XML. Others are configurable by the user.
|
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||||
|
|
||||||
|
Formatters are passed in as the `formatter` argument to methods
|
||||||
|
like `PageElement.encode`. Most people won't need to think about
|
||||||
|
formatters, and most people who need to think about them can pass
|
||||||
|
in one of these predefined strings as `formatter` rather than
|
||||||
|
making a new Formatter object:
|
||||||
|
|
||||||
|
For HTML documents:
|
||||||
|
* 'html' - HTML entity substitution for generic HTML documents. (default)
|
||||||
|
* 'html5' - HTML entity substitution for HTML5 documents, as
|
||||||
|
well as some optimizations in the way tags are rendered.
|
||||||
|
* 'minimal' - Only make the substitutions necessary to guarantee
|
||||||
|
valid HTML.
|
||||||
|
* None - Do not perform any substitution. This will be faster
|
||||||
|
but may result in invalid markup.
|
||||||
|
|
||||||
|
For XML documents:
|
||||||
|
* 'html' - Entity substitution for XHTML documents.
|
||||||
|
* 'minimal' - Only make the substitutions necessary to guarantee
|
||||||
|
valid XML. (default)
|
||||||
|
* None - Do not perform any substitution. This will be faster
|
||||||
|
but may result in invalid markup.
|
||||||
"""
|
"""
|
||||||
# Registries of XML and HTML formatters.
|
# Registries of XML and HTML formatters.
|
||||||
XML_FORMATTERS = {}
|
XML_FORMATTERS = {}
|
||||||
|
@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, language=None, entity_substitution=None,
|
self, language=None, entity_substitution=None,
|
||||||
void_element_close_prefix='/', cdata_containing_tags=None,
|
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||||
|
empty_attributes_are_booleans=False,
|
||||||
):
|
):
|
||||||
"""
|
"""Constructor.
|
||||||
|
|
||||||
:param void_element_close_prefix: By default, represent void
|
:param language: This should be Formatter.XML if you are formatting
|
||||||
elements as <tag/> rather than <tag>
|
XML markup and Formatter.HTML if you are formatting HTML markup.
|
||||||
|
|
||||||
|
:param entity_substitution: A function to call to replace special
|
||||||
|
characters with XML/HTML entities. For examples, see
|
||||||
|
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
|
||||||
|
:param void_element_close_prefix: By default, void elements
|
||||||
|
are represented as <tag/> (XML rules) rather than <tag>
|
||||||
|
(HTML rules). To get <tag>, pass in the empty string.
|
||||||
|
:param cdata_containing_tags: The list of tags that are defined
|
||||||
|
as containing CDATA in this dialect. For example, in HTML,
|
||||||
|
<script> and <style> tags are defined as containing CDATA,
|
||||||
|
and their contents should not be formatted.
|
||||||
|
:param empty_attributes_are_booleans: Render attributes whose value
|
||||||
|
is the empty string as HTML-style boolean attributes.
|
||||||
|
(Attributes whose value is None are always rendered this way.)
|
||||||
"""
|
"""
|
||||||
self.language = language
|
self.language = language
|
||||||
self.entity_substitution = entity_substitution
|
self.entity_substitution = entity_substitution
|
||||||
|
@ -39,9 +76,17 @@ class Formatter(EntitySubstitution):
|
||||||
self.cdata_containing_tags = self._default(
|
self.cdata_containing_tags = self._default(
|
||||||
language, cdata_containing_tags, 'cdata_containing_tags'
|
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||||
)
|
)
|
||||||
|
self.empty_attributes_are_booleans=empty_attributes_are_booleans
|
||||||
|
|
||||||
def substitute(self, ns):
|
def substitute(self, ns):
|
||||||
"""Process a string that needs to undergo entity substitution."""
|
"""Process a string that needs to undergo entity substitution.
|
||||||
|
This may be a string encountered in an attribute value or as
|
||||||
|
text.
|
||||||
|
|
||||||
|
:param ns: A string.
|
||||||
|
:return: A string with certain characters replaced by named
|
||||||
|
or numeric entities.
|
||||||
|
"""
|
||||||
if not self.entity_substitution:
|
if not self.entity_substitution:
|
||||||
return ns
|
return ns
|
||||||
from .element import NavigableString
|
from .element import NavigableString
|
||||||
|
@ -54,21 +99,41 @@ class Formatter(EntitySubstitution):
|
||||||
return self.entity_substitution(ns)
|
return self.entity_substitution(ns)
|
||||||
|
|
||||||
def attribute_value(self, value):
|
def attribute_value(self, value):
|
||||||
"""Process the value of an attribute."""
|
"""Process the value of an attribute.
|
||||||
|
|
||||||
|
:param value: A string.
|
||||||
|
:return: A string with certain characters replaced by named
|
||||||
|
or numeric entities.
|
||||||
|
"""
|
||||||
return self.substitute(value)
|
return self.substitute(value)
|
||||||
|
|
||||||
def attributes(self, tag):
|
def attributes(self, tag):
|
||||||
"""Reorder a tag's attributes however you want."""
|
"""Reorder a tag's attributes however you want.
|
||||||
return sorted(tag.attrs.items())
|
|
||||||
|
|
||||||
|
By default, attributes are sorted alphabetically. This makes
|
||||||
|
behavior consistent between Python 2 and Python 3, and preserves
|
||||||
|
backwards compatibility with older versions of Beautiful Soup.
|
||||||
|
|
||||||
|
If `empty_attributes_are_booleans` is True, then attributes whose
|
||||||
|
values are set to the empty string will be treated as boolean
|
||||||
|
attributes.
|
||||||
|
"""
|
||||||
|
if tag.attrs is None:
|
||||||
|
return []
|
||||||
|
return sorted(
|
||||||
|
(k, (None if self.empty_attributes_are_booleans and v == '' else v))
|
||||||
|
for k, v in list(tag.attrs.items())
|
||||||
|
)
|
||||||
|
|
||||||
class HTMLFormatter(Formatter):
|
class HTMLFormatter(Formatter):
|
||||||
|
"""A generic Formatter for HTML."""
|
||||||
REGISTRY = {}
|
REGISTRY = {}
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class XMLFormatter(Formatter):
|
class XMLFormatter(Formatter):
|
||||||
|
"""A generic Formatter for XML."""
|
||||||
REGISTRY = {}
|
REGISTRY = {}
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||||
|
@ -80,7 +145,8 @@ HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
||||||
)
|
)
|
||||||
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||||
entity_substitution=EntitySubstitution.substitute_html,
|
entity_substitution=EntitySubstitution.substitute_html,
|
||||||
void_element_close_prefix = None
|
void_element_close_prefix=None,
|
||||||
|
empty_attributes_are_booleans=True,
|
||||||
)
|
)
|
||||||
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||||
entity_substitution=EntitySubstitution.substitute_xml
|
entity_substitution=EntitySubstitution.substitute_xml
|
||||||
|
|
|
@ -8,6 +8,7 @@ import pickle
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
import unittest
|
import unittest
|
||||||
|
import warnings
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
|
@ -15,7 +16,10 @@ from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
PYTHON_SPECIFIC_ENCODINGS,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
Script,
|
||||||
|
Stylesheet,
|
||||||
Tag
|
Tag
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -83,8 +87,22 @@ class SoupTest(unittest.TestCase):
|
||||||
if compare_parsed_to is None:
|
if compare_parsed_to is None:
|
||||||
compare_parsed_to = to_parse
|
compare_parsed_to = to_parse
|
||||||
|
|
||||||
|
# Verify that the documents come out the same.
|
||||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||||
|
|
||||||
|
# Also run some checks on the BeautifulSoup object itself:
|
||||||
|
|
||||||
|
# Verify that every tag that was opened was eventually closed.
|
||||||
|
|
||||||
|
# There are no tags in the open tag counter.
|
||||||
|
assert all(v==0 for v in list(obj.open_tag_counter.values()))
|
||||||
|
|
||||||
|
# The only tag in the tag stack is the one for the root
|
||||||
|
# document.
|
||||||
|
self.assertEqual(
|
||||||
|
[obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
|
||||||
|
)
|
||||||
|
|
||||||
def assertConnectedness(self, element):
|
def assertConnectedness(self, element):
|
||||||
"""Ensure that next_element and previous_element are properly
|
"""Ensure that next_element and previous_element are properly
|
||||||
set for all descendants of the given element.
|
set for all descendants of the given element.
|
||||||
|
@ -211,7 +229,41 @@ class SoupTest(unittest.TestCase):
|
||||||
return child
|
return child
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(object):
|
class TreeBuilderSmokeTest(object):
|
||||||
|
# Tests that are common to HTML and XML tree builders.
|
||||||
|
|
||||||
|
def test_fuzzed_input(self):
|
||||||
|
# This test centralizes in one place the various fuzz tests
|
||||||
|
# for Beautiful Soup created by the oss-fuzz project.
|
||||||
|
|
||||||
|
# These strings superficially resemble markup, but they
|
||||||
|
# generally can't be parsed into anything. The best we can
|
||||||
|
# hope for is that parsing these strings won't crash the
|
||||||
|
# parser.
|
||||||
|
#
|
||||||
|
# n.b. This markup is commented out because these fuzz tests
|
||||||
|
# _do_ crash the parser. However the crashes are due to bugs
|
||||||
|
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
|
||||||
|
# bugs!
|
||||||
|
|
||||||
|
bad_markup = [
|
||||||
|
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
|
||||||
|
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
|
||||||
|
# https://bugs.python.org/issue37747
|
||||||
|
#
|
||||||
|
#b'\n<![\xff\xfe\xfe\xcd\x00',
|
||||||
|
|
||||||
|
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
|
||||||
|
# https://bugs.python.org/issue34480
|
||||||
|
#
|
||||||
|
#b'<![n\x00'
|
||||||
|
]
|
||||||
|
for markup in bad_markup:
|
||||||
|
with warnings.catch_warnings(record=False):
|
||||||
|
soup = self.soup(markup)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
|
||||||
|
|
||||||
"""A basic test of a treebuilder's competence.
|
"""A basic test of a treebuilder's competence.
|
||||||
|
|
||||||
|
@ -233,6 +285,22 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
new_tag = soup.new_tag(name)
|
new_tag = soup.new_tag(name)
|
||||||
self.assertEqual(True, new_tag.is_empty_element)
|
self.assertEqual(True, new_tag.is_empty_element)
|
||||||
|
|
||||||
|
def test_special_string_containers(self):
|
||||||
|
soup = self.soup(
|
||||||
|
"<style>Some CSS</style><script>Some Javascript</script>"
|
||||||
|
)
|
||||||
|
assert isinstance(soup.style.string, Stylesheet)
|
||||||
|
assert isinstance(soup.script.string, Script)
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
"<style><!--Some CSS--></style>"
|
||||||
|
)
|
||||||
|
assert isinstance(soup.style.string, Stylesheet)
|
||||||
|
# The contents of the style tag resemble an HTML comment, but
|
||||||
|
# it's not treated as a comment.
|
||||||
|
self.assertEqual("<!--Some CSS-->", soup.style.string)
|
||||||
|
assert isinstance(soup.style.string, Stylesheet)
|
||||||
|
|
||||||
def test_pickle_and_unpickle_identity(self):
|
def test_pickle_and_unpickle_identity(self):
|
||||||
# Pickling a tree, then unpickling it, yields a tree identical
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
# to the original.
|
# to the original.
|
||||||
|
@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
doctype = soup.contents[0]
|
doctype = soup.contents[0]
|
||||||
self.assertEqual(doctype.__class__, Doctype)
|
self.assertEqual(doctype.__class__, Doctype)
|
||||||
self.assertEqual(doctype, doctype_fragment)
|
self.assertEqual(doctype, doctype_fragment)
|
||||||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
self.assertEqual(
|
||||||
|
soup.encode("utf8")[:len(doctype_str)],
|
||||||
|
doctype_str
|
||||||
|
)
|
||||||
|
|
||||||
# Make sure that the doctype was correctly associated with the
|
# Make sure that the doctype was correctly associated with the
|
||||||
# parse tree and that the rest of the document parsed.
|
# parse tree and that the rest of the document parsed.
|
||||||
self.assertEqual(soup.p.contents[0], 'foo')
|
self.assertEqual(soup.p.contents[0], 'foo')
|
||||||
|
|
||||||
def _document_with_doctype(self, doctype_fragment):
|
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
|
||||||
"""Generate and parse a document with the given doctype."""
|
"""Generate and parse a document with the given doctype."""
|
||||||
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
|
||||||
markup = doctype + '\n<p>foo</p>'
|
markup = doctype + '\n<p>foo</p>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
return doctype, soup
|
return doctype.encode("utf8"), soup
|
||||||
|
|
||||||
def test_normal_doctypes(self):
|
def test_normal_doctypes(self):
|
||||||
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||||
|
@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
doctype = soup.contents[0]
|
doctype = soup.contents[0]
|
||||||
self.assertEqual("", doctype.strip())
|
self.assertEqual("", doctype.strip())
|
||||||
|
|
||||||
|
def test_mixed_case_doctype(self):
|
||||||
|
# A lowercase or mixed-case doctype becomes a Doctype.
|
||||||
|
for doctype_fragment in ("doctype", "DocType"):
|
||||||
|
doctype_str, soup = self._document_with_doctype(
|
||||||
|
"html", doctype_fragment
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make sure a Doctype object was created and that the DOCTYPE
|
||||||
|
# is uppercase.
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual(doctype.__class__, Doctype)
|
||||||
|
self.assertEqual(doctype, "html")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.encode("utf8")[:len(doctype_str)],
|
||||||
|
b"<!DOCTYPE html>"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make sure that the doctype was correctly associated with the
|
||||||
|
# parse tree and that the rest of the document parsed.
|
||||||
|
self.assertEqual(soup.p.contents[0], 'foo')
|
||||||
|
|
||||||
def test_public_doctype_with_url(self):
|
def test_public_doctype_with_url(self):
|
||||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||||
self.assertDoctypeHandled(doctype)
|
self.assertDoctypeHandled(doctype)
|
||||||
|
@ -779,11 +871,44 @@ Hello, world!
|
||||||
# encoding.
|
# encoding.
|
||||||
self.assertEqual('utf8', charset.encode("utf8"))
|
self.assertEqual('utf8', charset.encode("utf8"))
|
||||||
|
|
||||||
|
def test_python_specific_encodings_not_used_in_charset(self):
|
||||||
|
# You can encode an HTML document using a Python-specific
|
||||||
|
# encoding, but that encoding won't be mentioned _inside_ the
|
||||||
|
# resulting document. Instead, the document will appear to
|
||||||
|
# have no encoding.
|
||||||
|
for markup in [
|
||||||
|
b'<meta charset="utf8"></head>'
|
||||||
|
b'<meta id="encoding" charset="utf-8" />'
|
||||||
|
]:
|
||||||
|
soup = self.soup(markup)
|
||||||
|
for encoding in PYTHON_SPECIFIC_ENCODINGS:
|
||||||
|
if encoding in (
|
||||||
|
'idna', 'mbcs', 'oem', 'undefined',
|
||||||
|
'string_escape', 'string-escape'
|
||||||
|
):
|
||||||
|
# For one reason or another, these will raise an
|
||||||
|
# exception if we actually try to use them, so don't
|
||||||
|
# bother.
|
||||||
|
continue
|
||||||
|
encoded = soup.encode(encoding)
|
||||||
|
assert b'meta charset=""' in encoded
|
||||||
|
assert encoding.encode("ascii") not in encoded
|
||||||
|
|
||||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||||
data = self.soup("<a>text</a>")
|
data = self.soup("<a>text</a>")
|
||||||
data.a['foo'] = 'bar'
|
data.a['foo'] = 'bar'
|
||||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||||
|
|
||||||
|
def test_closing_tag_with_no_opening_tag(self):
|
||||||
|
# Without BeautifulSoup.open_tag_counter, the </span> tag will
|
||||||
|
# cause _popToTag to be called over and over again as we look
|
||||||
|
# for a <span> tag that wasn't there. The result is that 'text2'
|
||||||
|
# will show up outside the body of the document.
|
||||||
|
soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
|
||||||
|
self.assertEqual(
|
||||||
|
"<body><div><p>text1</p>text2</div></body>", soup.body.decode()
|
||||||
|
)
|
||||||
|
|
||||||
def test_worst_case(self):
|
def test_worst_case(self):
|
||||||
"""Test the worst case (currently) for linking issues."""
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
@ -791,7 +916,7 @@ Hello, world!
|
||||||
self.linkage_validator(soup)
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class XMLTreeBuilderSmokeTest(object):
|
class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
|
||||||
|
|
||||||
def test_pickle_and_unpickle_identity(self):
|
def test_pickle_and_unpickle_identity(self):
|
||||||
# Pickling a tree, then unpickling it, yields a tree identical
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
|
@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(markup, soup.encode("utf8"))
|
self.assertEqual(markup, soup.encode("utf8"))
|
||||||
|
|
||||||
|
def test_python_specific_encodings_not_used_in_xml_declaration(self):
|
||||||
|
# You can encode an XML document using a Python-specific
|
||||||
|
# encoding, but that encoding won't be mentioned _inside_ the
|
||||||
|
# resulting document.
|
||||||
|
markup = b"""<?xml version="1.0"?>\n<foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
for encoding in PYTHON_SPECIFIC_ENCODINGS:
|
||||||
|
if encoding in (
|
||||||
|
'idna', 'mbcs', 'oem', 'undefined',
|
||||||
|
'string_escape', 'string-escape'
|
||||||
|
):
|
||||||
|
# For one reason or another, these will raise an
|
||||||
|
# exception if we actually try to use them, so don't
|
||||||
|
# bother.
|
||||||
|
continue
|
||||||
|
encoded = soup.encode(encoding)
|
||||||
|
assert b'<?xml version="1.0"?>' in encoded
|
||||||
|
assert encoding.encode("ascii") not in encoded
|
||||||
|
|
||||||
def test_processing_instruction(self):
|
def test_processing_instruction(self):
|
||||||
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
|
|
|
@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
soup = self.soup(markup, store_line_numbers=False)
|
soup = self.soup(markup, store_line_numbers=False)
|
||||||
self.assertEqual("sourceline", soup.p.sourceline.name)
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
def test_special_string_containers(self):
|
||||||
|
# The html5lib tree builder doesn't support this standard feature,
|
||||||
|
# because there's no way of knowing, when a string is created,
|
||||||
|
# where in the tree it will eventually end up.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def test_html5_attributes(self):
|
||||||
|
# The html5lib TreeBuilder can convert any entity named in
|
||||||
|
# the HTML5 spec to a sequence of Unicode characters, and
|
||||||
|
# convert those Unicode characters to a (potentially
|
||||||
|
# different) named entity on the way out.
|
||||||
|
#
|
||||||
|
# This is a copy of the same test from
|
||||||
|
# HTMLParserTreeBuilderSmokeTest. It's not in the superclass
|
||||||
|
# because the lxml HTML TreeBuilder _doesn't_ work this way.
|
||||||
|
for input_element, output_unicode, output_element in (
|
||||||
|
("⇄", '\u21c4', b'⇄'),
|
||||||
|
('⊧', '\u22a7', b'⊧'),
|
||||||
|
('𝔑', '\U0001d511', b'𝔑'),
|
||||||
|
('≧̸', '\u2267\u0338', b'≧̸'),
|
||||||
|
('¬', '\xac', b'¬'),
|
||||||
|
('⫬', '\u2aec', b'⫬'),
|
||||||
|
('"', '"', b'"'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
("fj", 'fj', b'fj'),
|
||||||
|
("⊔", '\u2294', b'⊔'),
|
||||||
|
("⊔︀", '\u2294\ufe00', b'⊔︀'),
|
||||||
|
("'", "'", b"'"),
|
||||||
|
("|", "|", b"|"),
|
||||||
|
):
|
||||||
|
markup = '<div>%s</div>' % input_element
|
||||||
|
div = self.soup(markup).div
|
||||||
|
without_element = div.encode()
|
||||||
|
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
|
||||||
|
self.assertEqual(without_element, expect)
|
||||||
|
|
||||||
|
with_element = div.encode(formatter="html")
|
||||||
|
expect = b"<div>%s</div>" % output_element
|
||||||
|
self.assertEqual(with_element, expect)
|
||||||
|
|
|
@ -3,6 +3,7 @@ trees."""
|
||||||
|
|
||||||
from pdb import set_trace
|
from pdb import set_trace
|
||||||
import pickle
|
import pickle
|
||||||
|
import warnings
|
||||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||||
|
@ -51,6 +52,74 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
self.assertEqual("sourceline", soup.p.sourceline.name)
|
self.assertEqual("sourceline", soup.p.sourceline.name)
|
||||||
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
self.assertEqual("sourcepos", soup.p.sourcepos.name)
|
||||||
|
|
||||||
|
def test_on_duplicate_attribute(self):
|
||||||
|
# The html.parser tree builder has a variety of ways of
|
||||||
|
# handling a tag that contains the same attribute multiple times.
|
||||||
|
|
||||||
|
markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
|
||||||
|
|
||||||
|
# If you don't provide any particular value for
|
||||||
|
# on_duplicate_attribute, later values replace earlier values.
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("url3", soup.a['href'])
|
||||||
|
self.assertEqual(["cls"], soup.a['class'])
|
||||||
|
self.assertEqual("id", soup.a['id'])
|
||||||
|
|
||||||
|
# You can also get this behavior explicitly.
|
||||||
|
def assert_attribute(on_duplicate_attribute, expected):
|
||||||
|
soup = self.soup(
|
||||||
|
markup, on_duplicate_attribute=on_duplicate_attribute
|
||||||
|
)
|
||||||
|
self.assertEqual(expected, soup.a['href'])
|
||||||
|
|
||||||
|
# Verify that non-duplicate attributes are treated normally.
|
||||||
|
self.assertEqual(["cls"], soup.a['class'])
|
||||||
|
self.assertEqual("id", soup.a['id'])
|
||||||
|
assert_attribute(None, "url3")
|
||||||
|
assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
|
||||||
|
|
||||||
|
# You can ignore subsequent values in favor of the first.
|
||||||
|
assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
|
||||||
|
|
||||||
|
# And you can pass in a callable that does whatever you want.
|
||||||
|
def accumulate(attrs, key, value):
|
||||||
|
if not isinstance(attrs[key], list):
|
||||||
|
attrs[key] = [attrs[key]]
|
||||||
|
attrs[key].append(value)
|
||||||
|
assert_attribute(accumulate, ["url1", "url2", "url3"])
|
||||||
|
|
||||||
|
def test_html5_attributes(self):
|
||||||
|
# The html.parser TreeBuilder can convert any entity named in
|
||||||
|
# the HTML5 spec to a sequence of Unicode characters, and
|
||||||
|
# convert those Unicode characters to a (potentially
|
||||||
|
# different) named entity on the way out.
|
||||||
|
for input_element, output_unicode, output_element in (
|
||||||
|
("⇄", '\u21c4', b'⇄'),
|
||||||
|
('⊧', '\u22a7', b'⊧'),
|
||||||
|
('𝔑', '\U0001d511', b'𝔑'),
|
||||||
|
('≧̸', '\u2267\u0338', b'≧̸'),
|
||||||
|
('¬', '\xac', b'¬'),
|
||||||
|
('⫬', '\u2aec', b'⫬'),
|
||||||
|
('"', '"', b'"'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
('∴', '\u2234', b'∴'),
|
||||||
|
("fj", 'fj', b'fj'),
|
||||||
|
("⊔", '\u2294', b'⊔'),
|
||||||
|
("⊔︀", '\u2294\ufe00', b'⊔︀'),
|
||||||
|
("'", "'", b"'"),
|
||||||
|
("|", "|", b"|"),
|
||||||
|
):
|
||||||
|
markup = '<div>%s</div>' % input_element
|
||||||
|
div = self.soup(markup).div
|
||||||
|
without_element = div.encode()
|
||||||
|
expect = b"<div>%s</div>" % output_unicode.encode("utf8")
|
||||||
|
self.assertEqual(without_element, expect)
|
||||||
|
|
||||||
|
with_element = div.encode(formatter="html")
|
||||||
|
expect = b"<div>%s</div>" % output_element
|
||||||
|
self.assertEqual(with_element, expect)
|
||||||
|
|
||||||
|
|
||||||
class TestHTMLParserSubclass(SoupTest):
|
class TestHTMLParserSubclass(SoupTest):
|
||||||
def test_error(self):
|
def test_error(self):
|
||||||
|
@ -58,4 +127,8 @@ class TestHTMLParserSubclass(SoupTest):
|
||||||
that doesn't cause a crash.
|
that doesn't cause a crash.
|
||||||
"""
|
"""
|
||||||
parser = BeautifulSoupHTMLParser()
|
parser = BeautifulSoupHTMLParser()
|
||||||
|
with warnings.catch_warnings(record=True) as warns:
|
||||||
parser.error("don't crash")
|
parser.error("don't crash")
|
||||||
|
[warning] = warns
|
||||||
|
assert "don't crash" == str(warning.message)
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
from pdb import set_trace
|
from pdb import set_trace
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
@ -10,6 +11,8 @@ import tempfile
|
||||||
from bs4 import (
|
from bs4 import (
|
||||||
BeautifulSoup,
|
BeautifulSoup,
|
||||||
BeautifulStoneSoup,
|
BeautifulStoneSoup,
|
||||||
|
GuessedAtParserWarning,
|
||||||
|
MarkupResemblesLocatorWarning,
|
||||||
)
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
TreeBuilder,
|
TreeBuilder,
|
||||||
|
@ -29,7 +32,6 @@ import bs4.dammit
|
||||||
from bs4.dammit import (
|
from bs4.dammit import (
|
||||||
EntitySubstitution,
|
EntitySubstitution,
|
||||||
UnicodeDammit,
|
UnicodeDammit,
|
||||||
EncodingDetector,
|
|
||||||
)
|
)
|
||||||
from bs4.testing import (
|
from bs4.testing import (
|
||||||
default_builder,
|
default_builder,
|
||||||
|
@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
|
||||||
self.store_line_numbers = False
|
self.store_line_numbers = False
|
||||||
self.cdata_list_attributes = []
|
self.cdata_list_attributes = []
|
||||||
self.preserve_whitespace_tags = []
|
self.preserve_whitespace_tags = []
|
||||||
|
self.string_containers = {}
|
||||||
def initialize_soup(self, soup):
|
def initialize_soup(self, soup):
|
||||||
pass
|
pass
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
|
@ -187,27 +190,68 @@ class TestConstructor(SoupTest):
|
||||||
for x in soup.recursiveChildGenerator()
|
for x in soup.recursiveChildGenerator()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_alternate_string_containers(self):
|
||||||
|
# Test the ability to customize the string containers for
|
||||||
|
# different types of tags.
|
||||||
|
class PString(NavigableString):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class BString(NavigableString):
|
||||||
|
pass
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
"<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
|
||||||
|
string_containers = {
|
||||||
|
'b': BString,
|
||||||
|
'p': PString,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# The string before the <p> tag is a regular NavigableString.
|
||||||
|
assert isinstance(soup.div.contents[0], NavigableString)
|
||||||
|
|
||||||
|
# The string inside the <p> tag, but not inside the <i> tag,
|
||||||
|
# is a PString.
|
||||||
|
assert isinstance(soup.p.contents[0], PString)
|
||||||
|
|
||||||
|
# Every string inside the <b> tag is a BString, even the one that
|
||||||
|
# was also inside an <i> tag.
|
||||||
|
for s in soup.b.strings:
|
||||||
|
assert isinstance(s, BString)
|
||||||
|
|
||||||
|
# Now that parsing was complete, the string_container_stack
|
||||||
|
# (where this information was kept) has been cleared out.
|
||||||
|
self.assertEqual([], soup.string_container_stack)
|
||||||
|
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
def _no_parser_specified(self, s, is_there=True):
|
def _assert_warning(self, warnings, cls):
|
||||||
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
|
for w in warnings:
|
||||||
self.assertTrue(v)
|
if isinstance(w.message, cls):
|
||||||
|
return w
|
||||||
|
raise Exception("%s warning not found in %r" % cls, warnings)
|
||||||
|
|
||||||
|
def _assert_no_parser_specified(self, w):
|
||||||
|
warning = self._assert_warning(w, GuessedAtParserWarning)
|
||||||
|
message = str(warning.message)
|
||||||
|
self.assertTrue(
|
||||||
|
message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
|
||||||
|
)
|
||||||
|
|
||||||
def test_warning_if_no_parser_specified(self):
|
def test_warning_if_no_parser_specified(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup("<a><b></b></a>")
|
soup = BeautifulSoup("<a><b></b></a>")
|
||||||
msg = str(w[0].message)
|
self._assert_no_parser_specified(w)
|
||||||
self._assert_no_parser_specified(msg)
|
|
||||||
|
|
||||||
def test_warning_if_parser_specified_too_vague(self):
|
def test_warning_if_parser_specified_too_vague(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup("<a><b></b></a>", "html")
|
soup = BeautifulSoup("<a><b></b></a>", "html")
|
||||||
msg = str(w[0].message)
|
self._assert_no_parser_specified(w)
|
||||||
self._assert_no_parser_specified(msg)
|
|
||||||
|
|
||||||
def test_no_warning_if_explicit_parser_specified(self):
|
def test_no_warning_if_explicit_parser_specified(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup("<a><b></b></a>", "html.parser")
|
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
|
||||||
self.assertEqual([], w)
|
self.assertEqual([], w)
|
||||||
|
|
||||||
def test_parseOnlyThese_renamed_to_parse_only(self):
|
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||||
|
@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
TypeError, self.soup, "<a>", no_such_argument=True)
|
TypeError, self.soup, "<a>", no_such_argument=True)
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
|
||||||
|
|
||||||
def test_disk_file_warning(self):
|
def test_disk_file_warning(self):
|
||||||
filehandle = tempfile.NamedTemporaryFile()
|
filehandle = tempfile.NamedTemporaryFile()
|
||||||
filename = filehandle.name
|
filename = filehandle.name
|
||||||
try:
|
try:
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup(filename)
|
soup = self.soup(filename)
|
||||||
msg = str(w[0].message)
|
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
|
||||||
self.assertTrue("looks like a filename" in msg)
|
self.assertTrue("looks like a filename" in str(warning.message))
|
||||||
finally:
|
finally:
|
||||||
filehandle.close()
|
filehandle.close()
|
||||||
|
|
||||||
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup(filename)
|
soup = self.soup(filename)
|
||||||
self.assertEqual(0, len(w))
|
self.assertEqual([], w)
|
||||||
|
|
||||||
|
def test_directory_warning(self):
|
||||||
|
try:
|
||||||
|
filename = tempfile.mkdtemp()
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
|
||||||
|
self.assertTrue("looks like a directory" in str(warning.message))
|
||||||
|
finally:
|
||||||
|
os.rmdir(filename)
|
||||||
|
|
||||||
|
# The directory no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = self.soup(filename)
|
||||||
|
self.assertEqual([], w)
|
||||||
|
|
||||||
def test_url_warning_with_bytes_url(self):
|
def test_url_warning_with_bytes_url(self):
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup(b"http://www.crummybytes.com/")
|
soup = self.soup(b"http://www.crummybytes.com/")
|
||||||
# Be aware this isn't the only warning that can be raised during
|
warning = self._assert_warning(
|
||||||
# execution..
|
warning_list, MarkupResemblesLocatorWarning
|
||||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
)
|
||||||
for w in warning_list))
|
self.assertTrue("looks like a URL" in str(warning.message))
|
||||||
|
|
||||||
def test_url_warning_with_unicode_url(self):
|
def test_url_warning_with_unicode_url(self):
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
# note - this url must differ from the bytes one otherwise
|
# note - this url must differ from the bytes one otherwise
|
||||||
# python's warnings system swallows the second warning
|
# python's warnings system swallows the second warning
|
||||||
soup = self.soup("http://www.crummyunicode.com/")
|
soup = self.soup("http://www.crummyunicode.com/")
|
||||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
warning = self._assert_warning(
|
||||||
for w in warning_list))
|
warning_list, MarkupResemblesLocatorWarning
|
||||||
|
)
|
||||||
|
self.assertTrue("looks like a URL" in str(warning.message))
|
||||||
|
|
||||||
def test_url_warning_with_bytes_and_space(self):
|
def test_url_warning_with_bytes_and_space(self):
|
||||||
|
# Here the markup contains something besides a URL, so no warning
|
||||||
|
# is issued.
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
|
@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
|
||||||
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
||||||
"‘’foo“”")
|
"‘’foo“”")
|
||||||
|
|
||||||
|
def test_html5_entity(self):
|
||||||
|
# Some HTML5 entities correspond to single- or multi-character
|
||||||
|
# Unicode sequences.
|
||||||
|
|
||||||
|
for entity, u in (
|
||||||
|
# A few spot checks of our ability to recognize
|
||||||
|
# special character sequences and convert them
|
||||||
|
# to named entities.
|
||||||
|
('⊧', '\u22a7'),
|
||||||
|
('𝔑', '\U0001d511'),
|
||||||
|
('≧̸', '\u2267\u0338'),
|
||||||
|
('¬', '\xac'),
|
||||||
|
('⫬', '\u2aec'),
|
||||||
|
|
||||||
|
# We _could_ convert | to &verbarr;, but we don't, because
|
||||||
|
# | is an ASCII character.
|
||||||
|
('|' '|'),
|
||||||
|
|
||||||
|
# Similarly for the fj ligature, which we could convert to
|
||||||
|
# fj, but we don't.
|
||||||
|
("fj", "fj"),
|
||||||
|
|
||||||
|
# We do convert _these_ ASCII characters to HTML entities,
|
||||||
|
# because that's required to generate valid HTML.
|
||||||
|
('>', '>'),
|
||||||
|
('<', '<'),
|
||||||
|
('&', '&'),
|
||||||
|
):
|
||||||
|
template = '3 %s 4'
|
||||||
|
raw = template % u
|
||||||
|
with_entities = template % entity
|
||||||
|
self.assertEqual(self.sub.substitute_html(raw), with_entities)
|
||||||
|
|
||||||
|
def test_html5_entity_with_variation_selector(self):
|
||||||
|
# Some HTML5 entities correspond either to a single-character
|
||||||
|
# Unicode sequence _or_ to the same character plus U+FE00,
|
||||||
|
# VARIATION SELECTOR 1. We can handle this.
|
||||||
|
data = "fjords \u2294 penguins"
|
||||||
|
markup = "fjords ⊔ penguins"
|
||||||
|
self.assertEqual(self.sub.substitute_html(data), markup)
|
||||||
|
|
||||||
|
data = "fjords \u2294\ufe00 penguins"
|
||||||
|
markup = "fjords ⊔︀ penguins"
|
||||||
|
self.assertEqual(self.sub.substitute_html(data), markup)
|
||||||
|
|
||||||
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
||||||
s = 'Welcome to "my bar"'
|
s = 'Welcome to "my bar"'
|
||||||
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
||||||
|
@ -416,225 +522,6 @@ class TestEncodingConversion(SoupTest):
|
||||||
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||||
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||||
|
|
||||||
class TestUnicodeDammit(unittest.TestCase):
|
|
||||||
"""Standalone tests of UnicodeDammit."""
|
|
||||||
|
|
||||||
def test_unicode_input(self):
|
|
||||||
markup = "I'm already Unicode! \N{SNOWMAN}"
|
|
||||||
dammit = UnicodeDammit(markup)
|
|
||||||
self.assertEqual(dammit.unicode_markup, markup)
|
|
||||||
|
|
||||||
def test_smart_quotes_to_unicode(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup)
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_xml_entities(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_html_entities(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="html")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
|
|
||||||
|
|
||||||
def test_smart_quotes_to_ascii(self):
|
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
|
||||||
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
|
|
||||||
self.assertEqual(
|
|
||||||
dammit.unicode_markup, """<foo>''""</foo>""")
|
|
||||||
|
|
||||||
def test_detect_utf8(self):
|
|
||||||
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
|
||||||
dammit = UnicodeDammit(utf8)
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
|
|
||||||
|
|
||||||
|
|
||||||
def test_convert_hebrew(self):
|
|
||||||
hebrew = b"\xed\xe5\xec\xf9"
|
|
||||||
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
|
||||||
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
|
|
||||||
|
|
||||||
def test_dont_see_smart_quotes_where_there_are_none(self):
|
|
||||||
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
|
||||||
dammit = UnicodeDammit(utf_8)
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
|
||||||
|
|
||||||
def test_ignore_inappropriate_codecs(self):
|
|
||||||
utf8_data = "Räksmörgås".encode("utf-8")
|
|
||||||
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
|
|
||||||
def test_ignore_invalid_codecs(self):
|
|
||||||
utf8_data = "Räksmörgås".encode("utf-8")
|
|
||||||
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
|
||||||
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
|
||||||
|
|
||||||
def test_exclude_encodings(self):
|
|
||||||
# This is UTF-8.
|
|
||||||
utf8_data = "Räksmörgås".encode("utf-8")
|
|
||||||
|
|
||||||
# But if we exclude UTF-8 from consideration, the guess is
|
|
||||||
# Windows-1252.
|
|
||||||
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
|
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
|
|
||||||
|
|
||||||
# And if we exclude that, there is no valid guess at all.
|
|
||||||
dammit = UnicodeDammit(
|
|
||||||
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
|
|
||||||
self.assertEqual(dammit.original_encoding, None)
|
|
||||||
|
|
||||||
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
|
|
||||||
detected = EncodingDetector(
|
|
||||||
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
|
|
||||||
encodings = list(detected.encodings)
|
|
||||||
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
|
|
||||||
|
|
||||||
def test_detect_html5_style_meta_tag(self):
|
|
||||||
|
|
||||||
for data in (
|
|
||||||
b'<html><meta charset="euc-jp" /></html>',
|
|
||||||
b"<html><meta charset='euc-jp' /></html>",
|
|
||||||
b"<html><meta charset=euc-jp /></html>",
|
|
||||||
b"<html><meta charset=euc-jp/></html>"):
|
|
||||||
dammit = UnicodeDammit(data, is_html=True)
|
|
||||||
self.assertEqual(
|
|
||||||
"euc-jp", dammit.original_encoding)
|
|
||||||
|
|
||||||
def test_last_ditch_entity_replacement(self):
|
|
||||||
# This is a UTF-8 document that contains bytestrings
|
|
||||||
# completely incompatible with UTF-8 (ie. encoded with some other
|
|
||||||
# encoding).
|
|
||||||
#
|
|
||||||
# Since there is no consistent encoding for the document,
|
|
||||||
# Unicode, Dammit will eventually encode the document as UTF-8
|
|
||||||
# and encode the incompatible characters as REPLACEMENT
|
|
||||||
# CHARACTER.
|
|
||||||
#
|
|
||||||
# If chardet is installed, it will detect that the document
|
|
||||||
# can be converted into ISO-8859-1 without errors. This happens
|
|
||||||
# to be the wrong encoding, but it is a consistent encoding, so the
|
|
||||||
# code we're testing here won't run.
|
|
||||||
#
|
|
||||||
# So we temporarily disable chardet if it's present.
|
|
||||||
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<html><b>\330\250\330\252\330\261</b>
|
|
||||||
<i>\310\322\321\220\312\321\355\344</i></html>"""
|
|
||||||
chardet = bs4.dammit.chardet_dammit
|
|
||||||
logging.disable(logging.WARNING)
|
|
||||||
try:
|
|
||||||
def noop(str):
|
|
||||||
return None
|
|
||||||
bs4.dammit.chardet_dammit = noop
|
|
||||||
dammit = UnicodeDammit(doc)
|
|
||||||
self.assertEqual(True, dammit.contains_replacement_characters)
|
|
||||||
self.assertTrue("\ufffd" in dammit.unicode_markup)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(doc, "html.parser")
|
|
||||||
self.assertTrue(soup.contains_replacement_characters)
|
|
||||||
finally:
|
|
||||||
logging.disable(logging.NOTSET)
|
|
||||||
bs4.dammit.chardet_dammit = chardet
|
|
||||||
|
|
||||||
def test_byte_order_mark_removed(self):
|
|
||||||
# A document written in UTF-16LE will have its byte order marker stripped.
|
|
||||||
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
|
||||||
dammit = UnicodeDammit(data)
|
|
||||||
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
|
|
||||||
self.assertEqual("utf-16le", dammit.original_encoding)
|
|
||||||
|
|
||||||
def test_detwingle(self):
|
|
||||||
# Here's a UTF8 document.
|
|
||||||
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
|
|
||||||
|
|
||||||
# Here's a Windows-1252 document.
|
|
||||||
windows_1252 = (
|
|
||||||
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
|
||||||
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
|
||||||
|
|
||||||
# Through some unholy alchemy, they've been stuck together.
|
|
||||||
doc = utf8 + windows_1252 + utf8
|
|
||||||
|
|
||||||
# The document can't be turned into UTF-8:
|
|
||||||
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
|
|
||||||
|
|
||||||
# Unicode, Dammit thinks the whole document is Windows-1252,
|
|
||||||
# and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"
|
|
||||||
|
|
||||||
# But if we run it through UnicodeDammit.detwingle, it's fixed:
|
|
||||||
|
|
||||||
fixed = UnicodeDammit.detwingle(doc)
|
|
||||||
self.assertEqual(
|
|
||||||
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
|
||||||
|
|
||||||
def test_detwingle_ignores_multibyte_characters(self):
|
|
||||||
# Each of these characters has a UTF-8 representation ending
|
|
||||||
# in \x93. \x93 is a smart quote if interpreted as
|
|
||||||
# Windows-1252. But our code knows to skip over multibyte
|
|
||||||
# UTF-8 characters, so they'll survive the process unscathed.
|
|
||||||
for tricky_unicode_char in (
|
|
||||||
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
|
||||||
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
|
||||||
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
|
||||||
):
|
|
||||||
input = tricky_unicode_char.encode("utf8")
|
|
||||||
self.assertTrue(input.endswith(b'\x93'))
|
|
||||||
output = UnicodeDammit.detwingle(input)
|
|
||||||
self.assertEqual(output, input)
|
|
||||||
|
|
||||||
def test_find_declared_encoding(self):
|
|
||||||
# Test our ability to find a declared encoding inside an
|
|
||||||
# XML or HTML document.
|
|
||||||
#
|
|
||||||
# Even if the document comes in as Unicode, it may be
|
|
||||||
# interesting to know what encoding was claimed
|
|
||||||
# originally.
|
|
||||||
|
|
||||||
html_unicode = '<html><head><meta charset="utf-8"></head></html>'
|
|
||||||
html_bytes = html_unicode.encode("ascii")
|
|
||||||
|
|
||||||
xml_unicode= '<?xml version="1.0" encoding="ISO-8859-1" ?>'
|
|
||||||
xml_bytes = xml_unicode.encode("ascii")
|
|
||||||
|
|
||||||
m = EncodingDetector.find_declared_encoding
|
|
||||||
self.assertEqual(None, m(html_unicode, is_html=False))
|
|
||||||
self.assertEqual("utf-8", m(html_unicode, is_html=True))
|
|
||||||
self.assertEqual("utf-8", m(html_bytes, is_html=True))
|
|
||||||
|
|
||||||
self.assertEqual("iso-8859-1", m(xml_unicode))
|
|
||||||
self.assertEqual("iso-8859-1", m(xml_bytes))
|
|
||||||
|
|
||||||
# Normally, only the first few kilobytes of a document are checked for
|
|
||||||
# an encoding.
|
|
||||||
spacer = b' ' * 5000
|
|
||||||
self.assertEqual(None, m(spacer + html_bytes))
|
|
||||||
self.assertEqual(None, m(spacer + xml_bytes))
|
|
||||||
|
|
||||||
# But you can tell find_declared_encoding to search an entire
|
|
||||||
# HTML document.
|
|
||||||
self.assertEqual(
|
|
||||||
"utf-8",
|
|
||||||
m(spacer + html_bytes, is_html=True, search_entire_document=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
# The XML encoding declaration has to be the very first thing
|
|
||||||
# in the document. We'll allow whitespace before the document
|
|
||||||
# starts, but nothing else.
|
|
||||||
self.assertEqual(
|
|
||||||
"iso-8859-1",
|
|
||||||
m(xml_bytes, search_entire_document=True)
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
None, m(b'a' + xml_bytes, search_entire_document=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
class TestNamedspacedAttribute(SoupTest):
|
class TestNamedspacedAttribute(SoupTest):
|
||||||
|
|
||||||
|
@ -642,9 +529,19 @@ class TestNamedspacedAttribute(SoupTest):
|
||||||
a = NamespacedAttribute("xmlns", None)
|
a = NamespacedAttribute("xmlns", None)
|
||||||
self.assertEqual(a, "xmlns")
|
self.assertEqual(a, "xmlns")
|
||||||
|
|
||||||
|
a = NamespacedAttribute("xmlns", "")
|
||||||
|
self.assertEqual(a, "xmlns")
|
||||||
|
|
||||||
a = NamespacedAttribute("xmlns")
|
a = NamespacedAttribute("xmlns")
|
||||||
self.assertEqual(a, "xmlns")
|
self.assertEqual(a, "xmlns")
|
||||||
|
|
||||||
|
def test_namespace_may_be_none_or_missing(self):
|
||||||
|
a = NamespacedAttribute(None, "tag")
|
||||||
|
self.assertEqual(a, "tag")
|
||||||
|
|
||||||
|
a = NamespacedAttribute("", "tag")
|
||||||
|
self.assertEqual(a, "tag")
|
||||||
|
|
||||||
def test_attribute_is_equivalent_to_colon_separated_string(self):
|
def test_attribute_is_equivalent_to_colon_separated_string(self):
|
||||||
a = NamespacedAttribute("a", "b")
|
a = NamespacedAttribute("a", "b")
|
||||||
self.assertEqual("a:b", a)
|
self.assertEqual("a:b", a)
|
||||||
|
|
|
@ -27,13 +27,17 @@ from bs4.element import (
|
||||||
Doctype,
|
Doctype,
|
||||||
Formatter,
|
Formatter,
|
||||||
NavigableString,
|
NavigableString,
|
||||||
|
Script,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
Stylesheet,
|
||||||
Tag,
|
Tag,
|
||||||
|
TemplateString,
|
||||||
)
|
)
|
||||||
from bs4.testing import (
|
from bs4.testing import (
|
||||||
SoupTest,
|
SoupTest,
|
||||||
skipIf,
|
skipIf,
|
||||||
)
|
)
|
||||||
|
from soupsieve import SelectorSyntaxError
|
||||||
|
|
||||||
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
|
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
|
||||||
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
|
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
|
||||||
|
@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
|
||||||
soup.a.extend(l)
|
soup.a.extend(l)
|
||||||
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
|
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
|
||||||
|
|
||||||
|
def test_extend_with_another_tags_contents(self):
|
||||||
|
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
|
||||||
|
soup = self.soup(data)
|
||||||
|
d1 = soup.find('div', id='d1')
|
||||||
|
d2 = soup.find('div', id='d2')
|
||||||
|
d2.extend(d1)
|
||||||
|
self.assertEqual('<div id="d1"></div>', d1.decode())
|
||||||
|
self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
|
||||||
|
|
||||||
def test_move_tag_to_beginning_of_parent(self):
|
def test_move_tag_to_beginning_of_parent(self):
|
||||||
data = "<a><b></b><c></c><d></d></a>"
|
data = "<a><b></b><c></c><d></d></a>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
|
@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
|
||||||
self.assertEqual(no.next_element, "no")
|
self.assertEqual(no.next_element, "no")
|
||||||
self.assertEqual(no.next_sibling, " business")
|
self.assertEqual(no.next_sibling, " business")
|
||||||
|
|
||||||
|
def test_replace_with_errors(self):
|
||||||
|
# Can't replace a tag that's not part of a tree.
|
||||||
|
a_tag = Tag(name="a")
|
||||||
|
self.assertRaises(ValueError, a_tag.replace_with, "won't work")
|
||||||
|
|
||||||
|
# Can't replace a tag with its parent.
|
||||||
|
a_tag = self.soup("<a><b></b></a>").a
|
||||||
|
self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
|
||||||
|
|
||||||
|
# Or with a list that includes its parent.
|
||||||
|
self.assertRaises(ValueError, a_tag.b.replace_with,
|
||||||
|
"string1", a_tag, "string2")
|
||||||
|
|
||||||
|
def test_replace_with_multiple(self):
|
||||||
|
data = "<a><b></b><c></c></a>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
d_tag = soup.new_tag("d")
|
||||||
|
d_tag.string = "Text In D Tag"
|
||||||
|
e_tag = soup.new_tag("e")
|
||||||
|
f_tag = soup.new_tag("f")
|
||||||
|
a_string = "Random Text"
|
||||||
|
soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
|
||||||
|
self.assertEqual(
|
||||||
|
"<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
|
||||||
|
soup.decode()
|
||||||
|
)
|
||||||
|
assert soup.b.next_element == d_tag
|
||||||
|
assert d_tag.string.next_element==e_tag
|
||||||
|
assert e_tag.next_element.string == a_string
|
||||||
|
assert e_tag.next_element.next_element == f_tag
|
||||||
|
|
||||||
def test_replace_first_child(self):
|
def test_replace_first_child(self):
|
||||||
data = "<a><b></b><c></c></a>"
|
data = "<a><b></b><c></c></a>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
|
@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
|
||||||
a.clear(decompose=True)
|
a.clear(decompose=True)
|
||||||
self.assertEqual(0, len(em.contents))
|
self.assertEqual(0, len(em.contents))
|
||||||
|
|
||||||
|
|
||||||
|
def test_decompose(self):
|
||||||
|
# Test PageElement.decompose() and PageElement.decomposed
|
||||||
|
soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
|
||||||
|
p1, p2 = soup.find_all('p')
|
||||||
|
a = p1.a
|
||||||
|
text = p1.em.string
|
||||||
|
for i in [p1, p2, a, text]:
|
||||||
|
self.assertEqual(False, i.decomposed)
|
||||||
|
|
||||||
|
# This sets p1 and everything beneath it to decomposed.
|
||||||
|
p1.decompose()
|
||||||
|
for i in [p1, a, text]:
|
||||||
|
self.assertEqual(True, i.decomposed)
|
||||||
|
# p2 is unaffected.
|
||||||
|
self.assertEqual(False, p2.decomposed)
|
||||||
|
|
||||||
def test_string_set(self):
|
def test_string_set(self):
|
||||||
"""Tag.string = 'string'"""
|
"""Tag.string = 'string'"""
|
||||||
soup = self.soup("<a></a> <b><c></c></b>")
|
soup = self.soup("<a></a> <b><c></c></b>")
|
||||||
|
@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
|
||||||
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
|
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
|
||||||
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
|
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
|
||||||
|
|
||||||
def test_get_text_ignores_comments(self):
|
def test_get_text_ignores_special_string_containers(self):
|
||||||
soup = self.soup("foo<!--IGNORE-->bar")
|
soup = self.soup("foo<!--IGNORE-->bar")
|
||||||
self.assertEqual(soup.get_text(), "foobar")
|
self.assertEqual(soup.get_text(), "foobar")
|
||||||
|
|
||||||
|
@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.get_text(types=None), "fooIGNOREbar")
|
soup.get_text(types=None), "fooIGNOREbar")
|
||||||
|
|
||||||
def test_all_strings_ignores_comments(self):
|
soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
|
||||||
|
self.assertEqual(soup.get_text(), "foobar")
|
||||||
|
|
||||||
|
def test_all_strings_ignores_special_string_containers(self):
|
||||||
soup = self.soup("foo<!--IGNORE-->bar")
|
soup = self.soup("foo<!--IGNORE-->bar")
|
||||||
self.assertEqual(['foo', 'bar'], list(soup.strings))
|
self.assertEqual(['foo', 'bar'], list(soup.strings))
|
||||||
|
|
||||||
|
soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
|
||||||
|
self.assertEqual(['foo', 'bar'], list(soup.strings))
|
||||||
|
|
||||||
|
def test_string_methods_inside_special_string_container_tags(self):
|
||||||
|
# Strings inside tags like <script> are generally ignored by
|
||||||
|
# methods like get_text, because they're not what humans
|
||||||
|
# consider 'text'. But if you call get_text on the <script>
|
||||||
|
# tag itself, those strings _are_ considered to be 'text',
|
||||||
|
# because there's nothing else you might be looking for.
|
||||||
|
|
||||||
|
style = self.soup("<div>a<style>Some CSS</style></div>")
|
||||||
|
template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
|
||||||
|
script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
|
||||||
|
|
||||||
|
self.assertEqual(style.div.get_text(), "a")
|
||||||
|
self.assertEqual(list(style.div.strings), ["a"])
|
||||||
|
self.assertEqual(style.div.style.get_text(), "Some CSS")
|
||||||
|
self.assertEqual(list(style.div.style.strings),
|
||||||
|
['Some CSS'])
|
||||||
|
|
||||||
|
# The comment is not picked up here. That's because it was
|
||||||
|
# parsed into a Comment object, which is not considered
|
||||||
|
# interesting by template.strings.
|
||||||
|
self.assertEqual(template.div.get_text(), "a")
|
||||||
|
self.assertEqual(list(template.div.strings), ["a"])
|
||||||
|
self.assertEqual(template.div.template.get_text(), "Templated text.")
|
||||||
|
self.assertEqual(list(template.div.template.strings),
|
||||||
|
["Templated ", "text", "."])
|
||||||
|
|
||||||
|
# The comment is included here, because it didn't get parsed
|
||||||
|
# into a Comment object--it's part of the Script string.
|
||||||
|
self.assertEqual(script.div.get_text(), "a")
|
||||||
|
self.assertEqual(list(script.div.strings), ["a"])
|
||||||
|
self.assertEqual(script.div.script.get_text(),
|
||||||
|
"<!--a comment-->Some text")
|
||||||
|
self.assertEqual(list(script.div.script.strings),
|
||||||
|
['<!--a comment-->Some text'])
|
||||||
|
|
||||||
class TestCDAtaListAttributes(SoupTest):
|
class TestCDAtaListAttributes(SoupTest):
|
||||||
|
|
||||||
"""Testing cdata-list attributes like 'class'.
|
"""Testing cdata-list attributes like 'class'.
|
||||||
|
@ -1775,70 +1877,6 @@ class TestEncoding(SoupTest):
|
||||||
else:
|
else:
|
||||||
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
|
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
|
||||||
|
|
||||||
class TestFormatter(SoupTest):
|
|
||||||
|
|
||||||
def test_sort_attributes(self):
|
|
||||||
# Test the ability to override Formatter.attributes() to,
|
|
||||||
# e.g., disable the normal sorting of attributes.
|
|
||||||
class UnsortedFormatter(Formatter):
|
|
||||||
def attributes(self, tag):
|
|
||||||
self.called_with = tag
|
|
||||||
for k, v in sorted(tag.attrs.items()):
|
|
||||||
if k == 'ignore':
|
|
||||||
continue
|
|
||||||
yield k,v
|
|
||||||
|
|
||||||
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
|
|
||||||
formatter = UnsortedFormatter()
|
|
||||||
decoded = soup.decode(formatter=formatter)
|
|
||||||
|
|
||||||
# attributes() was called on the <p> tag. It filtered out one
|
|
||||||
# attribute and sorted the other two.
|
|
||||||
self.assertEqual(formatter.called_with, soup.p)
|
|
||||||
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
|
|
||||||
|
|
||||||
|
|
||||||
class TestNavigableStringSubclasses(SoupTest):
|
|
||||||
|
|
||||||
def test_cdata(self):
|
|
||||||
# None of the current builders turn CDATA sections into CData
|
|
||||||
# objects, but you can create them manually.
|
|
||||||
soup = self.soup("")
|
|
||||||
cdata = CData("foo")
|
|
||||||
soup.insert(1, cdata)
|
|
||||||
self.assertEqual(str(soup), "<![CDATA[foo]]>")
|
|
||||||
self.assertEqual(soup.find(text="foo"), "foo")
|
|
||||||
self.assertEqual(soup.contents[0], "foo")
|
|
||||||
|
|
||||||
def test_cdata_is_never_formatted(self):
|
|
||||||
"""Text inside a CData object is passed into the formatter.
|
|
||||||
|
|
||||||
But the return value is ignored.
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.count = 0
|
|
||||||
def increment(*args):
|
|
||||||
self.count += 1
|
|
||||||
return "BITTER FAILURE"
|
|
||||||
|
|
||||||
soup = self.soup("")
|
|
||||||
cdata = CData("<><><>")
|
|
||||||
soup.insert(1, cdata)
|
|
||||||
self.assertEqual(
|
|
||||||
b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
|
|
||||||
self.assertEqual(1, self.count)
|
|
||||||
|
|
||||||
def test_doctype_ends_in_newline(self):
|
|
||||||
# Unlike other NavigableString subclasses, a DOCTYPE always ends
|
|
||||||
# in a newline.
|
|
||||||
doctype = Doctype("foo")
|
|
||||||
soup = self.soup("")
|
|
||||||
soup.insert(1, doctype)
|
|
||||||
self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
|
|
||||||
|
|
||||||
def test_declaration(self):
|
|
||||||
d = Declaration("foo")
|
|
||||||
self.assertEqual("<?foo?>", d.output_ready())
|
|
||||||
|
|
||||||
class TestSoupSelector(TreeTest):
|
class TestSoupSelector(TreeTest):
|
||||||
|
|
||||||
|
@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual(len(self.soup.select('del')), 0)
|
self.assertEqual(len(self.soup.select('del')), 0)
|
||||||
|
|
||||||
def test_invalid_tag(self):
|
def test_invalid_tag(self):
|
||||||
self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
|
self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')
|
||||||
|
|
||||||
def test_select_dashed_tag_ids(self):
|
def test_select_dashed_tag_ids(self):
|
||||||
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
|
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
|
||||||
|
@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
|
||||||
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
|
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
|
||||||
|
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
SyntaxError, self.soup.select, "a:nth-of-type(a)")
|
SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")
|
||||||
|
|
||||||
def test_nth_of_type(self):
|
def test_nth_of_type(self):
|
||||||
# Try to select first paragraph
|
# Try to select first paragraph
|
||||||
|
@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
||||||
|
|
||||||
def test_dangling_combinator(self):
|
def test_dangling_combinator(self):
|
||||||
self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
|
self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')
|
||||||
|
|
||||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||||
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||||
|
@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
||||||
|
|
||||||
def test_invalid_multiple_select(self):
|
def test_invalid_multiple_select(self):
|
||||||
self.assertRaises(SyntaxError, self.soup.select, ',x, y')
|
self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
|
||||||
self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
|
self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')
|
||||||
|
|
||||||
def test_multiple_select_attrs(self):
|
def test_multiple_select_attrs(self):
|
||||||
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue