mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 05:01:14 -07:00
Bump beautifulsoup4 from 4.11.2 to 4.12.2 (#2037)
* Bump beautifulsoup4 from 4.11.2 to 4.12.2 Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.2 to 4.12.2. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * Update beautifulsoup4==4.12.2 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
This commit is contained in:
parent
1798594569
commit
e70e08c3f5
32 changed files with 1439 additions and 755 deletions
|
@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.11.2"
|
__version__ = "4.12.2"
|
||||||
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
|
||||||
# Use of this source code is governed by the MIT license.
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
@ -38,11 +38,13 @@ from .builder import (
|
||||||
builder_registry,
|
builder_registry,
|
||||||
ParserRejectedMarkup,
|
ParserRejectedMarkup,
|
||||||
XMLParsedAsHTMLWarning,
|
XMLParsedAsHTMLWarning,
|
||||||
|
HTMLParserTreeBuilder
|
||||||
)
|
)
|
||||||
from .dammit import UnicodeDammit
|
from .dammit import UnicodeDammit
|
||||||
from .element import (
|
from .element import (
|
||||||
CData,
|
CData,
|
||||||
Comment,
|
Comment,
|
||||||
|
CSS,
|
||||||
DEFAULT_OUTPUT_ENCODING,
|
DEFAULT_OUTPUT_ENCODING,
|
||||||
Declaration,
|
Declaration,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -116,7 +118,7 @@ class BeautifulSoup(Tag):
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
element_classes=None, **kwargs):
|
element_classes=None, **kwargs):
|
||||||
|
@ -348,25 +350,49 @@ class BeautifulSoup(Tag):
|
||||||
self.markup = None
|
self.markup = None
|
||||||
self.builder.soup = None
|
self.builder.soup = None
|
||||||
|
|
||||||
def __copy__(self):
|
def _clone(self):
|
||||||
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
|
"""Create a new BeautifulSoup object with the same TreeBuilder,
|
||||||
copy = type(self)(
|
but not associated with any markup.
|
||||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Although we encoded the tree to UTF-8, that may not have
|
This is the first step of the deepcopy process.
|
||||||
# been the encoding of the original markup. Set the copy's
|
"""
|
||||||
# .original_encoding to reflect the original object's
|
clone = type(self)("", None, self.builder)
|
||||||
# .original_encoding.
|
|
||||||
copy.original_encoding = self.original_encoding
|
|
||||||
return copy
|
|
||||||
|
|
||||||
|
# Keep track of the encoding of the original document,
|
||||||
|
# since we won't be parsing it again.
|
||||||
|
clone.original_encoding = self.original_encoding
|
||||||
|
return clone
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
# Frequently a tree builder can't be pickled.
|
# Frequently a tree builder can't be pickled.
|
||||||
d = dict(self.__dict__)
|
d = dict(self.__dict__)
|
||||||
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
||||||
d['builder'] = None
|
d['builder'] = type(self.builder)
|
||||||
|
# Store the contents as a Unicode string.
|
||||||
|
d['contents'] = []
|
||||||
|
d['markup'] = self.decode()
|
||||||
|
|
||||||
|
# If _most_recent_element is present, it's a Tag object left
|
||||||
|
# over from initial parse. It might not be picklable and we
|
||||||
|
# don't need it.
|
||||||
|
if '_most_recent_element' in d:
|
||||||
|
del d['_most_recent_element']
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
# If necessary, restore the TreeBuilder by looking it up.
|
||||||
|
self.__dict__ = state
|
||||||
|
if isinstance(self.builder, type):
|
||||||
|
self.builder = self.builder()
|
||||||
|
elif not self.builder:
|
||||||
|
# We don't know which builder was used to build this
|
||||||
|
# parse tree, so use a default we know is always available.
|
||||||
|
self.builder = HTMLParserTreeBuilder()
|
||||||
|
self.builder.soup = self
|
||||||
|
self.reset()
|
||||||
|
self._feed()
|
||||||
|
return state
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _decode_markup(cls, markup):
|
def _decode_markup(cls, markup):
|
||||||
|
@ -468,6 +494,7 @@ class BeautifulSoup(Tag):
|
||||||
self.open_tag_counter = Counter()
|
self.open_tag_counter = Counter()
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.string_container_stack = []
|
self.string_container_stack = []
|
||||||
|
self._most_recent_element = None
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||||
|
@ -749,7 +776,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def decode(self, pretty_print=False,
|
def decode(self, pretty_print=False,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal", iterator=None):
|
||||||
"""Returns a string or Unicode representation of the parse tree
|
"""Returns a string or Unicode representation of the parse tree
|
||||||
as an HTML or XML document.
|
as an HTML or XML document.
|
||||||
|
|
||||||
|
@ -776,7 +803,7 @@ class BeautifulSoup(Tag):
|
||||||
else:
|
else:
|
||||||
indent_level = 0
|
indent_level = 0
|
||||||
return prefix + super(BeautifulSoup, self).decode(
|
return prefix + super(BeautifulSoup, self).decode(
|
||||||
indent_level, eventual_encoding, formatter)
|
indent_level, eventual_encoding, formatter, iterator)
|
||||||
|
|
||||||
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
||||||
_s = BeautifulSoup
|
_s = BeautifulSoup
|
||||||
|
|
|
@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||||
|
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
DetectsXMLParsedAsHTML,
|
DetectsXMLParsedAsHTML,
|
||||||
|
ParserRejectedMarkup,
|
||||||
HTML,
|
HTML,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
STRICT,
|
STRICT,
|
||||||
|
@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
|
|
||||||
self._initialize_xml_detector()
|
self._initialize_xml_detector()
|
||||||
|
|
||||||
|
def error(self, message):
|
||||||
|
# NOTE: This method is required so long as Python 3.9 is
|
||||||
|
# supported. The corresponding code is removed from HTMLParser
|
||||||
|
# in 3.5, but not removed from ParserBase until 3.10.
|
||||||
|
# https://github.com/python/cpython/issues/76025
|
||||||
|
#
|
||||||
|
# The original implementation turned the error into a warning,
|
||||||
|
# but in every case I discovered, this made HTMLParser
|
||||||
|
# immediately crash with an error message that was less
|
||||||
|
# helpful than the warning. The new implementation makes it
|
||||||
|
# more clear that html.parser just can't parse this
|
||||||
|
# markup. The 3.10 implementation does the same, though it
|
||||||
|
# raises AssertionError rather than calling a method. (We
|
||||||
|
# catch this error and wrap it in a ParserRejectedMarkup.)
|
||||||
|
raise ParserRejectedMarkup(message)
|
||||||
|
|
||||||
def handle_startendtag(self, name, attrs):
|
def handle_startendtag(self, name, attrs):
|
||||||
"""Handle an incoming empty-element tag.
|
"""Handle an incoming empty-element tag.
|
||||||
|
|
||||||
|
@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
args, kwargs = self.parser_args
|
args, kwargs = self.parser_args
|
||||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
parser.feed(markup)
|
try:
|
||||||
|
parser.feed(markup)
|
||||||
|
except AssertionError as e:
|
||||||
|
# html.parser raises AssertionError in rare cases to
|
||||||
|
# indicate a fatal problem with the markup, especially
|
||||||
|
# when there's an error in the doctype declaration.
|
||||||
|
raise ParserRejectedMarkup(e)
|
||||||
parser.close()
|
parser.close()
|
||||||
parser.already_closed_empty_element = []
|
parser.already_closed_empty_element = []
|
||||||
|
|
280
lib/bs4/css.py
Normal file
280
lib/bs4/css.py
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
try:
|
||||||
|
import soupsieve
|
||||||
|
except ImportError as e:
|
||||||
|
soupsieve = None
|
||||||
|
warnings.warn(
|
||||||
|
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class CSS(object):
|
||||||
|
"""A proxy object against the soupsieve library, to simplify its
|
||||||
|
CSS selector API.
|
||||||
|
|
||||||
|
Acquire this object through the .css attribute on the
|
||||||
|
BeautifulSoup object, or on the Tag you want to use as the
|
||||||
|
starting point for a CSS selector.
|
||||||
|
|
||||||
|
The main advantage of doing this is that the tag to be selected
|
||||||
|
against doesn't need to be explicitly specified in the function
|
||||||
|
calls, since it's already scoped to a tag.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, tag, api=soupsieve):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
|
You don't need to instantiate this class yourself; instead,
|
||||||
|
access the .css attribute on the BeautifulSoup object, or on
|
||||||
|
the Tag you want to use as the starting point for your CSS
|
||||||
|
selector.
|
||||||
|
|
||||||
|
:param tag: All CSS selectors will use this as their starting
|
||||||
|
point.
|
||||||
|
|
||||||
|
:param api: A plug-in replacement for the soupsieve module,
|
||||||
|
designed mainly for use in tests.
|
||||||
|
"""
|
||||||
|
if api is None:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
||||||
|
)
|
||||||
|
self.api = api
|
||||||
|
self.tag = tag
|
||||||
|
|
||||||
|
def escape(self, ident):
|
||||||
|
"""Escape a CSS identifier.
|
||||||
|
|
||||||
|
This is a simple wrapper around soupsieve.escape(). See the
|
||||||
|
documentation for that function for more information.
|
||||||
|
"""
|
||||||
|
if soupsieve is None:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Cannot escape CSS identifiers because the soupsieve package is not installed."
|
||||||
|
)
|
||||||
|
return self.api.escape(ident)
|
||||||
|
|
||||||
|
def _ns(self, ns, select):
|
||||||
|
"""Normalize a dictionary of namespaces."""
|
||||||
|
if not isinstance(select, self.api.SoupSieve) and ns is None:
|
||||||
|
# If the selector is a precompiled pattern, it already has
|
||||||
|
# a namespace context compiled in, which cannot be
|
||||||
|
# replaced.
|
||||||
|
ns = self.tag._namespaces
|
||||||
|
return ns
|
||||||
|
|
||||||
|
def _rs(self, results):
|
||||||
|
Normalize a list of results to a ResultSet.
|
||||||
|
|
||||||
|
A ResultSet is more consistent with the rest of Beautiful
|
||||||
|
Soup's API, and ResultSet.__getattr__ has a helpful error
|
||||||
|
message if you try to treat a list of results as a single
|
||||||
|
result (a common mistake).
|
||||||
|
"""
|
||||||
|
# Import here to avoid circular import
|
||||||
|
from bs4.element import ResultSet
|
||||||
|
return ResultSet(None, results)
|
||||||
|
|
||||||
|
def compile(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Pre-compile a selector and return the compiled object.
|
||||||
|
|
||||||
|
:param select: A CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.compile() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.compile() method.
|
||||||
|
|
||||||
|
:return: A precompiled selector object.
|
||||||
|
:rtype: soupsieve.SoupSieve
|
||||||
|
"""
|
||||||
|
return self.api.compile(
|
||||||
|
select, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def select_one(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag and return the
|
||||||
|
first result.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.select_one()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param select: A CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.select_one() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.select_one() method.
|
||||||
|
|
||||||
|
:return: A Tag, or None if the selector has no match.
|
||||||
|
:rtype: bs4.element.Tag
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.api.select_one(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.select()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param select: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param limit: After finding this number of results, stop looking.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.select() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.select() method.
|
||||||
|
|
||||||
|
:return: A ResultSet of Tag objects.
|
||||||
|
:rtype: bs4.element.ResultSet
|
||||||
|
|
||||||
|
"""
|
||||||
|
if limit is None:
|
||||||
|
limit = 0
|
||||||
|
|
||||||
|
return self._rs(
|
||||||
|
self.api.select(
|
||||||
|
select, self.tag, self._ns(namespaces, select), limit, flags,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
|
||||||
|
"""Perform a CSS selection operation on the current Tag.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.iselect()
|
||||||
|
method. It is the same as select(), but it returns a generator
|
||||||
|
instead of a list.
|
||||||
|
|
||||||
|
:param select: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param limit: After finding this number of results, stop looking.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.iselect() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.iselect() method.
|
||||||
|
|
||||||
|
:return: A generator
|
||||||
|
:rtype: types.GeneratorType
|
||||||
|
"""
|
||||||
|
return self.api.iselect(
|
||||||
|
select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def closest(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Find the Tag closest to this one that matches the given selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.closest()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param select: A string containing a CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.closest() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.closest() method.
|
||||||
|
|
||||||
|
:return: A Tag, or None if there is no match.
|
||||||
|
:rtype: bs4.Tag
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self.api.closest(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def match(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Check whether this Tag matches the given CSS selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. For more information, see
|
||||||
|
that library's documentation for the soupsieve.match()
|
||||||
|
method.
|
||||||
|
|
||||||
|
:param select: A CSS selector.
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.match() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.match() method.
|
||||||
|
|
||||||
|
:return: True if this Tag matches the selector; False otherwise.
|
||||||
|
:rtype: bool
|
||||||
|
"""
|
||||||
|
return self.api.match(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
def filter(self, select, namespaces=None, flags=0, **kwargs):
|
||||||
|
"""Filter this Tag's direct children based on the given CSS selector.
|
||||||
|
|
||||||
|
This uses the Soup Sieve library. It works the same way as
|
||||||
|
passing this Tag into that library's soupsieve.filter()
|
||||||
|
method. For more information, see the
|
||||||
|
documentation for soupsieve.filter().
|
||||||
|
|
||||||
|
:param namespaces: A dictionary mapping namespace prefixes
|
||||||
|
used in the CSS selector to namespace URIs. By default,
|
||||||
|
Beautiful Soup will pass in the prefixes it encountered while
|
||||||
|
parsing the document.
|
||||||
|
|
||||||
|
:param flags: Flags to be passed into Soup Sieve's
|
||||||
|
soupsieve.filter() method.
|
||||||
|
|
||||||
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
|
soupsieve.filter() method.
|
||||||
|
|
||||||
|
:return: A ResultSet of Tag objects.
|
||||||
|
:rtype: bs4.element.ResultSet
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._rs(
|
||||||
|
self.api.filter(
|
||||||
|
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||||
|
)
|
||||||
|
)
|
|
@ -59,21 +59,6 @@ def diagnose(data):
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
|
||||||
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
|
|
||||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
if os.path.exists(data):
|
|
||||||
print(('"%s" looks like a filename. Reading data from the file.' % data))
|
|
||||||
with open(data) as fp:
|
|
||||||
data = fp.read()
|
|
||||||
except ValueError:
|
|
||||||
# This can happen on some platforms when the 'filename' is
|
|
||||||
# too long. Assume it's data and not a filename.
|
|
||||||
pass
|
|
||||||
print("")
|
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print(("Trying to parse your markup with %s" % parser))
|
print(("Trying to parse your markup with %s" % parser))
|
||||||
|
|
|
@ -8,14 +8,8 @@ except ImportError as e:
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
try:
|
|
||||||
import soupsieve
|
|
||||||
except ImportError as e:
|
|
||||||
soupsieve = None
|
|
||||||
warnings.warn(
|
|
||||||
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
from bs4.css import CSS
|
||||||
from bs4.formatter import (
|
from bs4.formatter import (
|
||||||
Formatter,
|
Formatter,
|
||||||
HTMLFormatter,
|
HTMLFormatter,
|
||||||
|
@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
|
||||||
"string-escape",
|
"string-escape",
|
||||||
"string_escape",
|
"string_escape",
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
class NamespacedAttribute(str):
|
class NamespacedAttribute(str):
|
||||||
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
|
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
|
||||||
('xml') and the name ('lang') that were used to create it.
|
('xml') and the name ('lang') that were used to create it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __new__(cls, prefix, name=None, namespace=None):
|
def __new__(cls, prefix, name=None, namespace=None):
|
||||||
if not name:
|
if not name:
|
||||||
# This is the default namespace. Its name "has no value"
|
# This is the default namespace. Its name "has no value"
|
||||||
|
@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||||
return match.group(1) + encoding
|
return match.group(1) + encoding
|
||||||
return self.CHARSET_RE.sub(rewrite, self.original_value)
|
return self.CHARSET_RE.sub(rewrite, self.original_value)
|
||||||
|
|
||||||
|
|
||||||
class PageElement(object):
|
class PageElement(object):
|
||||||
"""Contains the navigational information for some part of the page:
|
"""Contains the navigational information for some part of the page:
|
||||||
that is, its current location in the parse tree.
|
that is, its current location in the parse tree.
|
||||||
|
|
||||||
NavigableString, Tag, etc. are all subclasses of PageElement.
|
NavigableString, Tag, etc. are all subclasses of PageElement.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# In general, we can't tell just by looking at an element whether
|
||||||
|
# it's contained in an XML document or an HTML document. But for
|
||||||
|
# Tags (q.v.) we can store this information at parse time.
|
||||||
|
known_xml = None
|
||||||
|
|
||||||
def setup(self, parent=None, previous_element=None, next_element=None,
|
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||||
previous_sibling=None, next_sibling=None):
|
previous_sibling=None, next_sibling=None):
|
||||||
"""Sets up the initial relations between this element and
|
"""Sets up the initial relations between this element and
|
||||||
|
@ -163,7 +162,7 @@ class PageElement(object):
|
||||||
|
|
||||||
:param previous_element: The element parsed immediately before
|
:param previous_element: The element parsed immediately before
|
||||||
this one.
|
this one.
|
||||||
|
|
||||||
:param next_element: The element parsed immediately after
|
:param next_element: The element parsed immediately after
|
||||||
this one.
|
this one.
|
||||||
|
|
||||||
|
@ -257,11 +256,11 @@ class PageElement(object):
|
||||||
default = object()
|
default = object()
|
||||||
def _all_strings(self, strip=False, types=default):
|
def _all_strings(self, strip=False, types=default):
|
||||||
"""Yield all strings of certain classes, possibly stripping them.
|
"""Yield all strings of certain classes, possibly stripping them.
|
||||||
|
|
||||||
This is implemented differently in Tag and NavigableString.
|
This is implemented differently in Tag and NavigableString.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def stripped_strings(self):
|
def stripped_strings(self):
|
||||||
"""Yield all strings in this PageElement, stripping them first.
|
"""Yield all strings in this PageElement, stripping them first.
|
||||||
|
@ -294,11 +293,11 @@ class PageElement(object):
|
||||||
strip, types=types)])
|
strip, types=types)])
|
||||||
getText = get_text
|
getText = get_text
|
||||||
text = property(get_text)
|
text = property(get_text)
|
||||||
|
|
||||||
def replace_with(self, *args):
|
def replace_with(self, *args):
|
||||||
"""Replace this PageElement with one or more PageElements, keeping the
|
"""Replace this PageElement with one or more PageElements, keeping the
|
||||||
rest of the tree the same.
|
rest of the tree the same.
|
||||||
|
|
||||||
:param args: One or more PageElements.
|
:param args: One or more PageElements.
|
||||||
:return: `self`, no longer part of the tree.
|
:return: `self`, no longer part of the tree.
|
||||||
"""
|
"""
|
||||||
|
@ -410,7 +409,7 @@ class PageElement(object):
|
||||||
This works the same way as `list.insert`.
|
This works the same way as `list.insert`.
|
||||||
|
|
||||||
:param position: The numeric position that should be occupied
|
:param position: The numeric position that should be occupied
|
||||||
in `self.children` by the new PageElement.
|
in `self.children` by the new PageElement.
|
||||||
:param new_child: A PageElement.
|
:param new_child: A PageElement.
|
||||||
"""
|
"""
|
||||||
if new_child is None:
|
if new_child is None:
|
||||||
|
@ -546,7 +545,7 @@ class PageElement(object):
|
||||||
"Element has no parent, so 'after' has no meaning.")
|
"Element has no parent, so 'after' has no meaning.")
|
||||||
if any(x is self for x in args):
|
if any(x is self for x in args):
|
||||||
raise ValueError("Can't insert an element after itself.")
|
raise ValueError("Can't insert an element after itself.")
|
||||||
|
|
||||||
offset = 0
|
offset = 0
|
||||||
for successor in args:
|
for successor in args:
|
||||||
# Extract first so that the index won't be screwed up if they
|
# Extract first so that the index won't be screwed up if they
|
||||||
|
@ -912,7 +911,7 @@ class PageElement(object):
|
||||||
:rtype: bool
|
:rtype: bool
|
||||||
"""
|
"""
|
||||||
return getattr(self, '_decomposed', False) or False
|
return getattr(self, '_decomposed', False) or False
|
||||||
|
|
||||||
# Old non-property versions of the generators, for backwards
|
# Old non-property versions of the generators, for backwards
|
||||||
# compatibility with BS3.
|
# compatibility with BS3.
|
||||||
def nextGenerator(self):
|
def nextGenerator(self):
|
||||||
|
@ -936,16 +935,11 @@ class NavigableString(str, PageElement):
|
||||||
|
|
||||||
When Beautiful Soup parses the markup <b>penguin</b>, it will
|
When Beautiful Soup parses the markup <b>penguin</b>, it will
|
||||||
create a NavigableString for the string "penguin".
|
create a NavigableString for the string "penguin".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PREFIX = ''
|
PREFIX = ''
|
||||||
SUFFIX = ''
|
SUFFIX = ''
|
||||||
|
|
||||||
# We can't tell just by looking at a string whether it's contained
|
|
||||||
# in an XML document or an HTML document.
|
|
||||||
|
|
||||||
known_xml = None
|
|
||||||
|
|
||||||
def __new__(cls, value):
|
def __new__(cls, value):
|
||||||
"""Create a new NavigableString.
|
"""Create a new NavigableString.
|
||||||
|
|
||||||
|
@ -961,12 +955,22 @@ class NavigableString(str, PageElement):
|
||||||
u.setup()
|
u.setup()
|
||||||
return u
|
return u
|
||||||
|
|
||||||
def __copy__(self):
|
def __deepcopy__(self, memo, recursive=False):
|
||||||
"""A copy of a NavigableString has the same contents and class
|
"""A copy of a NavigableString has the same contents and class
|
||||||
as the original, but it is not connected to the parse tree.
|
as the original, but it is not connected to the parse tree.
|
||||||
|
|
||||||
|
:param recursive: This parameter is ignored; it's only defined
|
||||||
|
so that NavigableString.__deepcopy__ implements the same
|
||||||
|
signature as Tag.__deepcopy__.
|
||||||
"""
|
"""
|
||||||
return type(self)(self)
|
return type(self)(self)
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
"""A copy of a NavigableString can only be a deep copy, because
|
||||||
|
only one PageElement can occupy a given place in a parse tree.
|
||||||
|
"""
|
||||||
|
return self.__deepcopy__({})
|
||||||
|
|
||||||
def __getnewargs__(self):
|
def __getnewargs__(self):
|
||||||
return (str(self),)
|
return (str(self),)
|
||||||
|
|
||||||
|
@ -1059,10 +1063,10 @@ class PreformattedString(NavigableString):
|
||||||
as comments (the Comment class) and CDATA blocks (the CData
|
as comments (the Comment class) and CDATA blocks (the CData
|
||||||
class).
|
class).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PREFIX = ''
|
PREFIX = ''
|
||||||
SUFFIX = ''
|
SUFFIX = ''
|
||||||
|
|
||||||
def output_ready(self, formatter=None):
|
def output_ready(self, formatter=None):
|
||||||
"""Make this string ready for output by adding any subclass-specific
|
"""Make this string ready for output by adding any subclass-specific
|
||||||
prefix or suffix.
|
prefix or suffix.
|
||||||
|
@ -1144,7 +1148,7 @@ class Stylesheet(NavigableString):
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Script(NavigableString):
|
class Script(NavigableString):
|
||||||
"""A NavigableString representing an executable script (probably
|
"""A NavigableString representing an executable script (probably
|
||||||
Javascript).
|
Javascript).
|
||||||
|
@ -1250,7 +1254,7 @@ class Tag(PageElement):
|
||||||
if ((not builder or builder.store_line_numbers)
|
if ((not builder or builder.store_line_numbers)
|
||||||
and (sourceline is not None or sourcepos is not None)):
|
and (sourceline is not None or sourcepos is not None)):
|
||||||
self.sourceline = sourceline
|
self.sourceline = sourceline
|
||||||
self.sourcepos = sourcepos
|
self.sourcepos = sourcepos
|
||||||
if attrs is None:
|
if attrs is None:
|
||||||
attrs = {}
|
attrs = {}
|
||||||
elif attrs:
|
elif attrs:
|
||||||
|
@ -1308,13 +1312,49 @@ class Tag(PageElement):
|
||||||
self.interesting_string_types = builder.string_containers[self.name]
|
self.interesting_string_types = builder.string_containers[self.name]
|
||||||
else:
|
else:
|
||||||
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
|
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
|
||||||
|
|
||||||
parserClass = _alias("parser_class") # BS3
|
parserClass = _alias("parser_class") # BS3
|
||||||
|
|
||||||
def __copy__(self):
|
def __deepcopy__(self, memo, recursive=True):
|
||||||
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
|
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
|
||||||
Its contents are a copy of the old Tag's contents.
|
Its contents are a copy of the old Tag's contents.
|
||||||
"""
|
"""
|
||||||
|
clone = self._clone()
|
||||||
|
|
||||||
|
if recursive:
|
||||||
|
# Clone this tag's descendants recursively, but without
|
||||||
|
# making any recursive function calls.
|
||||||
|
tag_stack = [clone]
|
||||||
|
for event, element in self._event_stream(self.descendants):
|
||||||
|
if event is Tag.END_ELEMENT_EVENT:
|
||||||
|
# Stop appending incoming Tags to the Tag that was
|
||||||
|
# just closed.
|
||||||
|
tag_stack.pop()
|
||||||
|
else:
|
||||||
|
descendant_clone = element.__deepcopy__(
|
||||||
|
memo, recursive=False
|
||||||
|
)
|
||||||
|
# Add to its parent's .contents
|
||||||
|
tag_stack[-1].append(descendant_clone)
|
||||||
|
|
||||||
|
if event is Tag.START_ELEMENT_EVENT:
|
||||||
|
# Add the Tag itself to the stack so that its
|
||||||
|
# children will be .appended to it.
|
||||||
|
tag_stack.append(descendant_clone)
|
||||||
|
return clone
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
"""A copy of a Tag must always be a deep copy, because a Tag's
|
||||||
|
children can only have one parent at a time.
|
||||||
|
"""
|
||||||
|
return self.__deepcopy__({})
|
||||||
|
|
||||||
|
def _clone(self):
|
||||||
|
"""Create a new Tag just like this one, but with no
|
||||||
|
contents and unattached to any parse tree.
|
||||||
|
|
||||||
|
This is the first step in the deepcopy process.
|
||||||
|
"""
|
||||||
clone = type(self)(
|
clone = type(self)(
|
||||||
None, self.builder, self.name, self.namespace,
|
None, self.builder, self.name, self.namespace,
|
||||||
self.prefix, self.attrs, is_xml=self._is_xml,
|
self.prefix, self.attrs, is_xml=self._is_xml,
|
||||||
|
@ -1326,8 +1366,6 @@ class Tag(PageElement):
|
||||||
)
|
)
|
||||||
for attr in ('can_be_empty_element', 'hidden'):
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
setattr(clone, attr, getattr(self, attr))
|
setattr(clone, attr, getattr(self, attr))
|
||||||
for child in self.contents:
|
|
||||||
clone.append(child.__copy__())
|
|
||||||
return clone
|
return clone
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -1433,7 +1471,7 @@ class Tag(PageElement):
|
||||||
i.contents = []
|
i.contents = []
|
||||||
i._decomposed = True
|
i._decomposed = True
|
||||||
i = n
|
i = n
|
||||||
|
|
||||||
def clear(self, decompose=False):
|
def clear(self, decompose=False):
|
||||||
"""Wipe out all children of this PageElement by calling extract()
|
"""Wipe out all children of this PageElement by calling extract()
|
||||||
on them.
|
on them.
|
||||||
|
@ -1521,7 +1559,7 @@ class Tag(PageElement):
|
||||||
if not isinstance(value, list):
|
if not isinstance(value, list):
|
||||||
value = [value]
|
value = [value]
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def has_attr(self, key):
|
def has_attr(self, key):
|
||||||
"""Does this PageElement have an attribute with the given name?"""
|
"""Does this PageElement have an attribute with the given name?"""
|
||||||
return key in self.attrs
|
return key in self.attrs
|
||||||
|
@ -1608,7 +1646,7 @@ class Tag(PageElement):
|
||||||
def __repr__(self, encoding="unicode-escape"):
|
def __repr__(self, encoding="unicode-escape"):
|
||||||
"""Renders this PageElement as a string.
|
"""Renders this PageElement as a string.
|
||||||
|
|
||||||
:param encoding: The encoding to use (Python 2 only).
|
:param encoding: The encoding to use (Python 2 only).
|
||||||
TODO: This is now ignored and a warning should be issued
|
TODO: This is now ignored and a warning should be issued
|
||||||
if a value is provided.
|
if a value is provided.
|
||||||
:return: A (Unicode) string.
|
:return: A (Unicode) string.
|
||||||
|
@ -1650,106 +1688,212 @@ class Tag(PageElement):
|
||||||
|
|
||||||
def decode(self, indent_level=None,
|
def decode(self, indent_level=None,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal",
|
||||||
"""Render a Unicode representation of this PageElement and its
|
iterator=None):
|
||||||
contents.
|
pieces = []
|
||||||
|
|
||||||
:param indent_level: Each line of the rendering will be
|
|
||||||
indented this many spaces. Used internally in
|
|
||||||
recursive calls while pretty-printing.
|
|
||||||
:param eventual_encoding: The tag is destined to be
|
|
||||||
encoded into this encoding. This method is _not_
|
|
||||||
responsible for performing that encoding. This information
|
|
||||||
is passed in so that it can be substituted in if the
|
|
||||||
document contains a <META> tag that mentions the document's
|
|
||||||
encoding.
|
|
||||||
:param formatter: A Formatter object, or a string naming one of
|
|
||||||
the standard formatters.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# First off, turn a non-Formatter `formatter` into a Formatter
|
# First off, turn a non-Formatter `formatter` into a Formatter
|
||||||
# object. This will stop the lookup from happening over and
|
# object. This will stop the lookup from happening over and
|
||||||
# over again.
|
# over again.
|
||||||
if not isinstance(formatter, Formatter):
|
if not isinstance(formatter, Formatter):
|
||||||
formatter = self.formatter_for_name(formatter)
|
formatter = self.formatter_for_name(formatter)
|
||||||
attributes = formatter.attributes(self)
|
|
||||||
attrs = []
|
if indent_level is True:
|
||||||
for key, val in attributes:
|
indent_level = 0
|
||||||
if val is None:
|
|
||||||
decoded = key
|
# The currently active tag that put us into string literal
|
||||||
|
# mode. Until this element is closed, children will be treated
|
||||||
|
# as string literals and not pretty-printed. String literal
|
||||||
|
# mode is turned on immediately after this tag begins, and
|
||||||
|
# turned off immediately before it's closed. This means there
|
||||||
|
# will be whitespace before and after the tag itself.
|
||||||
|
string_literal_tag = None
|
||||||
|
|
||||||
|
for event, element in self._event_stream(iterator):
|
||||||
|
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
|
||||||
|
piece = element._format_tag(
|
||||||
|
eventual_encoding, formatter, opening=True
|
||||||
|
)
|
||||||
|
elif event is Tag.END_ELEMENT_EVENT:
|
||||||
|
piece = element._format_tag(
|
||||||
|
eventual_encoding, formatter, opening=False
|
||||||
|
)
|
||||||
|
if indent_level is not None:
|
||||||
|
indent_level -= 1
|
||||||
else:
|
else:
|
||||||
if isinstance(val, list) or isinstance(val, tuple):
|
piece = element.output_ready(formatter)
|
||||||
val = ' '.join(val)
|
|
||||||
elif not isinstance(val, str):
|
|
||||||
val = str(val)
|
|
||||||
elif (
|
|
||||||
isinstance(val, AttributeValueWithCharsetSubstitution)
|
|
||||||
and eventual_encoding is not None
|
|
||||||
):
|
|
||||||
val = val.encode(eventual_encoding)
|
|
||||||
|
|
||||||
text = formatter.attribute_value(val)
|
# Now we need to apply the 'prettiness' -- extra
|
||||||
decoded = (
|
# whitespace before and/or after this tag. This can get
|
||||||
str(key) + '='
|
# complicated because certain tags, like <pre> and
|
||||||
+ formatter.quoted_attribute_value(text))
|
# <script>, can't be prettified, since adding whitespace would
|
||||||
attrs.append(decoded)
|
# change the meaning of the content.
|
||||||
close = ''
|
|
||||||
closeTag = ''
|
|
||||||
|
|
||||||
|
# The default behavior is to add whitespace before and
|
||||||
|
# after an element when string literal mode is off, and to
|
||||||
|
# leave things as they are when string literal mode is on.
|
||||||
|
if string_literal_tag:
|
||||||
|
indent_before = indent_after = False
|
||||||
|
else:
|
||||||
|
indent_before = indent_after = True
|
||||||
|
|
||||||
|
# The only time the behavior is more complex than that is
|
||||||
|
# when we encounter an opening or closing tag that might
|
||||||
|
# put us into or out of string literal mode.
|
||||||
|
if (event is Tag.START_ELEMENT_EVENT
|
||||||
|
and not string_literal_tag
|
||||||
|
and not element._should_pretty_print()):
|
||||||
|
# We are about to enter string literal mode. Add
|
||||||
|
# whitespace before this tag, but not after. We
|
||||||
|
# will stay in string literal mode until this tag
|
||||||
|
# is closed.
|
||||||
|
indent_before = True
|
||||||
|
indent_after = False
|
||||||
|
string_literal_tag = element
|
||||||
|
elif (event is Tag.END_ELEMENT_EVENT
|
||||||
|
and element is string_literal_tag):
|
||||||
|
# We are about to exit string literal mode by closing
|
||||||
|
# the tag that sent us into that mode. Add whitespace
|
||||||
|
# after this tag, but not before.
|
||||||
|
indent_before = False
|
||||||
|
indent_after = True
|
||||||
|
string_literal_tag = None
|
||||||
|
|
||||||
|
# Now we know whether to add whitespace before and/or
|
||||||
|
# after this element.
|
||||||
|
if indent_level is not None:
|
||||||
|
if (indent_before or indent_after):
|
||||||
|
if isinstance(element, NavigableString):
|
||||||
|
piece = piece.strip()
|
||||||
|
if piece:
|
||||||
|
piece = self._indent_string(
|
||||||
|
piece, indent_level, formatter,
|
||||||
|
indent_before, indent_after
|
||||||
|
)
|
||||||
|
if event == Tag.START_ELEMENT_EVENT:
|
||||||
|
indent_level += 1
|
||||||
|
pieces.append(piece)
|
||||||
|
return "".join(pieces)
|
||||||
|
|
||||||
|
# Names for the different events yielded by _event_stream
|
||||||
|
START_ELEMENT_EVENT = object()
|
||||||
|
END_ELEMENT_EVENT = object()
|
||||||
|
EMPTY_ELEMENT_EVENT = object()
|
||||||
|
STRING_ELEMENT_EVENT = object()
|
||||||
|
|
||||||
|
def _event_stream(self, iterator=None):
|
||||||
|
"""Yield a sequence of events that can be used to reconstruct the DOM
|
||||||
|
for this element.
|
||||||
|
|
||||||
|
This lets us recreate the nested structure of this element
|
||||||
|
(e.g. when formatting it as a string) without using recursive
|
||||||
|
method calls.
|
||||||
|
|
||||||
|
This is similar in concept to the SAX API, but it's a simpler
|
||||||
|
interface designed for internal use. The events are different
|
||||||
|
from SAX and the arguments associated with the events are Tags
|
||||||
|
and other Beautiful Soup objects.
|
||||||
|
|
||||||
|
:param iterator: An alternate iterator to use when traversing
|
||||||
|
the tree.
|
||||||
|
"""
|
||||||
|
tag_stack = []
|
||||||
|
|
||||||
|
iterator = iterator or self.self_and_descendants
|
||||||
|
|
||||||
|
for c in iterator:
|
||||||
|
# If the parent of the element we're about to yield is not
|
||||||
|
# the tag currently on the stack, it means that the tag on
|
||||||
|
# the stack closed before this element appeared.
|
||||||
|
while tag_stack and c.parent != tag_stack[-1]:
|
||||||
|
now_closed_tag = tag_stack.pop()
|
||||||
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
||||||
|
|
||||||
|
if isinstance(c, Tag):
|
||||||
|
if c.is_empty_element:
|
||||||
|
yield Tag.EMPTY_ELEMENT_EVENT, c
|
||||||
|
else:
|
||||||
|
yield Tag.START_ELEMENT_EVENT, c
|
||||||
|
tag_stack.append(c)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
yield Tag.STRING_ELEMENT_EVENT, c
|
||||||
|
|
||||||
|
while tag_stack:
|
||||||
|
now_closed_tag = tag_stack.pop()
|
||||||
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
||||||
|
|
||||||
|
def _indent_string(self, s, indent_level, formatter,
|
||||||
|
indent_before, indent_after):
|
||||||
|
"""Add indentation whitespace before and/or after a string.
|
||||||
|
|
||||||
|
:param s: The string to amend with whitespace.
|
||||||
|
:param indent_level: The indentation level; affects how much
|
||||||
|
whitespace goes before the string.
|
||||||
|
:param indent_before: Whether or not to add whitespace
|
||||||
|
before the string.
|
||||||
|
:param indent_after: Whether or not to add whitespace
|
||||||
|
(a newline) after the string.
|
||||||
|
"""
|
||||||
|
space_before = ''
|
||||||
|
if indent_before and indent_level:
|
||||||
|
space_before = (formatter.indent * indent_level)
|
||||||
|
|
||||||
|
space_after = ''
|
||||||
|
if indent_after:
|
||||||
|
space_after = "\n"
|
||||||
|
|
||||||
|
return space_before + s + space_after
|
||||||
|
|
||||||
|
def _format_tag(self, eventual_encoding, formatter, opening):
|
||||||
|
# A tag starts with the < character (see below).
|
||||||
|
|
||||||
|
# Then the / character, if this is a closing tag.
|
||||||
|
closing_slash = ''
|
||||||
|
if not opening:
|
||||||
|
closing_slash = '/'
|
||||||
|
|
||||||
|
# Then an optional namespace prefix.
|
||||||
prefix = ''
|
prefix = ''
|
||||||
if self.prefix:
|
if self.prefix:
|
||||||
prefix = self.prefix + ":"
|
prefix = self.prefix + ":"
|
||||||
|
|
||||||
if self.is_empty_element:
|
# Then a list of attribute values, if this is an opening tag.
|
||||||
close = formatter.void_element_close_prefix or ''
|
attribute_string = ''
|
||||||
else:
|
if opening:
|
||||||
closeTag = '</%s%s>' % (prefix, self.name)
|
attributes = formatter.attributes(self)
|
||||||
|
attrs = []
|
||||||
|
for key, val in attributes:
|
||||||
|
if val is None:
|
||||||
|
decoded = key
|
||||||
|
else:
|
||||||
|
if isinstance(val, list) or isinstance(val, tuple):
|
||||||
|
val = ' '.join(val)
|
||||||
|
elif not isinstance(val, str):
|
||||||
|
val = str(val)
|
||||||
|
elif (
|
||||||
|
isinstance(val, AttributeValueWithCharsetSubstitution)
|
||||||
|
and eventual_encoding is not None
|
||||||
|
):
|
||||||
|
val = val.encode(eventual_encoding)
|
||||||
|
|
||||||
pretty_print = self._should_pretty_print(indent_level)
|
text = formatter.attribute_value(val)
|
||||||
space = ''
|
decoded = (
|
||||||
indent_space = ''
|
str(key) + '='
|
||||||
if indent_level is not None:
|
+ formatter.quoted_attribute_value(text))
|
||||||
indent_space = (formatter.indent * (indent_level - 1))
|
attrs.append(decoded)
|
||||||
if pretty_print:
|
|
||||||
space = indent_space
|
|
||||||
indent_contents = indent_level + 1
|
|
||||||
else:
|
|
||||||
indent_contents = None
|
|
||||||
contents = self.decode_contents(
|
|
||||||
indent_contents, eventual_encoding, formatter
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.hidden:
|
|
||||||
# This is the 'document root' object.
|
|
||||||
s = contents
|
|
||||||
else:
|
|
||||||
s = []
|
|
||||||
attribute_string = ''
|
|
||||||
if attrs:
|
if attrs:
|
||||||
attribute_string = ' ' + ' '.join(attrs)
|
attribute_string = ' ' + ' '.join(attrs)
|
||||||
if indent_level is not None:
|
|
||||||
# Even if this particular tag is not pretty-printed,
|
|
||||||
# we should indent up to the start of the tag.
|
|
||||||
s.append(indent_space)
|
|
||||||
s.append('<%s%s%s%s>' % (
|
|
||||||
prefix, self.name, attribute_string, close))
|
|
||||||
if pretty_print:
|
|
||||||
s.append("\n")
|
|
||||||
s.append(contents)
|
|
||||||
if pretty_print and contents and contents[-1] != "\n":
|
|
||||||
s.append("\n")
|
|
||||||
if pretty_print and closeTag:
|
|
||||||
s.append(space)
|
|
||||||
s.append(closeTag)
|
|
||||||
if indent_level is not None and closeTag and self.next_sibling:
|
|
||||||
# Even if this particular tag is not pretty-printed,
|
|
||||||
# we're now done with the tag, and we should add a
|
|
||||||
# newline if appropriate.
|
|
||||||
s.append("\n")
|
|
||||||
s = ''.join(s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
def _should_pretty_print(self, indent_level):
|
# Then an optional closing slash (for a void element in an
|
||||||
|
# XML document).
|
||||||
|
void_element_closing_slash = ''
|
||||||
|
if self.is_empty_element:
|
||||||
|
void_element_closing_slash = formatter.void_element_close_prefix or ''
|
||||||
|
|
||||||
|
# Put it all together.
|
||||||
|
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
|
||||||
|
|
||||||
|
def _should_pretty_print(self, indent_level=1):
|
||||||
"""Should this tag be pretty-printed?
|
"""Should this tag be pretty-printed?
|
||||||
|
|
||||||
Most of them should, but some (such as <pre> in HTML
|
Most of them should, but some (such as <pre> in HTML
|
||||||
|
@ -1770,7 +1914,7 @@ class Tag(PageElement):
|
||||||
a Unicode string will be returned.
|
a Unicode string will be returned.
|
||||||
:param formatter: A Formatter object, or a string naming one of
|
:param formatter: A Formatter object, or a string naming one of
|
||||||
the standard formatters.
|
the standard formatters.
|
||||||
:return: A Unicode string (if encoding==None) or a bytestring
|
:return: A Unicode string (if encoding==None) or a bytestring
|
||||||
(otherwise).
|
(otherwise).
|
||||||
"""
|
"""
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
|
@ -1800,33 +1944,9 @@ class Tag(PageElement):
|
||||||
the standard Formatters.
|
the standard Formatters.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# First off, turn a string formatter into a Formatter object. This
|
return self.decode(indent_level, eventual_encoding, formatter,
|
||||||
# will stop the lookup from happening over and over again.
|
iterator=self.descendants)
|
||||||
if not isinstance(formatter, Formatter):
|
|
||||||
formatter = self.formatter_for_name(formatter)
|
|
||||||
|
|
||||||
pretty_print = (indent_level is not None)
|
|
||||||
s = []
|
|
||||||
for c in self:
|
|
||||||
text = None
|
|
||||||
if isinstance(c, NavigableString):
|
|
||||||
text = c.output_ready(formatter)
|
|
||||||
elif isinstance(c, Tag):
|
|
||||||
s.append(c.decode(indent_level, eventual_encoding,
|
|
||||||
formatter))
|
|
||||||
preserve_whitespace = (
|
|
||||||
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
|
|
||||||
)
|
|
||||||
if text and indent_level and not preserve_whitespace:
|
|
||||||
text = text.strip()
|
|
||||||
if text:
|
|
||||||
if pretty_print and not preserve_whitespace:
|
|
||||||
s.append(formatter.indent * (indent_level - 1))
|
|
||||||
s.append(text)
|
|
||||||
if pretty_print and not preserve_whitespace:
|
|
||||||
s.append("\n")
|
|
||||||
return ''.join(s)
|
|
||||||
|
|
||||||
def encode_contents(
|
def encode_contents(
|
||||||
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal"):
|
||||||
|
@ -1922,6 +2042,18 @@ class Tag(PageElement):
|
||||||
# return iter() to make the purpose of the method clear
|
# return iter() to make the purpose of the method clear
|
||||||
return iter(self.contents) # XXX This seems to be untested.
|
return iter(self.contents) # XXX This seems to be untested.
|
||||||
|
|
||||||
|
@property
|
||||||
|
def self_and_descendants(self):
|
||||||
|
"""Iterate over this PageElement and its children in a
|
||||||
|
breadth-first sequence.
|
||||||
|
|
||||||
|
:yield: A sequence of PageElements.
|
||||||
|
"""
|
||||||
|
if not self.hidden:
|
||||||
|
yield self
|
||||||
|
for i in self.descendants:
|
||||||
|
yield i
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def descendants(self):
|
def descendants(self):
|
||||||
"""Iterate over all children of this PageElement in a
|
"""Iterate over all children of this PageElement in a
|
||||||
|
@ -1948,16 +2080,13 @@ class Tag(PageElement):
|
||||||
Beautiful Soup will use the prefixes it encountered while
|
Beautiful Soup will use the prefixes it encountered while
|
||||||
parsing the document.
|
parsing the document.
|
||||||
|
|
||||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||||
soupsieve.select() method.
|
soupsieve.select() method.
|
||||||
|
|
||||||
:return: A Tag.
|
:return: A Tag.
|
||||||
:rtype: bs4.element.Tag
|
:rtype: bs4.element.Tag
|
||||||
"""
|
"""
|
||||||
value = self.select(selector, namespaces, 1, **kwargs)
|
return self.css.select_one(selector, namespaces, **kwargs)
|
||||||
if value:
|
|
||||||
return value[0]
|
|
||||||
return None
|
|
||||||
|
|
||||||
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
||||||
"""Perform a CSS selection operation on the current element.
|
"""Perform a CSS selection operation on the current element.
|
||||||
|
@ -1973,27 +2102,18 @@ class Tag(PageElement):
|
||||||
|
|
||||||
:param limit: After finding this number of results, stop looking.
|
:param limit: After finding this number of results, stop looking.
|
||||||
|
|
||||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||||
soupsieve.select() method.
|
soupsieve.select() method.
|
||||||
|
|
||||||
:return: A ResultSet of Tags.
|
:return: A ResultSet of Tags.
|
||||||
:rtype: bs4.element.ResultSet
|
:rtype: bs4.element.ResultSet
|
||||||
"""
|
"""
|
||||||
if namespaces is None:
|
return self.css.select(selector, namespaces, limit, **kwargs)
|
||||||
namespaces = self._namespaces
|
|
||||||
|
|
||||||
if limit is None:
|
|
||||||
limit = 0
|
|
||||||
if soupsieve is None:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
|
||||||
)
|
|
||||||
|
|
||||||
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
|
|
||||||
|
|
||||||
# We do this because it's more consistent and because
|
@property
|
||||||
# ResultSet.__getattr__ has a helpful error message.
|
def css(self):
|
||||||
return ResultSet(None, results)
|
"""Return an interface to the CSS selector API."""
|
||||||
|
return CSS(self)
|
||||||
|
|
||||||
# Old names for backwards compatibility
|
# Old names for backwards compatibility
|
||||||
def childGenerator(self):
|
def childGenerator(self):
|
||||||
|
@ -2038,7 +2158,7 @@ class SoupStrainer(object):
|
||||||
:param attrs: A dictionary of filters on attribute values.
|
:param attrs: A dictionary of filters on attribute values.
|
||||||
:param string: A filter for a NavigableString with specific text.
|
:param string: A filter for a NavigableString with specific text.
|
||||||
:kwargs: A dictionary of filters on attribute values.
|
:kwargs: A dictionary of filters on attribute values.
|
||||||
"""
|
"""
|
||||||
if string is None and 'text' in kwargs:
|
if string is None and 'text' in kwargs:
|
||||||
string = kwargs.pop('text')
|
string = kwargs.pop('text')
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
|
@ -2137,7 +2257,7 @@ class SoupStrainer(object):
|
||||||
# looking at a tag with a different name.
|
# looking at a tag with a different name.
|
||||||
if markup and not markup.prefix and self.name != markup.name:
|
if markup and not markup.prefix and self.name != markup.name:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
call_function_with_tag_data = (
|
call_function_with_tag_data = (
|
||||||
isinstance(self.name, Callable)
|
isinstance(self.name, Callable)
|
||||||
and not isinstance(markup_name, Tag))
|
and not isinstance(markup_name, Tag))
|
||||||
|
@ -2223,7 +2343,7 @@ class SoupStrainer(object):
|
||||||
if self._matches(' '.join(markup), match_against):
|
if self._matches(' '.join(markup), match_against):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if match_against is True:
|
if match_against is True:
|
||||||
# True matches any non-None value.
|
# True matches any non-None value.
|
||||||
return markup is not None
|
return markup is not None
|
||||||
|
@ -2267,11 +2387,11 @@ class SoupStrainer(object):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Beyond this point we might need to run the test twice: once against
|
# Beyond this point we might need to run the test twice: once against
|
||||||
# the tag's name and once against its prefixed name.
|
# the tag's name and once against its prefixed name.
|
||||||
match = False
|
match = False
|
||||||
|
|
||||||
if not match and isinstance(match_against, str):
|
if not match and isinstance(match_against, str):
|
||||||
# Exact string match
|
# Exact string match
|
||||||
match = markup == match_against
|
match = markup == match_against
|
||||||
|
|
|
@ -97,7 +97,7 @@ class Formatter(EntitySubstitution):
|
||||||
else:
|
else:
|
||||||
indent = ' '
|
indent = ' '
|
||||||
self.indent = indent
|
self.indent = indent
|
||||||
|
|
||||||
def substitute(self, ns):
|
def substitute(self, ns):
|
||||||
"""Process a string that needs to undergo entity substitution.
|
"""Process a string that needs to undergo entity substitution.
|
||||||
This may be a string encountered in an attribute value or as
|
This may be a string encountered in an attribute value or as
|
||||||
|
|
|
@ -297,37 +297,11 @@ class TreeBuilderSmokeTest(object):
|
||||||
markup, multi_valued_attributes=multi_valued_attributes
|
markup, multi_valued_attributes=multi_valued_attributes
|
||||||
)
|
)
|
||||||
assert soup.a['class'] == ['a', 'b', 'c']
|
assert soup.a['class'] == ['a', 'b', 'c']
|
||||||
|
|
||||||
def test_fuzzed_input(self):
|
|
||||||
# This test centralizes in one place the various fuzz tests
|
|
||||||
# for Beautiful Soup created by the oss-fuzz project.
|
|
||||||
|
|
||||||
# These strings superficially resemble markup, but they
|
|
||||||
# generally can't be parsed into anything. The best we can
|
|
||||||
# hope for is that parsing these strings won't crash the
|
|
||||||
# parser.
|
|
||||||
#
|
|
||||||
# n.b. This markup is commented out because these fuzz tests
|
|
||||||
# _do_ crash the parser. However the crashes are due to bugs
|
|
||||||
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
|
|
||||||
# bugs!
|
|
||||||
|
|
||||||
bad_markup = [
|
|
||||||
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
|
|
||||||
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
|
|
||||||
# https://bugs.python.org/issue37747
|
|
||||||
#
|
|
||||||
#b'\n<![\xff\xfe\xfe\xcd\x00',
|
|
||||||
|
|
||||||
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
|
def test_invalid_doctype(self):
|
||||||
# https://bugs.python.org/issue34480
|
markup = '<![if word]>content<![endif]>'
|
||||||
#
|
markup = '<!DOCTYPE html]ff>'
|
||||||
#b'<![n\x00'
|
soup = self.soup(markup)
|
||||||
]
|
|
||||||
for markup in bad_markup:
|
|
||||||
with warnings.catch_warnings(record=False):
|
|
||||||
soup = self.soup(markup)
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
|
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
|
||||||
|
|
||||||
|
@ -577,8 +551,8 @@ Hello, world!
|
||||||
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||||
even if that would mean not prettifying the markup.
|
even if that would mean not prettifying the markup.
|
||||||
"""
|
"""
|
||||||
pre_markup = "<pre> </pre>"
|
pre_markup = "<pre>a z</pre>\n"
|
||||||
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
textarea_markup = "<textarea> woo\nwoo </textarea>\n"
|
||||||
self.assert_soup(pre_markup)
|
self.assert_soup(pre_markup)
|
||||||
self.assert_soup(textarea_markup)
|
self.assert_soup(textarea_markup)
|
||||||
|
|
||||||
|
@ -589,7 +563,7 @@ Hello, world!
|
||||||
assert soup.textarea.prettify() == textarea_markup
|
assert soup.textarea.prettify() == textarea_markup
|
||||||
|
|
||||||
soup = self.soup("<textarea></textarea>")
|
soup = self.soup("<textarea></textarea>")
|
||||||
assert soup.textarea.prettify() == "<textarea></textarea>"
|
assert soup.textarea.prettify() == "<textarea></textarea>\n"
|
||||||
|
|
||||||
def test_nested_inline_elements(self):
|
def test_nested_inline_elements(self):
|
||||||
"""Inline elements can be nested indefinitely."""
|
"""Inline elements can be nested indefinitely."""
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
˙<!DOCTyPEV PUBLIC'''Đ'
|
|
@ -0,0 +1 @@
|
||||||
|
)<a><math><TR><a><mI><a><p><a>
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
||||||
|
|
||||||
|
<![
|
|
@ -0,0 +1 @@
|
||||||
|
-<math><sElect><mi><sElect><sElect>
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
||||||
|
ñ<table><svg><html>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
487
lib/bs4/tests/test_css.py
Normal file
487
lib/bs4/tests/test_css.py
Normal file
|
@ -0,0 +1,487 @@
|
||||||
|
import pytest
|
||||||
|
import types
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from bs4 import (
|
||||||
|
CSS,
|
||||||
|
BeautifulSoup,
|
||||||
|
ResultSet,
|
||||||
|
)
|
||||||
|
|
||||||
|
from . import (
|
||||||
|
SoupTest,
|
||||||
|
SOUP_SIEVE_PRESENT,
|
||||||
|
)
|
||||||
|
|
||||||
|
if SOUP_SIEVE_PRESENT:
|
||||||
|
from soupsieve import SelectorSyntaxError
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
    """Test basic CSS selector functionality.

    This functionality is implemented in soupsieve, which has a much
    more comprehensive test suite, so this is basically an extra check
    that soupsieve works as expected.
    """

    # Shared fixture document. Every element the tests select by id
    # carries an ``id`` attribute so results can be compared as id lists.
    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setup_method(self):
        """Parse the fixture document with html.parser before each test.

        Note that this stores the parsed document on ``self.soup``.
        """
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assert_selects(self, selector, expected_ids, **kwargs):
        """Assert that `selector` matches exactly the given element ids.

        :param selector: A CSS selector passed to ``self.soup.select``.
        :param expected_ids: The ``id`` values expected in the result;
            order is ignored. The argument itself is not modified.
        :param kwargs: Extra keyword arguments passed through to
            ``select`` (e.g. ``limit``).
        """
        results = self.soup.select(selector, **kwargs)
        assert isinstance(results, ResultSet)
        # Compare sorted copies so the caller's list is left untouched.
        el_ids = sorted(el['id'] for el in results)
        wanted_ids = sorted(expected_ids)
        assert wanted_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
            selector, ', '.join(wanted_ids), ', '.join(el_ids)
        )

    assertSelect = assert_selects

    def assert_select_multiple(self, *tests):
        """Run assert_selects for each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assert_selects(selector, expected_ids)

    def test_precompiled(self):
        """A selector compiled via ``css.compile`` works with select()
        and select_one()."""
        sel = self.soup.css.compile('div')

        els = self.soup.select(sel)
        assert len(els) == 4
        for div in els:
            assert div.name == 'div'

        el = self.soup.select_one(sel)
        assert 'main' == el['id']

    def test_one_tag_one(self):
        els = self.soup.select('title')
        assert len(els) == 1
        assert els[0].name == 'title'
        assert els[0].contents == ['The title']

    def test_one_tag_many(self):
        els = self.soup.select('div')
        assert len(els) == 4
        for div in els:
            assert div.name == 'div'

        el = self.soup.select_one('div')
        assert 'main' == el['id']

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        assert match is None

    def test_tag_in_tag_one(self):
        self.assert_selects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_limit(self):
        """The ``limit`` keyword caps the number of results returned."""
        self.assert_selects('html div', ['main'], limit=1)
        self.assert_selects('html body div', ['inner', 'main'], limit=2)
        self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
                            limit=10)

    def test_tag_no_match(self):
        assert len(self.soup.select('del')) == 0

    def test_invalid_tag(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('tag%t')

    def test_select_dashed_tag_ids(self):
        self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
        assert dashed[0].name == 'custom-dashed-tag'
        assert dashed[0]['id'] == 'dash2'

    def test_dashed_tag_text(self):
        assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'

    def test_select_dashed_matches_find_all(self):
        assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')

    def test_header_tags(self):
        self.assert_select_multiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            assert len(els) == 1
            assert els[0].name == 'p'
            assert els[0]['class'] == ['onep']

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        assert len(els) == 0

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assert_selects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        assert len(els) == 0

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        assert len(els) == 3
        for el in els:
            assert el.name == 'p'
        assert els[1]['class'] == ['onep']
        assert not els[0].has_attr('class')

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            assert len(self.soup.select(selector)) == 0

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
                         '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assert_selects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assert_selects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
        self.assert_selects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assert_select_multiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
            ('[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assert_select_multiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assert_select_multiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        self.assert_select_multiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assert_select_multiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
        self.assert_select_multiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assert_select_multiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_quoted_space_in_selector_name(self):
        html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
        soup = BeautifulSoup(html, 'html.parser')
        [chosen] = soup.select('div[style="display: right"]')
        assert "yes" == chosen.string

    def test_unsupported_pseudoclass(self):
        with pytest.raises(NotImplementedError):
            self.soup.select("a:no-such-pseudoclass")

        with pytest.raises(SelectorSyntaxError):
            self.soup.select("a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        assert len(els) == 1
        assert els[0].string == 'Another'

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        assert len(els) == 0

        # Zero will select no tags.
        els = self.soup.select('div p:nth-of-type(0)')
        assert len(els) == 0

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

    def test_id_child_selector_nth_of_type(self):
        self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        # NOTE(review): assert_selects_ids is not defined in this class;
        # presumably it is inherited from SoupTest -- confirm.
        self.assert_selects_ids(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assert_selects(".fancy #inner", ['inner'])
        self.assert_selects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assert_selects('#p1 + h2', ['header2'])
        self.assert_selects('#p1 + h2 + p', ['pmulti'])
        self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
        assert [] == self.soup.select('#p1 + p')

    def test_general_sibling_selector(self):
        self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
        self.assert_selects('#p1 ~ #header2', ['header2'])
        self.assert_selects('#p1 ~ h2 + a', ['me'])
        self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
        assert [] == self.soup.select('#inner ~ h2')

    def test_dangling_combinator(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assert_selects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assert_selects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        # NOTE(review): this selector reads the same as the one in
        # test_multiple_select; extra spaces after the comma may have
        # been lost -- confirm against the upstream test.
        self.assert_selects('x, y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assert_selects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assert_selects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select(',x, y')
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('x,,y')

    def test_multiple_select_attrs(self):
        self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])

    def test_select_duplicate_elements(self):
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, 'html.parser')
        selected = soup.select(".c1, .c2")
        assert 3 == len(selected)

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=['c1', 'c2']):
            assert element in selected

    def test_closest(self):
        inner = self.soup.find("div", id="inner")
        closest = inner.css.closest("div[id=main]")
        assert closest == self.soup.find("div", id="main")

    def test_match(self):
        inner = self.soup.find("div", id="inner")
        main = self.soup.find("div", id="main")
        assert inner.css.match("div[id=main]") == False
        assert main.css.match("div[id=main]") == True

    def test_iselect(self):
        gen = self.soup.css.iselect("h2")
        assert isinstance(gen, types.GeneratorType)
        [header2, header3] = gen
        assert header2['id'] == 'header2'
        assert header3['id'] == 'header3'

    def test_filter(self):
        inner = self.soup.find("div", id="inner")
        results = inner.css.filter("h2")
        assert len(results) == 2

        results = inner.css.filter("h2[id=header3]")
        assert isinstance(results, ResultSet)
        [result] = results
        assert result['id'] == 'header3'

    def test_escape(self):
        m = self.soup.css.escape
        assert m(".foo#bar") == '\\.foo\\#bar'
        assert m("()[]{}") == '\\(\\)\\[\\]\\{\\}'
        assert m(".foo") == self.soup.css.escape(".foo")
|
|
@ -80,20 +80,20 @@ class TestFormatter(SoupTest):
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"indent,expect",
|
"indent,expect",
|
||||||
[
|
[
|
||||||
(None, '<a>\n<b>\ntext\n</b>\n</a>'),
|
(None, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||||
(-1, '<a>\n<b>\ntext\n</b>\n</a>'),
|
(-1, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||||
(0, '<a>\n<b>\ntext\n</b>\n</a>'),
|
(0, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||||
("", '<a>\n<b>\ntext\n</b>\n</a>'),
|
("", '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||||
|
|
||||||
(1, '<a>\n <b>\n text\n </b>\n</a>'),
|
(1, '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||||
(2, '<a>\n <b>\n text\n </b>\n</a>'),
|
(2, '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||||
|
|
||||||
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
|
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'),
|
||||||
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
|
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'),
|
||||||
|
|
||||||
# Some invalid inputs -- the default behavior is used.
|
# Some invalid inputs -- the default behavior is used.
|
||||||
(object(), '<a>\n <b>\n text\n </b>\n</a>'),
|
(object(), '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||||
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>'),
|
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_indent(self, indent, expect):
|
def test_indent(self, indent, expect):
|
||||||
|
|
91
lib/bs4/tests/test_fuzz.py
Normal file
91
lib/bs4/tests/test_fuzz.py
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
"""This file contains test cases reported by third parties using
|
||||||
|
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
|
||||||
|
represent real problems with Beautiful Soup, but many are problems in
|
||||||
|
libraries that Beautiful Soup depends on, and many of the test cases
|
||||||
|
represent different ways of triggering the same problem.
|
||||||
|
|
||||||
|
Grouping these test cases together makes it easy to see which test
|
||||||
|
cases represent the same problem, and puts the test cases in close
|
||||||
|
proximity to code that can trigger the problems.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from bs4 import (
|
||||||
|
BeautifulSoup,
|
||||||
|
ParserRejectedMarkup,
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestFuzz(object):
    """Regression tests built from fuzzer-reported markup files.

    Each test loads a ``.testcase`` file from the ``fuzz`` directory
    next to this module and feeds it to BeautifulSoup.
    """

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
        ]
    )
    def test_rejected_markup(self, filename):
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, 'html.parser')

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
        ]
    )
    def test_deeply_nested_document(self, filename):
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, 'html.parser').encode()

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    @pytest.mark.skip("html5lib problems")
    @pytest.mark.parametrize(
        "filename", [
            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",

            # b')<a><math><TR><a><mI><a><p><a>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",

            # b'-<math><sElect><mi><sElect><sElect>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",

            # b'ñ<table><svg><html>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",

            # <TABLE>, some ^@ characters, some <math> tags.
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",

            # Nested table
            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
        ]
    )
    def test_html5lib_parse_errors(self, filename):
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, 'html5lib').encode())

    def __markup(self, filename):
        """Return the raw bytes of the named fuzz test case.

        :param filename: Name of a file in the ``fuzz`` directory next
            to this module; TESTCASE_SUFFIX is appended if missing.
        :return: The file's contents as a bytestring.
        """
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
        this_dir = os.path.split(__file__)[0]
        path = os.path.join(this_dir, 'fuzz', filename)
        # Use a context manager so the file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(path, 'rb') as testcase:
            return testcase.read()
|
|
@ -3,9 +3,11 @@ trees."""
|
||||||
|
|
||||||
from pdb import set_trace
|
from pdb import set_trace
|
||||||
import pickle
|
import pickle
|
||||||
|
import pytest
|
||||||
import warnings
|
import warnings
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
HTMLParserTreeBuilder,
|
HTMLParserTreeBuilder,
|
||||||
|
ParserRejectedMarkup,
|
||||||
XMLParsedAsHTMLWarning,
|
XMLParsedAsHTMLWarning,
|
||||||
)
|
)
|
||||||
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||||
|
@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
default_builder = HTMLParserTreeBuilder
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
def test_rejected_input(self):
    """Markup that html.parser rejects raises ParserRejectedMarkup."""
    # Python's html.parser will occasionally reject markup,
    # especially when there is a problem with the initial DOCTYPE
    # declaration. Different versions of Python sound the alarm in
    # different ways, but Beautiful Soup consistently raises
    # errors as ParserRejectedMarkup exceptions.
    bad_markup = [
        # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
        # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
        # https://github.com/python/cpython/issues/81928
        b'\n<![\xff\xfe\xfe\xcd\x00',

        # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
        # https://github.com/python/cpython/issues/78661
        #
        b'<![n\x00',
        b"<![UNKNOWN[]]>",
    ]
    for markup in bad_markup:
        with pytest.raises(ParserRejectedMarkup):
            # Only the exception matters; the parse result is unused.
            self.soup(markup)
|
||||||
|
|
||||||
def test_namespaced_system_doctype(self):
|
def test_namespaced_system_doctype(self):
|
||||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
|
||||||
assert soup.find('prefix:tag3').name == 'tag3'
|
assert soup.find('prefix:tag3').name == 'tag3'
|
||||||
assert soup.subtag.find('prefix:tag3').name == 'tag3'
|
assert soup.subtag.find('prefix:tag3').name == 'tag3'
|
||||||
|
|
||||||
def test_pickle_removes_builder(self):
|
def test_pickle_restores_builder(self):
|
||||||
# The lxml TreeBuilder is not picklable, so it won't be
|
# The lxml TreeBuilder is not picklable, so when unpickling
|
||||||
# preserved in a pickle/unpickle operation.
|
# a document created with it, a new TreeBuilder of the
|
||||||
|
# appropriate class is created.
|
||||||
soup = self.soup("<a>some markup</a>")
|
soup = self.soup("<a>some markup</a>")
|
||||||
assert isinstance(soup.builder, self.default_builder)
|
assert isinstance(soup.builder, self.default_builder)
|
||||||
pickled = pickle.dumps(soup)
|
pickled = pickle.dumps(soup)
|
||||||
unpickled = pickle.loads(pickled)
|
unpickled = pickle.loads(pickled)
|
||||||
|
|
||||||
assert "some markup" == unpickled.a.string
|
assert "some markup" == unpickled.a.string
|
||||||
assert unpickled.builder is None
|
assert unpickled.builder != soup.builder
|
||||||
|
assert isinstance(unpickled.builder, self.default_builder)
|
||||||
|
|
|
@ -2,20 +2,18 @@
|
||||||
import copy
|
import copy
|
||||||
import pickle
|
import pickle
|
||||||
import pytest
|
import pytest
|
||||||
|
import sys
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
|
ResultSet,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
)
|
)
|
||||||
from . import (
|
from . import (
|
||||||
SoupTest,
|
SoupTest,
|
||||||
SOUP_SIEVE_PRESENT,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if SOUP_SIEVE_PRESENT:
|
|
||||||
from soupsieve import SelectorSyntaxError
|
|
||||||
|
|
||||||
class TestEncoding(SoupTest):
|
class TestEncoding(SoupTest):
|
||||||
"""Test the ability to encode objects into strings."""
|
"""Test the ability to encode objects into strings."""
|
||||||
|
|
||||||
|
@ -51,10 +49,21 @@ class TestEncoding(SoupTest):
|
||||||
assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(
|
assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(
|
||||||
encoding="utf8"
|
encoding="utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_encode_deeply_nested_document(self):
    """Encoding a document nested deeper than the recursion limit works."""
    # This test verifies that encoding a string doesn't involve
    # any recursive function calls. If it did, this test would
    # overflow the Python interpreter stack.
    depth = sys.getrecursionlimit() + 1
    document = self.soup("<span>" * depth)
    output = document.encode()
    # Every opening tag must survive the round trip.
    assert depth == output.count(b"<span>")
|
||||||
|
|
||||||
def test_deprecated_renderContents(self):
|
def test_deprecated_renderContents(self):
|
||||||
html = "<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
|
soup.renderContents()
|
||||||
assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
|
assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
|
||||||
|
|
||||||
def test_repr(self):
|
def test_repr(self):
|
||||||
|
@ -159,7 +168,31 @@ class TestFormatters(SoupTest):
|
||||||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
|
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
|
||||||
# Everything outside the <pre> tag is reformatted, but everything
|
# Everything outside the <pre> tag is reformatted, but everything
|
||||||
# inside is left alone.
|
# inside is left alone.
|
||||||
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
|
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
|
||||||
|
|
||||||
|
def test_prettify_handles_nested_string_literal_tags(self):
|
||||||
|
# Most of this markup is inside a <pre> tag, so prettify()
|
||||||
|
# only does three things to it:
|
||||||
|
# 1. Add a newline and a space between the <div> and the <pre>
|
||||||
|
# 2. Add a newline after the </pre>
|
||||||
|
# 3. Add a newline at the end.
|
||||||
|
#
|
||||||
|
# The contents of the <pre> tag are left completely alone. In
|
||||||
|
# particular, we don't start adding whitespace again once we
|
||||||
|
# encounter the first </pre> tag, because we know it's not
|
||||||
|
# the one that put us into string literal mode.
|
||||||
|
markup = """<div><pre><code>some
|
||||||
|
<script><pre>code</pre></script> for you
|
||||||
|
</code></pre></div>"""
|
||||||
|
|
||||||
|
expect = """<div>
|
||||||
|
<pre><code>some
|
||||||
|
<script><pre>code</pre></script> for you
|
||||||
|
</code></pre>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
assert expect == soup.div.prettify()
|
||||||
|
|
||||||
def test_prettify_accepts_formatter_function(self):
|
def test_prettify_accepts_formatter_function(self):
|
||||||
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
||||||
|
@ -216,429 +249,6 @@ class TestFormatters(SoupTest):
|
||||||
assert soup.contents[0].name == 'pre'
|
assert soup.contents[0].name == 'pre'
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
|
|
||||||
class TestCSSSelectors(SoupTest):
|
|
||||||
"""Test basic CSS selector functionality.
|
|
||||||
|
|
||||||
This functionality is implemented in soupsieve, which has a much
|
|
||||||
more comprehensive test suite, so this is basically an extra check
|
|
||||||
that soupsieve works as expected.
|
|
||||||
"""
|
|
||||||
|
|
||||||
HTML = """
|
|
||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
|
||||||
"http://www.w3.org/TR/html4/strict.dtd">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>The title</title>
|
|
||||||
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
|
|
||||||
<div id="main" class="fancy">
|
|
||||||
<div id="inner">
|
|
||||||
<h1 id="header1">An H1</h1>
|
|
||||||
<p>Some text</p>
|
|
||||||
<p class="onep" id="p1">Some more text</p>
|
|
||||||
<h2 id="header2">An H2</h2>
|
|
||||||
<p class="class1 class2 class3" id="pmulti">Another</p>
|
|
||||||
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
|
|
||||||
<h2 id="header3">Another H2</h2>
|
|
||||||
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
|
|
||||||
<span class="s1">
|
|
||||||
<a href="#" id="s1a1">span1a1</a>
|
|
||||||
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
|
|
||||||
<span class="span2">
|
|
||||||
<a href="#" id="s2a1">span2a1</a>
|
|
||||||
</span>
|
|
||||||
<span class="span3"></span>
|
|
||||||
<custom-dashed-tag class="dashed" id="dash2"/>
|
|
||||||
<div data-tag="dashedvalue" id="data1"/>
|
|
||||||
</span>
|
|
||||||
</div>
|
|
||||||
<x id="xid">
|
|
||||||
<z id="zida"/>
|
|
||||||
<z id="zidab"/>
|
|
||||||
<z id="zidac"/>
|
|
||||||
</x>
|
|
||||||
<y id="yid">
|
|
||||||
<z id="zidb"/>
|
|
||||||
</y>
|
|
||||||
<p lang="en" id="lang-en">English</p>
|
|
||||||
<p lang="en-gb" id="lang-en-gb">English UK</p>
|
|
||||||
<p lang="en-us" id="lang-en-us">English US</p>
|
|
||||||
<p lang="fr" id="lang-fr">French</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="footer">
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
|
|
||||||
def setup_method(self):
|
|
||||||
self.soup = BeautifulSoup(self.HTML, 'html.parser')
|
|
||||||
|
|
||||||
def assert_selects(self, selector, expected_ids, **kwargs):
|
|
||||||
el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
|
|
||||||
el_ids.sort()
|
|
||||||
expected_ids.sort()
|
|
||||||
assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
|
|
||||||
selector, ', '.join(expected_ids), ', '.join(el_ids)
|
|
||||||
)
|
|
||||||
|
|
||||||
assertSelect = assert_selects
|
|
||||||
|
|
||||||
def assert_select_multiple(self, *tests):
|
|
||||||
for selector, expected_ids in tests:
|
|
||||||
self.assert_selects(selector, expected_ids)
|
|
||||||
|
|
||||||
def test_one_tag_one(self):
|
|
||||||
els = self.soup.select('title')
|
|
||||||
assert len(els) == 1
|
|
||||||
assert els[0].name == 'title'
|
|
||||||
assert els[0].contents == ['The title']
|
|
||||||
|
|
||||||
def test_one_tag_many(self):
|
|
||||||
els = self.soup.select('div')
|
|
||||||
assert len(els) == 4
|
|
||||||
for div in els:
|
|
||||||
assert div.name == 'div'
|
|
||||||
|
|
||||||
el = self.soup.select_one('div')
|
|
||||||
assert 'main' == el['id']
|
|
||||||
|
|
||||||
def test_select_one_returns_none_if_no_match(self):
|
|
||||||
match = self.soup.select_one('nonexistenttag')
|
|
||||||
assert None == match
|
|
||||||
|
|
||||||
|
|
||||||
def test_tag_in_tag_one(self):
|
|
||||||
els = self.soup.select('div div')
|
|
||||||
self.assert_selects('div div', ['inner', 'data1'])
|
|
||||||
|
|
||||||
def test_tag_in_tag_many(self):
|
|
||||||
for selector in ('html div', 'html body div', 'body div'):
|
|
||||||
self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
|
|
||||||
|
|
||||||
|
|
||||||
def test_limit(self):
|
|
||||||
self.assert_selects('html div', ['main'], limit=1)
|
|
||||||
self.assert_selects('html body div', ['inner', 'main'], limit=2)
|
|
||||||
self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
|
|
||||||
limit=10)
|
|
||||||
|
|
||||||
def test_tag_no_match(self):
|
|
||||||
assert len(self.soup.select('del')) == 0
|
|
||||||
|
|
||||||
def test_invalid_tag(self):
|
|
||||||
with pytest.raises(SelectorSyntaxError):
|
|
||||||
self.soup.select('tag%t')
|
|
||||||
|
|
||||||
def test_select_dashed_tag_ids(self):
|
|
||||||
self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])
|
|
||||||
|
|
||||||
def test_select_dashed_by_id(self):
|
|
||||||
dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
|
|
||||||
assert dashed[0].name == 'custom-dashed-tag'
|
|
||||||
assert dashed[0]['id'] == 'dash2'
|
|
||||||
|
|
||||||
def test_dashed_tag_text(self):
|
|
||||||
assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'
|
|
||||||
|
|
||||||
def test_select_dashed_matches_find_all(self):
|
|
||||||
assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')
|
|
||||||
|
|
||||||
def test_header_tags(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('h1', ['header1']),
|
|
||||||
('h2', ['header2', 'header3']),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_class_one(self):
|
|
||||||
for selector in ('.onep', 'p.onep', 'html p.onep'):
|
|
||||||
els = self.soup.select(selector)
|
|
||||||
assert len(els) == 1
|
|
||||||
assert els[0].name == 'p'
|
|
||||||
assert els[0]['class'] == ['onep']
|
|
||||||
|
|
||||||
def test_class_mismatched_tag(self):
|
|
||||||
els = self.soup.select('div.onep')
|
|
||||||
assert len(els) == 0
|
|
||||||
|
|
||||||
def test_one_id(self):
|
|
||||||
for selector in ('div#inner', '#inner', 'div div#inner'):
|
|
||||||
self.assert_selects(selector, ['inner'])
|
|
||||||
|
|
||||||
def test_bad_id(self):
|
|
||||||
els = self.soup.select('#doesnotexist')
|
|
||||||
assert len(els) == 0
|
|
||||||
|
|
||||||
def test_items_in_id(self):
|
|
||||||
els = self.soup.select('div#inner p')
|
|
||||||
assert len(els) == 3
|
|
||||||
for el in els:
|
|
||||||
assert el.name == 'p'
|
|
||||||
assert els[1]['class'] == ['onep']
|
|
||||||
assert not els[0].has_attr('class')
|
|
||||||
|
|
||||||
def test_a_bunch_of_emptys(self):
|
|
||||||
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
|
|
||||||
assert len(self.soup.select(selector)) == 0
|
|
||||||
|
|
||||||
def test_multi_class_support(self):
|
|
||||||
for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
|
|
||||||
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
|
|
||||||
self.assert_selects(selector, ['pmulti'])
|
|
||||||
|
|
||||||
def test_multi_class_selection(self):
|
|
||||||
for selector in ('.class1.class3', '.class3.class2',
|
|
||||||
'.class1.class2.class3'):
|
|
||||||
self.assert_selects(selector, ['pmulti'])
|
|
||||||
|
|
||||||
def test_child_selector(self):
|
|
||||||
self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
|
|
||||||
self.assert_selects('.s1 > a span', ['s1a2s1'])
|
|
||||||
|
|
||||||
def test_child_selector_id(self):
|
|
||||||
self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])
|
|
||||||
|
|
||||||
def test_attribute_equals(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('p[class="onep"]', ['p1']),
|
|
||||||
('p[id="p1"]', ['p1']),
|
|
||||||
('[class="onep"]', ['p1']),
|
|
||||||
('[id="p1"]', ['p1']),
|
|
||||||
('link[rel="stylesheet"]', ['l1']),
|
|
||||||
('link[type="text/css"]', ['l1']),
|
|
||||||
('link[href="blah.css"]', ['l1']),
|
|
||||||
('link[href="no-blah.css"]', []),
|
|
||||||
('[rel="stylesheet"]', ['l1']),
|
|
||||||
('[type="text/css"]', ['l1']),
|
|
||||||
('[href="blah.css"]', ['l1']),
|
|
||||||
('[href="no-blah.css"]', []),
|
|
||||||
('p[href="no-blah.css"]', []),
|
|
||||||
('[href="no-blah.css"]', []),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_tilde(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('p[class~="class1"]', ['pmulti']),
|
|
||||||
('p[class~="class2"]', ['pmulti']),
|
|
||||||
('p[class~="class3"]', ['pmulti']),
|
|
||||||
('[class~="class1"]', ['pmulti']),
|
|
||||||
('[class~="class2"]', ['pmulti']),
|
|
||||||
('[class~="class3"]', ['pmulti']),
|
|
||||||
('a[rel~="friend"]', ['bob']),
|
|
||||||
('a[rel~="met"]', ['bob']),
|
|
||||||
('[rel~="friend"]', ['bob']),
|
|
||||||
('[rel~="met"]', ['bob']),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_startswith(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('[rel^="style"]', ['l1']),
|
|
||||||
('link[rel^="style"]', ['l1']),
|
|
||||||
('notlink[rel^="notstyle"]', []),
|
|
||||||
('[rel^="notstyle"]', []),
|
|
||||||
('link[rel^="notstyle"]', []),
|
|
||||||
('link[href^="bla"]', ['l1']),
|
|
||||||
('a[href^="http://"]', ['bob', 'me']),
|
|
||||||
('[href^="http://"]', ['bob', 'me']),
|
|
||||||
('[id^="p"]', ['pmulti', 'p1']),
|
|
||||||
('[id^="m"]', ['me', 'main']),
|
|
||||||
('div[id^="m"]', ['main']),
|
|
||||||
('a[id^="m"]', ['me']),
|
|
||||||
('div[data-tag^="dashed"]', ['data1'])
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_endswith(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('[href$=".css"]', ['l1']),
|
|
||||||
('link[href$=".css"]', ['l1']),
|
|
||||||
('link[id$="1"]', ['l1']),
|
|
||||||
('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
|
|
||||||
('div[id$="1"]', ['data1']),
|
|
||||||
('[id$="noending"]', []),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_contains(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
# From test_attribute_startswith
|
|
||||||
('[rel*="style"]', ['l1']),
|
|
||||||
('link[rel*="style"]', ['l1']),
|
|
||||||
('notlink[rel*="notstyle"]', []),
|
|
||||||
('[rel*="notstyle"]', []),
|
|
||||||
('link[rel*="notstyle"]', []),
|
|
||||||
('link[href*="bla"]', ['l1']),
|
|
||||||
('[href*="http://"]', ['bob', 'me']),
|
|
||||||
('[id*="p"]', ['pmulti', 'p1']),
|
|
||||||
('div[id*="m"]', ['main']),
|
|
||||||
('a[id*="m"]', ['me']),
|
|
||||||
# From test_attribute_endswith
|
|
||||||
('[href*=".css"]', ['l1']),
|
|
||||||
('link[href*=".css"]', ['l1']),
|
|
||||||
('link[id*="1"]', ['l1']),
|
|
||||||
('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
|
|
||||||
('div[id*="1"]', ['data1']),
|
|
||||||
('[id*="noending"]', []),
|
|
||||||
# New for this test
|
|
||||||
('[href*="."]', ['bob', 'me', 'l1']),
|
|
||||||
('a[href*="."]', ['bob', 'me']),
|
|
||||||
('link[href*="."]', ['l1']),
|
|
||||||
('div[id*="n"]', ['main', 'inner']),
|
|
||||||
('div[id*="nn"]', ['inner']),
|
|
||||||
('div[data-tag*="edval"]', ['data1'])
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_exact_or_hypen(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
|
|
||||||
('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
|
|
||||||
('p[lang|="fr"]', ['lang-fr']),
|
|
||||||
('p[lang|="gb"]', []),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_attribute_exists(self):
|
|
||||||
self.assert_select_multiple(
|
|
||||||
('[rel]', ['l1', 'bob', 'me']),
|
|
||||||
('link[rel]', ['l1']),
|
|
||||||
('a[rel]', ['bob', 'me']),
|
|
||||||
('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
|
|
||||||
('p[class]', ['p1', 'pmulti']),
|
|
||||||
('[blah]', []),
|
|
||||||
('p[blah]', []),
|
|
||||||
('div[data-tag]', ['data1'])
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_quoted_space_in_selector_name(self):
|
|
||||||
html = """<div style="display: wrong">nope</div>
|
|
||||||
<div style="display: right">yes</div>
|
|
||||||
"""
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
[chosen] = soup.select('div[style="display: right"]')
|
|
||||||
assert "yes" == chosen.string
|
|
||||||
|
|
||||||
def test_unsupported_pseudoclass(self):
|
|
||||||
with pytest.raises(NotImplementedError):
|
|
||||||
self.soup.select("a:no-such-pseudoclass")
|
|
||||||
|
|
||||||
with pytest.raises(SelectorSyntaxError):
|
|
||||||
self.soup.select("a:nth-of-type(a)")
|
|
||||||
|
|
||||||
def test_nth_of_type(self):
|
|
||||||
# Try to select first paragraph
|
|
||||||
els = self.soup.select('div#inner p:nth-of-type(1)')
|
|
||||||
assert len(els) == 1
|
|
||||||
assert els[0].string == 'Some text'
|
|
||||||
|
|
||||||
# Try to select third paragraph
|
|
||||||
els = self.soup.select('div#inner p:nth-of-type(3)')
|
|
||||||
assert len(els) == 1
|
|
||||||
assert els[0].string == 'Another'
|
|
||||||
|
|
||||||
# Try to select (non-existent!) fourth paragraph
|
|
||||||
els = self.soup.select('div#inner p:nth-of-type(4)')
|
|
||||||
assert len(els) == 0
|
|
||||||
|
|
||||||
# Zero will select no tags.
|
|
||||||
els = self.soup.select('div p:nth-of-type(0)')
|
|
||||||
assert len(els) == 0
|
|
||||||
|
|
||||||
def test_nth_of_type_direct_descendant(self):
|
|
||||||
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
|
||||||
assert len(els) == 1
|
|
||||||
assert els[0].string == 'Some text'
|
|
||||||
|
|
||||||
def test_id_child_selector_nth_of_type(self):
|
|
||||||
self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])
|
|
||||||
|
|
||||||
def test_select_on_element(self):
|
|
||||||
# Other tests operate on the tree; this operates on an element
|
|
||||||
# within the tree.
|
|
||||||
inner = self.soup.find("div", id="main")
|
|
||||||
selected = inner.select("div")
|
|
||||||
# The <div id="inner"> tag was selected. The <div id="footer">
|
|
||||||
# tag was not.
|
|
||||||
self.assert_selects_ids(selected, ['inner', 'data1'])
|
|
||||||
|
|
||||||
def test_overspecified_child_id(self):
|
|
||||||
self.assert_selects(".fancy #inner", ['inner'])
|
|
||||||
self.assert_selects(".normal #inner", [])
|
|
||||||
|
|
||||||
def test_adjacent_sibling_selector(self):
|
|
||||||
self.assert_selects('#p1 + h2', ['header2'])
|
|
||||||
self.assert_selects('#p1 + h2 + p', ['pmulti'])
|
|
||||||
self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
|
|
||||||
assert [] == self.soup.select('#p1 + p')
|
|
||||||
|
|
||||||
def test_general_sibling_selector(self):
|
|
||||||
self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
|
|
||||||
self.assert_selects('#p1 ~ #header2', ['header2'])
|
|
||||||
self.assert_selects('#p1 ~ h2 + a', ['me'])
|
|
||||||
self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
|
|
||||||
assert [] == self.soup.select('#inner ~ h2')
|
|
||||||
|
|
||||||
def test_dangling_combinator(self):
|
|
||||||
with pytest.raises(SelectorSyntaxError):
|
|
||||||
self.soup.select('h1 >')
|
|
||||||
|
|
||||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
|
||||||
self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
|
||||||
|
|
||||||
# Test the selector grouping operator (the comma)
|
|
||||||
def test_multiple_select(self):
|
|
||||||
self.assert_selects('x, y', ['xid', 'yid'])
|
|
||||||
|
|
||||||
def test_multiple_select_with_no_space(self):
|
|
||||||
self.assert_selects('x,y', ['xid', 'yid'])
|
|
||||||
|
|
||||||
def test_multiple_select_with_more_space(self):
|
|
||||||
self.assert_selects('x, y', ['xid', 'yid'])
|
|
||||||
|
|
||||||
def test_multiple_select_duplicated(self):
|
|
||||||
self.assert_selects('x, x', ['xid'])
|
|
||||||
|
|
||||||
def test_multiple_select_sibling(self):
|
|
||||||
self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
|
|
||||||
|
|
||||||
def test_multiple_select_tag_and_direct_descendant(self):
|
|
||||||
self.assert_selects('x, y > z', ['xid', 'zidb'])
|
|
||||||
|
|
||||||
def test_multiple_select_direct_descendant_and_tags(self):
|
|
||||||
self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
|
||||||
|
|
||||||
def test_multiple_select_indirect_descendant(self):
|
|
||||||
self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
|
||||||
|
|
||||||
def test_invalid_multiple_select(self):
|
|
||||||
with pytest.raises(SelectorSyntaxError):
|
|
||||||
self.soup.select(',x, y')
|
|
||||||
with pytest.raises(SelectorSyntaxError):
|
|
||||||
self.soup.select('x,,y')
|
|
||||||
|
|
||||||
def test_multiple_select_attrs(self):
|
|
||||||
self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
|
||||||
|
|
||||||
def test_multiple_select_ids(self):
|
|
||||||
self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
|
|
||||||
|
|
||||||
def test_multiple_select_nested(self):
|
|
||||||
self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])
|
|
||||||
|
|
||||||
def test_select_duplicate_elements(self):
|
|
||||||
# When markup contains duplicate elements, a multiple select
|
|
||||||
# will find all of them.
|
|
||||||
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
|
|
||||||
soup = BeautifulSoup(markup, 'html.parser')
|
|
||||||
selected = soup.select(".c1, .c2")
|
|
||||||
assert 3 == len(selected)
|
|
||||||
|
|
||||||
# Verify that find_all finds the same elements, though because
|
|
||||||
# of an implementation detail it finds them in a different
|
|
||||||
# order.
|
|
||||||
for element in soup.find_all(class_=['c1', 'c2']):
|
|
||||||
assert element in selected
|
|
||||||
|
|
||||||
|
|
||||||
class TestPersistence(SoupTest):
|
class TestPersistence(SoupTest):
|
||||||
"Testing features like pickle and deepcopy."
|
"Testing features like pickle and deepcopy."
|
||||||
|
|
||||||
|
@ -668,12 +278,24 @@ class TestPersistence(SoupTest):
|
||||||
loaded = pickle.loads(dumped)
|
loaded = pickle.loads(dumped)
|
||||||
assert loaded.__class__ == BeautifulSoup
|
assert loaded.__class__ == BeautifulSoup
|
||||||
assert loaded.decode() == self.tree.decode()
|
assert loaded.decode() == self.tree.decode()
|
||||||
|
|
||||||
def test_deepcopy_identity(self):
|
def test_deepcopy_identity(self):
|
||||||
# Making a deepcopy of a tree yields an identical tree.
|
# Making a deepcopy of a tree yields an identical tree.
|
||||||
copied = copy.deepcopy(self.tree)
|
copied = copy.deepcopy(self.tree)
|
||||||
assert copied.decode() == self.tree.decode()
|
assert copied.decode() == self.tree.decode()
|
||||||
|
|
||||||
|
def test_copy_deeply_nested_document(self):
|
||||||
|
# This test verifies that copy and deepcopy don't involve any
|
||||||
|
# recursive function calls. If they did, this test would
|
||||||
|
# overflow the Python interpreter stack.
|
||||||
|
limit = sys.getrecursionlimit() + 1
|
||||||
|
markup = "<span>" * limit
|
||||||
|
|
||||||
|
soup = self.soup(markup)
|
||||||
|
|
||||||
|
copied = copy.copy(soup)
|
||||||
|
copied = copy.deepcopy(soup)
|
||||||
|
|
||||||
def test_copy_preserves_encoding(self):
|
def test_copy_preserves_encoding(self):
|
||||||
soup = BeautifulSoup(b'<p> </p>', 'html.parser')
|
soup = BeautifulSoup(b'<p> </p>', 'html.parser')
|
||||||
encoding = soup.original_encoding
|
encoding = soup.original_encoding
|
||||||
|
|
|
@ -24,6 +24,7 @@ from bs4.builder import (
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
PYTHON_SPECIFIC_ENCODINGS,
|
||||||
Tag,
|
Tag,
|
||||||
NavigableString,
|
NavigableString,
|
||||||
)
|
)
|
||||||
|
@ -210,6 +211,47 @@ class TestConstructor(SoupTest):
|
||||||
assert [] == soup.string_container_stack
|
assert [] == soup.string_container_stack
|
||||||
|
|
||||||
|
|
||||||
|
class TestOutput(SoupTest):
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"eventual_encoding,actual_encoding", [
|
||||||
|
("utf-8", "utf-8"),
|
||||||
|
("utf-16", "utf-16"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
|
||||||
|
# Most of the time, calling decode() on an XML document will
|
||||||
|
# give you a document declaration that mentions the encoding
|
||||||
|
# you intend to use when encoding the document as a
|
||||||
|
# bytestring.
|
||||||
|
soup = self.soup("<tag></tag>")
|
||||||
|
soup.is_xml = True
|
||||||
|
assert (f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
|
||||||
|
== soup.decode(eventual_encoding=eventual_encoding))
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None]
|
||||||
|
)
|
||||||
|
def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(self, eventual_encoding):
|
||||||
|
# But if you pass a Python internal encoding into decode(), or
|
||||||
|
# omit the eventual_encoding altogether, the document
|
||||||
|
# declaration won't mention any particular encoding.
|
||||||
|
soup = BeautifulSoup("<tag></tag>", "html.parser")
|
||||||
|
soup.is_xml = True
|
||||||
|
assert (f'<?xml version="1.0"?>\n<tag></tag>'
|
||||||
|
== soup.decode(eventual_encoding=eventual_encoding))
|
||||||
|
|
||||||
|
def test(self):
|
||||||
|
# BeautifulSoup subclasses Tag and extends the decode() method.
|
||||||
|
# Make sure the other Tag methods which call decode() call
|
||||||
|
# it correctly.
|
||||||
|
soup = self.soup("<tag></tag>")
|
||||||
|
assert b"<tag></tag>" == soup.encode(encoding="utf-8")
|
||||||
|
assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
|
||||||
|
assert "<tag></tag>" == soup.decode_contents()
|
||||||
|
assert "<tag>\n</tag>\n" == soup.prettify()
|
||||||
|
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
class TestWarnings(SoupTest):
|
||||||
# Note that some of the tests in this class create BeautifulSoup
|
# Note that some of the tests in this class create BeautifulSoup
|
||||||
# objects directly rather than using self.soup(). That's
|
# objects directly rather than using self.soup(). That's
|
||||||
|
|
|
@ -32,7 +32,7 @@ from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
||||||
import bs4 # type: ignore[import]
|
import bs4 # type: ignore[import]
|
||||||
from typing import Optional, Any, Iterator, Iterable
|
from typing import Any, Iterator, Iterable
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||||
|
@ -45,10 +45,10 @@ SoupSieve = cm.SoupSieve
|
||||||
|
|
||||||
def compile( # noqa: A001
|
def compile( # noqa: A001
|
||||||
pattern: str,
|
pattern: str,
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> cm.SoupSieve:
|
) -> cm.SoupSieve:
|
||||||
"""Compile CSS pattern."""
|
"""Compile CSS pattern."""
|
||||||
|
@ -79,10 +79,10 @@ def purge() -> None:
|
||||||
def closest(
|
def closest(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> 'bs4.Tag':
|
) -> 'bs4.Tag':
|
||||||
"""Match closest ancestor."""
|
"""Match closest ancestor."""
|
||||||
|
@ -93,10 +93,10 @@ def closest(
|
||||||
def match(
|
def match(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Match node."""
|
"""Match node."""
|
||||||
|
@ -107,10 +107,10 @@ def match(
|
||||||
def filter( # noqa: A001
|
def filter( # noqa: A001
|
||||||
select: str,
|
select: str,
|
||||||
iterable: Iterable['bs4.Tag'],
|
iterable: Iterable['bs4.Tag'],
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> list['bs4.Tag']:
|
) -> list['bs4.Tag']:
|
||||||
"""Filter list of nodes."""
|
"""Filter list of nodes."""
|
||||||
|
@ -121,10 +121,10 @@ def filter( # noqa: A001
|
||||||
def select_one(
|
def select_one(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> 'bs4.Tag':
|
) -> 'bs4.Tag':
|
||||||
"""Select a single tag."""
|
"""Select a single tag."""
|
||||||
|
@ -135,11 +135,11 @@ def select_one(
|
||||||
def select(
|
def select(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> list['bs4.Tag']:
|
) -> list['bs4.Tag']:
|
||||||
"""Select the specified tags."""
|
"""Select the specified tags."""
|
||||||
|
@ -150,11 +150,11 @@ def select(
|
||||||
def iselect(
|
def iselect(
|
||||||
select: str,
|
select: str,
|
||||||
tag: 'bs4.Tag',
|
tag: 'bs4.Tag',
|
||||||
namespaces: Optional[dict[str, str]] = None,
|
namespaces: dict[str, str] | None = None,
|
||||||
limit: int = 0,
|
limit: int = 0,
|
||||||
flags: int = 0,
|
flags: int = 0,
|
||||||
*,
|
*,
|
||||||
custom: Optional[dict[str, str]] = None,
|
custom: dict[str, str] | None = None,
|
||||||
**kwargs: Any
|
**kwargs: Any
|
||||||
) -> Iterator['bs4.Tag']:
|
) -> Iterator['bs4.Tag']:
|
||||||
"""Iterate the specified tags."""
|
"""Iterate the specified tags."""
|
||||||
|
|
|
@ -193,5 +193,5 @@ def parse_version(ver: str) -> Version:
|
||||||
return Version(major, minor, micro, release, pre, post, dev)
|
return Version(major, minor, micro, release, pre, post, dev)
|
||||||
|
|
||||||
|
|
||||||
__version_info__ = Version(2, 4, 0, "final")
|
__version_info__ = Version(2, 4, 1, "final")
|
||||||
__version__ = __version_info__._get_canonical()
|
__version__ = __version_info__._get_canonical()
|
||||||
|
|
|
@ -6,7 +6,7 @@ import re
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import bs4 # type: ignore[import]
|
import bs4 # type: ignore[import]
|
||||||
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
|
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
|
||||||
|
|
||||||
# Empty tag pattern (whitespace okay)
|
# Empty tag pattern (whitespace okay)
|
||||||
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||||||
|
@ -171,7 +171,7 @@ class _DocumentNav:
|
||||||
def get_children(
|
def get_children(
|
||||||
self,
|
self,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
start: Optional[int] = None,
|
start: int | None = None,
|
||||||
reverse: bool = False,
|
reverse: bool = False,
|
||||||
tags: bool = True,
|
tags: bool = True,
|
||||||
no_iframe: bool = False
|
no_iframe: bool = False
|
||||||
|
@ -239,22 +239,22 @@ class _DocumentNav:
|
||||||
return parent
|
return parent
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_tag_name(el: bs4.Tag) -> Optional[str]:
|
def get_tag_name(el: bs4.Tag) -> str | None:
|
||||||
"""Get tag."""
|
"""Get tag."""
|
||||||
|
|
||||||
return cast(Optional[str], el.name)
|
return cast('str | None', el.name)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_prefix_name(el: bs4.Tag) -> Optional[str]:
|
def get_prefix_name(el: bs4.Tag) -> str | None:
|
||||||
"""Get prefix."""
|
"""Get prefix."""
|
||||||
|
|
||||||
return cast(Optional[str], el.prefix)
|
return cast('str | None', el.prefix)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_uri(el: bs4.Tag) -> Optional[str]:
|
def get_uri(el: bs4.Tag) -> str | None:
|
||||||
"""Get namespace `URI`."""
|
"""Get namespace `URI`."""
|
||||||
|
|
||||||
return cast(Optional[str], el.namespace)
|
return cast('str | None', el.namespace)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
||||||
|
@ -287,7 +287,7 @@ class _DocumentNav:
|
||||||
return bool(ns and ns == NS_XHTML)
|
return bool(ns and ns == NS_XHTML)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
|
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
|
||||||
"""Return namespace and attribute name without the prefix."""
|
"""Return namespace and attribute name without the prefix."""
|
||||||
|
|
||||||
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
||||||
|
@ -330,8 +330,8 @@ class _DocumentNav:
|
||||||
cls,
|
cls,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
name: str,
|
name: str,
|
||||||
default: Optional[str | Sequence[str]] = None
|
default: str | Sequence[str] | None = None
|
||||||
) -> Optional[str | Sequence[str]]:
|
) -> str | Sequence[str] | None:
|
||||||
"""Get attribute by name."""
|
"""Get attribute by name."""
|
||||||
|
|
||||||
value = default
|
value = default
|
||||||
|
@ -348,7 +348,7 @@ class _DocumentNav:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
|
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
|
||||||
"""Iterate attributes."""
|
"""Iterate attributes."""
|
||||||
|
|
||||||
for k, v in el.attrs.items():
|
for k, v in el.attrs.items():
|
||||||
|
@ -424,10 +424,10 @@ class Inputs:
|
||||||
return 0 <= minutes <= 59
|
return 0 <= minutes <= 59
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
|
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
|
||||||
"""Parse the input value."""
|
"""Parse the input value."""
|
||||||
|
|
||||||
parsed = None # type: Optional[tuple[float, ...]]
|
parsed = None # type: tuple[float, ...] | None
|
||||||
if value is None:
|
if value is None:
|
||||||
return value
|
return value
|
||||||
if itype == "date":
|
if itype == "date":
|
||||||
|
@ -486,7 +486,7 @@ class CSSMatch(_DocumentNav):
|
||||||
self,
|
self,
|
||||||
selectors: ct.SelectorList,
|
selectors: ct.SelectorList,
|
||||||
scope: bs4.Tag,
|
scope: bs4.Tag,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
flags: int
|
flags: int
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
@ -545,19 +545,19 @@ class CSSMatch(_DocumentNav):
|
||||||
|
|
||||||
return self.get_tag_ns(el) == NS_XHTML
|
return self.get_tag_ns(el) == NS_XHTML
|
||||||
|
|
||||||
def get_tag(self, el: bs4.Tag) -> Optional[str]:
|
def get_tag(self, el: bs4.Tag) -> str | None:
|
||||||
"""Get tag."""
|
"""Get tag."""
|
||||||
|
|
||||||
name = self.get_tag_name(el)
|
name = self.get_tag_name(el)
|
||||||
return util.lower(name) if name is not None and not self.is_xml else name
|
return util.lower(name) if name is not None and not self.is_xml else name
|
||||||
|
|
||||||
def get_prefix(self, el: bs4.Tag) -> Optional[str]:
|
def get_prefix(self, el: bs4.Tag) -> str | None:
|
||||||
"""Get prefix."""
|
"""Get prefix."""
|
||||||
|
|
||||||
prefix = self.get_prefix_name(el)
|
prefix = self.get_prefix_name(el)
|
||||||
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
||||||
|
|
||||||
def find_bidi(self, el: bs4.Tag) -> Optional[int]:
|
def find_bidi(self, el: bs4.Tag) -> int | None:
|
||||||
"""Get directionality from element text."""
|
"""Get directionality from element text."""
|
||||||
|
|
||||||
for node in self.get_children(el, tags=False):
|
for node in self.get_children(el, tags=False):
|
||||||
|
@ -653,8 +653,8 @@ class CSSMatch(_DocumentNav):
|
||||||
self,
|
self,
|
||||||
el: bs4.Tag,
|
el: bs4.Tag,
|
||||||
attr: str,
|
attr: str,
|
||||||
prefix: Optional[str]
|
prefix: str | None
|
||||||
) -> Optional[str | Sequence[str]]:
|
) -> str | Sequence[str] | None:
|
||||||
"""Match attribute name and return value if it exists."""
|
"""Match attribute name and return value if it exists."""
|
||||||
|
|
||||||
value = None
|
value = None
|
||||||
|
@ -751,7 +751,7 @@ class CSSMatch(_DocumentNav):
|
||||||
name not in (self.get_tag(el), '*')
|
name not in (self.get_tag(el), '*')
|
||||||
)
|
)
|
||||||
|
|
||||||
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
|
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
|
||||||
"""Match the tag."""
|
"""Match the tag."""
|
||||||
|
|
||||||
match = True
|
match = True
|
||||||
|
@ -1030,7 +1030,7 @@ class CSSMatch(_DocumentNav):
|
||||||
"""Match element if it contains text."""
|
"""Match element if it contains text."""
|
||||||
|
|
||||||
match = True
|
match = True
|
||||||
content = None # type: Optional[str | Sequence[str]]
|
content = None # type: str | Sequence[str] | None
|
||||||
for contain_list in contains:
|
for contain_list in contains:
|
||||||
if content is None:
|
if content is None:
|
||||||
if contain_list.own:
|
if contain_list.own:
|
||||||
|
@ -1099,7 +1099,7 @@ class CSSMatch(_DocumentNav):
|
||||||
match = False
|
match = False
|
||||||
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
||||||
|
|
||||||
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
|
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
|
||||||
"""Find this input's form."""
|
"""Find this input's form."""
|
||||||
form = None
|
form = None
|
||||||
parent = self.get_parent(el, no_iframe=True)
|
parent = self.get_parent(el, no_iframe=True)
|
||||||
|
@ -1478,7 +1478,7 @@ class CSSMatch(_DocumentNav):
|
||||||
if lim < 1:
|
if lim < 1:
|
||||||
break
|
break
|
||||||
|
|
||||||
def closest(self) -> Optional[bs4.Tag]:
|
def closest(self) -> bs4.Tag | None:
|
||||||
"""Match closest ancestor."""
|
"""Match closest ancestor."""
|
||||||
|
|
||||||
current = self.tag
|
current = self.tag
|
||||||
|
@ -1506,7 +1506,7 @@ class SoupSieve(ct.Immutable):
|
||||||
|
|
||||||
pattern: str
|
pattern: str
|
||||||
selectors: ct.SelectorList
|
selectors: ct.SelectorList
|
||||||
namespaces: Optional[ct.Namespaces]
|
namespaces: ct.Namespaces | None
|
||||||
custom: dict[str, str]
|
custom: dict[str, str]
|
||||||
flags: int
|
flags: int
|
||||||
|
|
||||||
|
@ -1516,8 +1516,8 @@ class SoupSieve(ct.Immutable):
|
||||||
self,
|
self,
|
||||||
pattern: str,
|
pattern: str,
|
||||||
selectors: ct.SelectorList,
|
selectors: ct.SelectorList,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
custom: Optional[ct.CustomSelectors],
|
custom: ct.CustomSelectors | None,
|
||||||
flags: int
|
flags: int
|
||||||
):
|
):
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
|
@ -7,7 +7,7 @@ from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import SelectorSyntaxError
|
from .util import SelectorSyntaxError
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Optional, Match, Any, Iterator, cast
|
from typing import Match, Any, Iterator, cast
|
||||||
|
|
||||||
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||||||
|
|
||||||
|
@ -113,7 +113,7 @@ VALUE = r'''
|
||||||
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
||||||
# Attribute value comparison. `!=` is handled special as it is non-standard.
|
# Attribute value comparison. `!=` is handled special as it is non-standard.
|
||||||
ATTR = r'''
|
ATTR = r'''
|
||||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}*(?P<case>[is]))?)?{ws}*\]
|
||||||
'''.format(ws=WSC, value=VALUE)
|
'''.format(ws=WSC, value=VALUE)
|
||||||
|
|
||||||
# Selector patterns
|
# Selector patterns
|
||||||
|
@ -207,8 +207,8 @@ _MAXCACHE = 500
|
||||||
@lru_cache(maxsize=_MAXCACHE)
|
@lru_cache(maxsize=_MAXCACHE)
|
||||||
def _cached_css_compile(
|
def _cached_css_compile(
|
||||||
pattern: str,
|
pattern: str,
|
||||||
namespaces: Optional[ct.Namespaces],
|
namespaces: ct.Namespaces | None,
|
||||||
custom: Optional[ct.CustomSelectors],
|
custom: ct.CustomSelectors | None,
|
||||||
flags: int
|
flags: int
|
||||||
) -> cm.SoupSieve:
|
) -> cm.SoupSieve:
|
||||||
"""Cached CSS compile."""
|
"""Cached CSS compile."""
|
||||||
|
@ -233,7 +233,7 @@ def _purge_cache() -> None:
|
||||||
_cached_css_compile.cache_clear()
|
_cached_css_compile.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
|
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
|
||||||
"""Process custom."""
|
"""Process custom."""
|
||||||
|
|
||||||
custom_selectors = {}
|
custom_selectors = {}
|
||||||
|
@ -317,7 +317,7 @@ class SelectorPattern:
|
||||||
|
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
return self.re_pattern.match(selector, index)
|
return self.re_pattern.match(selector, index)
|
||||||
|
@ -336,7 +336,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
for pseudo in p[1]:
|
for pseudo in p[1]:
|
||||||
self.patterns[pseudo] = pattern
|
self.patterns[pseudo] = pattern
|
||||||
|
|
||||||
self.matched_name = None # type: Optional[SelectorPattern]
|
self.matched_name = None # type: SelectorPattern | None
|
||||||
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
||||||
|
|
||||||
def get_name(self) -> str:
|
def get_name(self) -> str:
|
||||||
|
@ -344,7 +344,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
|
|
||||||
return '' if self.matched_name is None else self.matched_name.get_name()
|
return '' if self.matched_name is None else self.matched_name.get_name()
|
||||||
|
|
||||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
pseudo = None
|
pseudo = None
|
||||||
|
@ -372,14 +372,14 @@ class _Selector:
|
||||||
def __init__(self, **kwargs: Any) -> None:
|
def __init__(self, **kwargs: Any) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
|
self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
|
||||||
self.ids = kwargs.get('ids', []) # type: list[str]
|
self.ids = kwargs.get('ids', []) # type: list[str]
|
||||||
self.classes = kwargs.get('classes', []) # type: list[str]
|
self.classes = kwargs.get('classes', []) # type: list[str]
|
||||||
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
|
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
|
||||||
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
|
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
|
||||||
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
|
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
|
||||||
self.relations = kwargs.get('relations', []) # type: list[_Selector]
|
self.relations = kwargs.get('relations', []) # type: list[_Selector]
|
||||||
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
|
self.rel_type = kwargs.get('rel_type', None) # type: str | None
|
||||||
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
|
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
|
||||||
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
|
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
|
||||||
self.flags = kwargs.get('flags', 0) # type: int
|
self.flags = kwargs.get('flags', 0) # type: int
|
||||||
|
@ -462,7 +462,7 @@ class CSSParser:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
selector: str,
|
selector: str,
|
||||||
custom: Optional[dict[str, str | ct.SelectorList]] = None,
|
custom: dict[str, str | ct.SelectorList] | None = None,
|
||||||
flags: int = 0
|
flags: int = 0
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import copyreg
|
import copyreg
|
||||||
from .pretty import pretty
|
from .pretty import pretty
|
||||||
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
|
from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'Selector',
|
'Selector',
|
||||||
|
@ -189,28 +189,28 @@ class Selector(Immutable):
|
||||||
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
||||||
)
|
)
|
||||||
|
|
||||||
tag: Optional[SelectorTag]
|
tag: SelectorTag | None
|
||||||
ids: tuple[str, ...]
|
ids: tuple[str, ...]
|
||||||
classes: tuple[str, ...]
|
classes: tuple[str, ...]
|
||||||
attributes: tuple[SelectorAttribute, ...]
|
attributes: tuple[SelectorAttribute, ...]
|
||||||
nth: tuple[SelectorNth, ...]
|
nth: tuple[SelectorNth, ...]
|
||||||
selectors: tuple[SelectorList, ...]
|
selectors: tuple[SelectorList, ...]
|
||||||
relation: SelectorList
|
relation: SelectorList
|
||||||
rel_type: Optional[str]
|
rel_type: str | None
|
||||||
contains: tuple[SelectorContains, ...]
|
contains: tuple[SelectorContains, ...]
|
||||||
lang: tuple[SelectorLang, ...]
|
lang: tuple[SelectorLang, ...]
|
||||||
flags: int
|
flags: int
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
tag: Optional[SelectorTag],
|
tag: SelectorTag | None,
|
||||||
ids: tuple[str, ...],
|
ids: tuple[str, ...],
|
||||||
classes: tuple[str, ...],
|
classes: tuple[str, ...],
|
||||||
attributes: tuple[SelectorAttribute, ...],
|
attributes: tuple[SelectorAttribute, ...],
|
||||||
nth: tuple[SelectorNth, ...],
|
nth: tuple[SelectorNth, ...],
|
||||||
selectors: tuple[SelectorList, ...],
|
selectors: tuple[SelectorList, ...],
|
||||||
relation: SelectorList,
|
relation: SelectorList,
|
||||||
rel_type: Optional[str],
|
rel_type: str | None,
|
||||||
contains: tuple[SelectorContains, ...],
|
contains: tuple[SelectorContains, ...],
|
||||||
lang: tuple[SelectorLang, ...],
|
lang: tuple[SelectorLang, ...],
|
||||||
flags: int
|
flags: int
|
||||||
|
@ -247,9 +247,9 @@ class SelectorTag(Immutable):
|
||||||
__slots__ = ("name", "prefix", "_hash")
|
__slots__ = ("name", "prefix", "_hash")
|
||||||
|
|
||||||
name: str
|
name: str
|
||||||
prefix: Optional[str]
|
prefix: str | None
|
||||||
|
|
||||||
def __init__(self, name: str, prefix: Optional[str]) -> None:
|
def __init__(self, name: str, prefix: str | None) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
super().__init__(name=name, prefix=prefix)
|
super().__init__(name=name, prefix=prefix)
|
||||||
|
@ -262,15 +262,15 @@ class SelectorAttribute(Immutable):
|
||||||
|
|
||||||
attribute: str
|
attribute: str
|
||||||
prefix: str
|
prefix: str
|
||||||
pattern: Optional[Pattern[str]]
|
pattern: Pattern[str] | None
|
||||||
xml_type_pattern: Optional[Pattern[str]]
|
xml_type_pattern: Pattern[str] | None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
attribute: str,
|
attribute: str,
|
||||||
prefix: str,
|
prefix: str,
|
||||||
pattern: Optional[Pattern[str]],
|
pattern: Pattern[str] | None,
|
||||||
xml_type_pattern: Optional[Pattern[str]]
|
xml_type_pattern: Pattern[str] | None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
|
@ -360,7 +360,7 @@ class SelectorList(Immutable):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
|
selectors: Iterable[Selector | SelectorNull] | None = None,
|
||||||
is_not: bool = False,
|
is_not: bool = False,
|
||||||
is_html: bool = False
|
is_html: bool = False
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||||
from functools import wraps, lru_cache
|
from functools import wraps, lru_cache
|
||||||
import warnings
|
import warnings
|
||||||
import re
|
import re
|
||||||
from typing import Callable, Any, Optional
|
from typing import Callable, Any
|
||||||
|
|
||||||
DEBUG = 0x00001
|
DEBUG = 0x00001
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ def lower(string: str) -> str:
|
||||||
class SelectorSyntaxError(Exception):
|
class SelectorSyntaxError(Exception):
|
||||||
"""Syntax error in a CSS selector."""
|
"""Syntax error in a CSS selector."""
|
||||||
|
|
||||||
def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None:
|
def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None:
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
self.line = None
|
self.line = None
|
||||||
|
@ -84,7 +84,7 @@ def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
|
||||||
col = 1
|
col = 1
|
||||||
text = [] # type: list[str]
|
text = [] # type: list[str]
|
||||||
line = 1
|
line = 1
|
||||||
offset = None # type: Optional[int]
|
offset = None # type: int | None
|
||||||
|
|
||||||
# Split pattern by newline and handle the text before the newline
|
# Split pattern by newline and handle the text before the newline
|
||||||
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
||||||
|
|
|
@ -4,7 +4,7 @@ arrow==1.2.3
|
||||||
backports.csv==1.0.7
|
backports.csv==1.0.7
|
||||||
backports.functools-lru-cache==1.6.4
|
backports.functools-lru-cache==1.6.4
|
||||||
backports.zoneinfo==0.2.1;python_version<"3.9"
|
backports.zoneinfo==0.2.1;python_version<"3.9"
|
||||||
beautifulsoup4==4.11.2
|
beautifulsoup4==4.12.2
|
||||||
bleach==6.0.0
|
bleach==6.0.0
|
||||||
certifi==2022.12.7
|
certifi==2022.12.7
|
||||||
cheroot==9.0.0
|
cheroot==9.0.0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue