mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-06 05:01:14 -07:00
Bump beautifulsoup4 from 4.11.2 to 4.12.2 (#2037)
* Bump beautifulsoup4 from 4.11.2 to 4.12.2 Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.2 to 4.12.2. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * Update beautifulsoup4==4.12.2 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
This commit is contained in:
parent
1798594569
commit
e70e08c3f5
32 changed files with 1439 additions and 755 deletions
|
@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.11.2"
|
||||
__version__ = "4.12.2"
|
||||
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
@ -38,11 +38,13 @@ from .builder import (
|
|||
builder_registry,
|
||||
ParserRejectedMarkup,
|
||||
XMLParsedAsHTMLWarning,
|
||||
HTMLParserTreeBuilder
|
||||
)
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
CSS,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
|
@ -348,26 +350,50 @@ class BeautifulSoup(Tag):
|
|||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def __copy__(self):
|
||||
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
|
||||
copy = type(self)(
|
||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||
)
|
||||
def _clone(self):
|
||||
"""Create a new BeautifulSoup object with the same TreeBuilder,
|
||||
but not associated with any markup.
|
||||
|
||||
# Although we encoded the tree to UTF-8, that may not have
|
||||
# been the encoding of the original markup. Set the copy's
|
||||
# .original_encoding to reflect the original object's
|
||||
# .original_encoding.
|
||||
copy.original_encoding = self.original_encoding
|
||||
return copy
|
||||
This is the first step of the deepcopy process.
|
||||
"""
|
||||
clone = type(self)("", None, self.builder)
|
||||
|
||||
# Keep track of the encoding of the original document,
|
||||
# since we won't be parsing it again.
|
||||
clone.original_encoding = self.original_encoding
|
||||
return clone
|
||||
|
||||
def __getstate__(self):
|
||||
# Frequently a tree builder can't be pickled.
|
||||
d = dict(self.__dict__)
|
||||
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
||||
d['builder'] = None
|
||||
d['builder'] = type(self.builder)
|
||||
# Store the contents as a Unicode string.
|
||||
d['contents'] = []
|
||||
d['markup'] = self.decode()
|
||||
|
||||
# If _most_recent_element is present, it's a Tag object left
|
||||
# over from initial parse. It might not be picklable and we
|
||||
# don't need it.
|
||||
if '_most_recent_element' in d:
|
||||
del d['_most_recent_element']
|
||||
return d
|
||||
|
||||
def __setstate__(self, state):
    """Restore this object from pickled state and re-parse its markup.

    The ``builder`` entry of `state` may be a TreeBuilder class (it is
    instantiated), a falsy placeholder (a default HTMLParserTreeBuilder
    is used instead), or an existing builder object (used as is).

    :param state: The dictionary that becomes this object's ``__dict__``.
    :return: The same `state` object that was passed in.
    """
    self.__dict__ = state

    restored = self.builder
    if isinstance(restored, type):
        # Only the builder's class was stored; create a fresh instance.
        self.builder = restored()
    elif not restored:
        # We don't know which builder was used to build this
        # parse tree, so use a default we know is always available.
        self.builder = HTMLParserTreeBuilder()

    # Re-attach the builder to this object, then rebuild the tree.
    self.builder.soup = self
    self.reset()
    self._feed()
    return state
|
||||
|
||||
|
||||
@classmethod
|
||||
def _decode_markup(cls, markup):
|
||||
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
|
||||
|
@ -468,6 +494,7 @@ class BeautifulSoup(Tag):
|
|||
self.open_tag_counter = Counter()
|
||||
self.preserve_whitespace_tag_stack = []
|
||||
self.string_container_stack = []
|
||||
self._most_recent_element = None
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||
|
@ -749,7 +776,7 @@ class BeautifulSoup(Tag):
|
|||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
formatter="minimal", iterator=None):
|
||||
"""Returns a string or Unicode representation of the parse tree
|
||||
as an HTML or XML document.
|
||||
|
||||
|
@ -776,7 +803,7 @@ class BeautifulSoup(Tag):
|
|||
else:
|
||||
indent_level = 0
|
||||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
indent_level, eventual_encoding, formatter, iterator)
|
||||
|
||||
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
|
|
|
@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit
|
|||
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
ParserRejectedMarkup,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
|
@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
|||
|
||||
self._initialize_xml_detector()
|
||||
|
||||
def error(self, message):
    """Report that html.parser cannot handle the markup.

    :param message: Description of the problem; it is passed to
        ParserRejectedMarkup unchanged.
    :raise ParserRejectedMarkup: Always.
    """
    # NOTE: This method is required so long as Python 3.9 is
    # supported. The corresponding code is removed from HTMLParser
    # in 3.5, but not removed from ParserBase until 3.10.
    # https://github.com/python/cpython/issues/76025
    #
    # The original implementation turned the error into a warning,
    # but in every case I discovered, this made HTMLParser
    # immediately crash with an error message that was less
    # helpful than the warning. The new implementation makes it
    # clearer that html.parser just can't parse this
    # markup. The 3.10 implementation does the same, though it
    # raises AssertionError rather than calling a method. (We
    # catch this error and wrap it in a ParserRejectedMarkup.)
    raise ParserRejectedMarkup(message)
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
"""Handle an incoming empty-element tag.
|
||||
|
||||
|
@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except AssertionError as e:
|
||||
# html.parser raises AssertionError in rare cases to
|
||||
# indicate a fatal problem with the markup, especially
|
||||
# when there's an error in the doctype declaration.
|
||||
raise ParserRejectedMarkup(e)
|
||||
parser.close()
|
||||
parser.already_closed_empty_element = []
|
||||
|
|
280
lib/bs4/css.py
Normal file
280
lib/bs4/css.py
Normal file
|
@ -0,0 +1,280 @@
|
|||
"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve)."""
|
||||
|
||||
import warnings

try:
    import soupsieve
except ImportError:
    soupsieve = None
    warnings.warn(
        'The soupsieve package is not installed. CSS selectors cannot be used.'
    )


class CSS(object):
    """A proxy object against the soupsieve library, to simplify its
    CSS selector API.

    Acquire this object through the .css attribute on the
    BeautifulSoup object, or on the Tag you want to use as the
    starting point for a CSS selector.

    The main advantage of doing this is that the tag to be selected
    against doesn't need to be explicitly specified in the function
    calls, since it's already scoped to a tag.
    """

    def __init__(self, tag, api=soupsieve):
        """Constructor.

        You don't need to instantiate this class yourself; instead,
        access the .css attribute on the BeautifulSoup object, or on
        the Tag you want to use as the starting point for your CSS
        selector.

        :param tag: All CSS selectors will use this as their starting
            point.

        :param api: A plug-in replacement for the soupsieve module,
            designed mainly for use in tests.

        :raise NotImplementedError: If `api` is None, which is the
            default when the soupsieve package could not be imported.
        """
        if api is None:
            raise NotImplementedError(
                "Cannot execute CSS selectors because the soupsieve package is not installed."
            )
        self.api = api
        self.tag = tag

    def escape(self, ident):
        """Escape a CSS identifier.

        This is a simple wrapper around soupsieve.escape(). See the
        documentation for that function for more information.

        :param ident: The identifier to escape.

        :return: Whatever the configured API's escape() returns.

        :raise NotImplementedError: If no selector API is configured.
        """
        # Check the API this object was actually given, not the
        # module-level soupsieve import: an injected replacement API
        # must keep working even when soupsieve is not installed.
        if self.api is None:
            raise NotImplementedError(
                "Cannot escape CSS identifiers because the soupsieve package is not installed."
            )
        return self.api.escape(ident)

    def _ns(self, ns, select):
        """Normalize a dictionary of namespaces.

        :param ns: The namespaces the caller passed in, or None.

        :param select: The selector: a string or a precompiled pattern.

        :return: `ns` if it was provided or if `select` is a
            precompiled pattern; otherwise the tag's own namespaces.
        """
        # A precompiled pattern already has a namespace context
        # compiled in, which cannot be replaced, so the tag's
        # namespaces are only substituted for non-compiled selectors.
        if not isinstance(select, self.api.SoupSieve) and ns is None:
            ns = self.tag._namespaces
        return ns

    def _rs(self, results):
        """Normalize a list of results to a ResultSet.

        A ResultSet is more consistent with the rest of Beautiful
        Soup's API, and ResultSet.__getattr__ has a helpful error
        message if you try to treat a list of results as a single
        result (a common mistake).

        :param results: The results to wrap.

        :return: A ResultSet containing `results`.
        """
        # Import here to avoid circular import
        from bs4.element import ResultSet
        return ResultSet(None, results)

    def compile(self, select, namespaces=None, flags=0, **kwargs):
        """Pre-compile a selector and return the compiled object.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will use the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.compile() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.compile() method.

        :return: A precompiled selector object.
        :rtype: soupsieve.SoupSieve
        """
        return self.api.compile(
            select, self._ns(namespaces, select), flags, **kwargs
        )

    def select_one(self, select, namespaces=None, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag and return the
        first result.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select_one()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will use the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select_one() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.select_one() method.

        :return: A Tag, or None if the selector has no match.
        :rtype: bs4.element.Tag
        """
        return self.api.select_one(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.select()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is treated the same as 0.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.select() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.select() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        if limit is None:
            limit = 0

        return self._rs(
            self.api.select(
                select, self.tag, self._ns(namespaces, select), limit, flags,
                **kwargs
            )
        )

    def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
        """Perform a CSS selection operation on the current Tag.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.iselect()
        method. It is the same as select(), but it returns a generator
        instead of a list.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param limit: After finding this number of results, stop looking.
            None is treated the same as 0.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.iselect() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.iselect() method.

        :return: A generator
        :rtype: types.GeneratorType
        """
        # Accept limit=None exactly as select() does, so the two
        # methods can be called interchangeably.
        if limit is None:
            limit = 0

        return self.api.iselect(
            select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
        )

    def closest(self, select, namespaces=None, flags=0, **kwargs):
        """Find the Tag closest to this one that matches the given selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.closest()
        method.

        :param select: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.closest() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.closest() method.

        :return: A Tag, or None if there is no match.
        :rtype: bs4.Tag
        """
        return self.api.closest(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def match(self, select, namespaces=None, flags=0, **kwargs):
        """Check whether this Tag matches the given CSS selector.

        This uses the Soup Sieve library. For more information, see
        that library's documentation for the soupsieve.match()
        method.

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.match() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.match() method.

        :return: True if this Tag matches the selector; False otherwise.
        :rtype: bool
        """
        return self.api.match(
            select, self.tag, self._ns(namespaces, select), flags, **kwargs
        )

    def filter(self, select, namespaces=None, flags=0, **kwargs):
        """Filter this Tag's direct children based on the given CSS selector.

        This uses the Soup Sieve library. It works the same way as
        passing this Tag into that library's soupsieve.filter()
        method. For more information, see the documentation for
        soupsieve.filter().

        :param select: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
            used in the CSS selector to namespace URIs. By default,
            Beautiful Soup will pass in the prefixes it encountered while
            parsing the document.

        :param flags: Flags to be passed into Soup Sieve's
            soupsieve.filter() method.

        :param kwargs: Keyword arguments to be passed into Soup Sieve's
            soupsieve.filter() method.

        :return: A ResultSet of Tag objects.
        :rtype: bs4.element.ResultSet
        """
        return self._rs(
            self.api.filter(
                select, self.tag, self._ns(namespaces, select), flags, **kwargs
            )
        )
|
|
@ -59,21 +59,6 @@ def diagnose(data):
|
|||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
|
||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
if os.path.exists(data):
|
||||
print(('"%s" looks like a filename. Reading data from the file.' % data))
|
||||
with open(data) as fp:
|
||||
data = fp.read()
|
||||
except ValueError:
|
||||
# This can happen on some platforms when the 'filename' is
|
||||
# too long. Assume it's data and not a filename.
|
||||
pass
|
||||
print("")
|
||||
|
||||
for parser in basic_parsers:
|
||||
print(("Trying to parse your markup with %s" % parser))
|
||||
|
|
|
@ -8,14 +8,8 @@ except ImportError as e:
|
|||
import re
|
||||
import sys
|
||||
import warnings
|
||||
try:
|
||||
import soupsieve
|
||||
except ImportError as e:
|
||||
soupsieve = None
|
||||
warnings.warn(
|
||||
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
||||
)
|
||||
|
||||
from bs4.css import CSS
|
||||
from bs4.formatter import (
|
||||
Formatter,
|
||||
HTMLFormatter,
|
||||
|
@ -154,6 +148,11 @@ class PageElement(object):
|
|||
NavigableString, Tag, etc. are all subclasses of PageElement.
|
||||
"""
|
||||
|
||||
# In general, we can't tell just by looking at an element whether
|
||||
# it's contained in an XML document or an HTML document. But for
|
||||
# Tags (q.v.) we can store this information at parse time.
|
||||
known_xml = None
|
||||
|
||||
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||
previous_sibling=None, next_sibling=None):
|
||||
"""Sets up the initial relations between this element and
|
||||
|
@ -941,11 +940,6 @@ class NavigableString(str, PageElement):
|
|||
PREFIX = ''
|
||||
SUFFIX = ''
|
||||
|
||||
# We can't tell just by looking at a string whether it's contained
|
||||
# in an XML document or an HTML document.
|
||||
|
||||
known_xml = None
|
||||
|
||||
def __new__(cls, value):
|
||||
"""Create a new NavigableString.
|
||||
|
||||
|
@ -961,12 +955,22 @@ class NavigableString(str, PageElement):
|
|||
u.setup()
|
||||
return u
|
||||
|
||||
def __copy__(self):
|
||||
def __deepcopy__(self, memo, recursive=False):
|
||||
"""A copy of a NavigableString has the same contents and class
|
||||
as the original, but it is not connected to the parse tree.
|
||||
|
||||
:param recursive: This parameter is ignored; it's only defined
|
||||
so that NavigableString.__deepcopy__ implements the same
|
||||
signature as Tag.__deepcopy__.
|
||||
"""
|
||||
return type(self)(self)
|
||||
|
||||
def __copy__(self):
|
||||
"""A copy of a NavigableString can only be a deep copy, because
|
||||
only one PageElement can occupy a given place in a parse tree.
|
||||
"""
|
||||
return self.__deepcopy__({})
|
||||
|
||||
def __getnewargs__(self):
|
||||
return (str(self),)
|
||||
|
||||
|
@ -1311,10 +1315,46 @@ class Tag(PageElement):
|
|||
|
||||
parserClass = _alias("parser_class") # BS3
|
||||
|
||||
def __copy__(self):
|
||||
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
|
||||
def __deepcopy__(self, memo, recursive=True):
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.

    Its contents are a copy of the old Tag's contents.

    :param memo: Passed through unchanged to each descendant's
        ``__deepcopy__``.
    :param recursive: If false, only this tag is cloned and the
        clone is returned without any contents.
    :return: The clone of this tag.
    """
    clone = self._clone()
    if not recursive:
        return clone

    # Clone this tag's descendants without making any recursive
    # function calls. The last entry of `open_clones` is the clone
    # that incoming elements are currently appended to.
    open_clones = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The tag that was just closed takes no more children.
            open_clones.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_clones[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # From now on, children belong to this newly opened tag.
            open_clones.append(copied)
    return clone
|
||||
|
||||
def __copy__(self):
|
||||
"""A copy of a Tag must always be a deep copy, because a Tag's
|
||||
children can only have one parent at a time.
|
||||
"""
|
||||
return self.__deepcopy__({})
|
||||
|
||||
def _clone(self):
|
||||
"""Create a new Tag just like this one, but with no
|
||||
contents and unattached to any parse tree.
|
||||
|
||||
This is the first step in the deepcopy process.
|
||||
"""
|
||||
clone = type(self)(
|
||||
None, self.builder, self.name, self.namespace,
|
||||
self.prefix, self.attrs, is_xml=self._is_xml,
|
||||
|
@ -1326,8 +1366,6 @@ class Tag(PageElement):
|
|||
)
|
||||
for attr in ('can_be_empty_element', 'hidden'):
|
||||
setattr(clone, attr, getattr(self, attr))
|
||||
for child in self.contents:
|
||||
clone.append(child.__copy__())
|
||||
return clone
|
||||
|
||||
@property
|
||||
|
@ -1650,28 +1688,178 @@ class Tag(PageElement):
|
|||
|
||||
def decode(self, indent_level=None,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Render a Unicode representation of this PageElement and its
|
||||
contents.
|
||||
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces. Used internally in
|
||||
recursive calls while pretty-printing.
|
||||
:param eventual_encoding: The tag is destined to be
|
||||
encoded into this encoding. This method is _not_
|
||||
responsible for performing that encoding. This information
|
||||
is passed in so that it can be substituted in if the
|
||||
document contains a <META> tag that mentions the document's
|
||||
encoding.
|
||||
:param formatter: A Formatter object, or a string naming one of
|
||||
the standard formatters.
|
||||
"""
|
||||
|
||||
formatter="minimal",
|
||||
iterator=None):
|
||||
pieces = []
|
||||
# First off, turn a non-Formatter `formatter` into a Formatter
|
||||
# object. This will stop the lookup from happening over and
|
||||
# over again.
|
||||
if not isinstance(formatter, Formatter):
|
||||
formatter = self.formatter_for_name(formatter)
|
||||
|
||||
if indent_level is True:
|
||||
indent_level = 0
|
||||
|
||||
# The currently active tag that put us into string literal
|
||||
# mode. Until this element is closed, children will be treated
|
||||
# as string literals and not pretty-printed. String literal
|
||||
# mode is turned on immediately after this tag begins, and
|
||||
# turned off immediately before it's closed. This means there
|
||||
# will be whitespace before and after the tag itself.
|
||||
string_literal_tag = None
|
||||
|
||||
for event, element in self._event_stream(iterator):
|
||||
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
|
||||
piece = element._format_tag(
|
||||
eventual_encoding, formatter, opening=True
|
||||
)
|
||||
elif event is Tag.END_ELEMENT_EVENT:
|
||||
piece = element._format_tag(
|
||||
eventual_encoding, formatter, opening=False
|
||||
)
|
||||
if indent_level is not None:
|
||||
indent_level -= 1
|
||||
else:
|
||||
piece = element.output_ready(formatter)
|
||||
|
||||
# Now we need to apply the 'prettiness' -- extra
|
||||
# whitespace before and/or after this tag. This can get
|
||||
# complicated because certain tags, like <pre> and
|
||||
# <script>, can't be prettified, since adding whitespace would
|
||||
# change the meaning of the content.
|
||||
|
||||
# The default behavior is to add whitespace before and
|
||||
# after an element when string literal mode is off, and to
|
||||
# leave things as they are when string literal mode is on.
|
||||
if string_literal_tag:
|
||||
indent_before = indent_after = False
|
||||
else:
|
||||
indent_before = indent_after = True
|
||||
|
||||
# The only time the behavior is more complex than that is
|
||||
# when we encounter an opening or closing tag that might
|
||||
# put us into or out of string literal mode.
|
||||
if (event is Tag.START_ELEMENT_EVENT
|
||||
and not string_literal_tag
|
||||
and not element._should_pretty_print()):
|
||||
# We are about to enter string literal mode. Add
|
||||
# whitespace before this tag, but not after. We
|
||||
# will stay in string literal mode until this tag
|
||||
# is closed.
|
||||
indent_before = True
|
||||
indent_after = False
|
||||
string_literal_tag = element
|
||||
elif (event is Tag.END_ELEMENT_EVENT
|
||||
and element is string_literal_tag):
|
||||
# We are about to exit string literal mode by closing
|
||||
# the tag that sent us into that mode. Add whitespace
|
||||
# after this tag, but not before.
|
||||
indent_before = False
|
||||
indent_after = True
|
||||
string_literal_tag = None
|
||||
|
||||
# Now we know whether to add whitespace before and/or
|
||||
# after this element.
|
||||
if indent_level is not None:
|
||||
if (indent_before or indent_after):
|
||||
if isinstance(element, NavigableString):
|
||||
piece = piece.strip()
|
||||
if piece:
|
||||
piece = self._indent_string(
|
||||
piece, indent_level, formatter,
|
||||
indent_before, indent_after
|
||||
)
|
||||
if event == Tag.START_ELEMENT_EVENT:
|
||||
indent_level += 1
|
||||
pieces.append(piece)
|
||||
return "".join(pieces)
|
||||
|
||||
# Names for the different events yielded by _event_stream
|
||||
START_ELEMENT_EVENT = object()
|
||||
END_ELEMENT_EVENT = object()
|
||||
EMPTY_ELEMENT_EVENT = object()
|
||||
STRING_ELEMENT_EVENT = object()
|
||||
|
||||
def _event_stream(self, iterator=None):
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. If None, ``self.self_and_descendants`` is used.

    :yield: ``(event, element)`` 2-tuples, where ``event`` is one of
        ``Tag.START_ELEMENT_EVENT``, ``Tag.END_ELEMENT_EVENT``,
        ``Tag.EMPTY_ELEMENT_EVENT`` or ``Tag.STRING_ELEMENT_EVENT``.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack = []

    # Test against None rather than truthiness so that an explicitly
    # supplied empty iterable is honored instead of being replaced.
    if iterator is None:
        iterator = self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # Compare by identity: the question is whether that exact
        # tag object is the parent, not whether some tag that merely
        # compares equal to it is.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # Close whatever is still open once the iterator is exhausted.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
|
||||
|
||||
def _indent_string(self, s, indent_level, formatter,
|
||||
indent_before, indent_after):
|
||||
"""Add indentation whitespace before and/or after a string.
|
||||
|
||||
:param s: The string to amend with whitespace.
|
||||
:param indent_level: The indentation level; affects how much
|
||||
whitespace goes before the string.
|
||||
:param indent_before: Whether or not to add whitespace
|
||||
before the string.
|
||||
:param indent_after: Whether or not to add whitespace
|
||||
(a newline) after the string.
|
||||
"""
|
||||
space_before = ''
|
||||
if indent_before and indent_level:
|
||||
space_before = (formatter.indent * indent_level)
|
||||
|
||||
space_after = ''
|
||||
if indent_after:
|
||||
space_after = "\n"
|
||||
|
||||
return space_before + s + space_after
|
||||
|
||||
def _format_tag(self, eventual_encoding, formatter, opening):
|
||||
# A tag starts with the < character (see below).
|
||||
|
||||
# Then the / character, if this is a closing tag.
|
||||
closing_slash = ''
|
||||
if not opening:
|
||||
closing_slash = '/'
|
||||
|
||||
# Then an optional namespace prefix.
|
||||
prefix = ''
|
||||
if self.prefix:
|
||||
prefix = self.prefix + ":"
|
||||
|
||||
# Then a list of attribute values, if this is an opening tag.
|
||||
attribute_string = ''
|
||||
if opening:
|
||||
attributes = formatter.attributes(self)
|
||||
attrs = []
|
||||
for key, val in attributes:
|
||||
|
@ -1693,63 +1881,19 @@ class Tag(PageElement):
|
|||
str(key) + '='
|
||||
+ formatter.quoted_attribute_value(text))
|
||||
attrs.append(decoded)
|
||||
close = ''
|
||||
closeTag = ''
|
||||
|
||||
prefix = ''
|
||||
if self.prefix:
|
||||
prefix = self.prefix + ":"
|
||||
|
||||
if self.is_empty_element:
|
||||
close = formatter.void_element_close_prefix or ''
|
||||
else:
|
||||
closeTag = '</%s%s>' % (prefix, self.name)
|
||||
|
||||
pretty_print = self._should_pretty_print(indent_level)
|
||||
space = ''
|
||||
indent_space = ''
|
||||
if indent_level is not None:
|
||||
indent_space = (formatter.indent * (indent_level - 1))
|
||||
if pretty_print:
|
||||
space = indent_space
|
||||
indent_contents = indent_level + 1
|
||||
else:
|
||||
indent_contents = None
|
||||
contents = self.decode_contents(
|
||||
indent_contents, eventual_encoding, formatter
|
||||
)
|
||||
|
||||
if self.hidden:
|
||||
# This is the 'document root' object.
|
||||
s = contents
|
||||
else:
|
||||
s = []
|
||||
attribute_string = ''
|
||||
if attrs:
|
||||
attribute_string = ' ' + ' '.join(attrs)
|
||||
if indent_level is not None:
|
||||
# Even if this particular tag is not pretty-printed,
|
||||
# we should indent up to the start of the tag.
|
||||
s.append(indent_space)
|
||||
s.append('<%s%s%s%s>' % (
|
||||
prefix, self.name, attribute_string, close))
|
||||
if pretty_print:
|
||||
s.append("\n")
|
||||
s.append(contents)
|
||||
if pretty_print and contents and contents[-1] != "\n":
|
||||
s.append("\n")
|
||||
if pretty_print and closeTag:
|
||||
s.append(space)
|
||||
s.append(closeTag)
|
||||
if indent_level is not None and closeTag and self.next_sibling:
|
||||
# Even if this particular tag is not pretty-printed,
|
||||
# we're now done with the tag, and we should add a
|
||||
# newline if appropriate.
|
||||
s.append("\n")
|
||||
s = ''.join(s)
|
||||
return s
|
||||
|
||||
def _should_pretty_print(self, indent_level):
|
||||
# Then an optional closing slash (for a void element in an
|
||||
# XML document).
|
||||
void_element_closing_slash = ''
|
||||
if self.is_empty_element:
|
||||
void_element_closing_slash = formatter.void_element_close_prefix or ''
|
||||
|
||||
# Put it all together.
|
||||
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
|
||||
|
||||
def _should_pretty_print(self, indent_level=1):
|
||||
"""Should this tag be pretty-printed?
|
||||
|
||||
Most of them should, but some (such as <pre> in HTML
|
||||
|
@ -1800,32 +1944,8 @@ class Tag(PageElement):
|
|||
the standard Formatters.
|
||||
|
||||
"""
|
||||
# First off, turn a string formatter into a Formatter object. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not isinstance(formatter, Formatter):
|
||||
formatter = self.formatter_for_name(formatter)
|
||||
|
||||
pretty_print = (indent_level is not None)
|
||||
s = []
|
||||
for c in self:
|
||||
text = None
|
||||
if isinstance(c, NavigableString):
|
||||
text = c.output_ready(formatter)
|
||||
elif isinstance(c, Tag):
|
||||
s.append(c.decode(indent_level, eventual_encoding,
|
||||
formatter))
|
||||
preserve_whitespace = (
|
||||
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
|
||||
)
|
||||
if text and indent_level and not preserve_whitespace:
|
||||
text = text.strip()
|
||||
if text:
|
||||
if pretty_print and not preserve_whitespace:
|
||||
s.append(formatter.indent * (indent_level - 1))
|
||||
s.append(text)
|
||||
if pretty_print and not preserve_whitespace:
|
||||
s.append("\n")
|
||||
return ''.join(s)
|
||||
return self.decode(indent_level, eventual_encoding, formatter,
|
||||
iterator=self.descendants)
|
||||
|
||||
def encode_contents(
|
||||
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
|
@ -1922,6 +2042,18 @@ class Tag(PageElement):
|
|||
# return iter() to make the purpose of the method clear
|
||||
return iter(self.contents) # XXX This seems to be untested.
|
||||
|
||||
@property
def self_and_descendants(self):
    """Yield this element, then everything from ``self.descendants``.

    The element itself is left out when ``self.hidden`` is true. The
    descendants are yielded in whatever order ``self.descendants``
    produces them.

    :yield: A sequence of PageElements.
    """
    if self.hidden:
        # A hidden element only reports what is underneath it.
        yield from self.descendants
        return
    yield self
    yield from self.descendants
|
||||
|
||||
@property
|
||||
def descendants(self):
|
||||
"""Iterate over all children of this PageElement in a
|
||||
|
@ -1948,16 +2080,13 @@ class Tag(PageElement):
|
|||
Beautiful Soup will use the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
soupsieve.select() method.
|
||||
|
||||
:return: A Tag.
|
||||
:rtype: bs4.element.Tag
|
||||
"""
|
||||
value = self.select(selector, namespaces, 1, **kwargs)
|
||||
if value:
|
||||
return value[0]
|
||||
return None
|
||||
return self.css.select_one(selector, namespaces, **kwargs)
|
||||
|
||||
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
||||
"""Perform a CSS selection operation on the current element.
|
||||
|
@ -1979,21 +2108,12 @@ class Tag(PageElement):
|
|||
:return: A ResultSet of Tags.
|
||||
:rtype: bs4.element.ResultSet
|
||||
"""
|
||||
if namespaces is None:
|
||||
namespaces = self._namespaces
|
||||
return self.css.select(selector, namespaces, limit, **kwargs)
|
||||
|
||||
if limit is None:
|
||||
limit = 0
|
||||
if soupsieve is None:
|
||||
raise NotImplementedError(
|
||||
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
||||
)
|
||||
|
||||
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
|
||||
|
||||
# We do this because it's more consistent and because
|
||||
# ResultSet.__getattr__ has a helpful error message.
|
||||
return ResultSet(None, results)
|
||||
@property
def css(self):
    """Expose the CSS selector API for this element.

    :return: A new ``CSS`` object wrapping this element.
    """
    selector_api = CSS(self)
    return selector_api
|
||||
|
||||
# Old names for backwards compatibility
|
||||
def childGenerator(self):
|
||||
|
|
|
@ -298,37 +298,11 @@ class TreeBuilderSmokeTest(object):
|
|||
)
|
||||
assert soup.a['class'] == ['a', 'b', 'c']
|
||||
|
||||
def test_fuzzed_input(self):
|
||||
# This test centralizes in one place the various fuzz tests
|
||||
# for Beautiful Soup created by the oss-fuzz project.
|
||||
|
||||
# These strings superficially resemble markup, but they
|
||||
# generally can't be parsed into anything. The best we can
|
||||
# hope for is that parsing these strings won't crash the
|
||||
# parser.
|
||||
#
|
||||
# n.b. This markup is commented out because these fuzz tests
|
||||
# _do_ crash the parser. However the crashes are due to bugs
|
||||
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
|
||||
# bugs!
|
||||
|
||||
bad_markup = [
|
||||
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
|
||||
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
|
||||
# https://bugs.python.org/issue37747
|
||||
#
|
||||
#b'\n<![\xff\xfe\xfe\xcd\x00',
|
||||
|
||||
#https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
|
||||
# https://bugs.python.org/issue34480
|
||||
#
|
||||
#b'<![n\x00'
|
||||
]
|
||||
for markup in bad_markup:
|
||||
with warnings.catch_warnings(record=False):
|
||||
def test_invalid_doctype(self):
|
||||
markup = '<![if word]>content<![endif]>'
|
||||
markup = '<!DOCTYPE html]ff>'
|
||||
soup = self.soup(markup)
|
||||
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
|
||||
|
||||
"""A basic test of a treebuilder's competence.
|
||||
|
@ -577,8 +551,8 @@ Hello, world!
|
|||
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||
even if that would mean not prettifying the markup.
|
||||
"""
|
||||
pre_markup = "<pre> </pre>"
|
||||
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
||||
pre_markup = "<pre>a z</pre>\n"
|
||||
textarea_markup = "<textarea> woo\nwoo </textarea>\n"
|
||||
self.assert_soup(pre_markup)
|
||||
self.assert_soup(textarea_markup)
|
||||
|
||||
|
@ -589,7 +563,7 @@ Hello, world!
|
|||
assert soup.textarea.prettify() == textarea_markup
|
||||
|
||||
soup = self.soup("<textarea></textarea>")
|
||||
assert soup.textarea.prettify() == "<textarea></textarea>"
|
||||
assert soup.textarea.prettify() == "<textarea></textarea>\n"
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
˙<!DOCTyPEV PUBLIC'''Đ'
|
|
@ -0,0 +1 @@
|
|||
)<a><math><TR><a><mI><a><p><a>
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
|||
|
||||
<![
|
|
@ -0,0 +1 @@
|
|||
-<math><sElect><mi><sElect><sElect>
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
|||
ñ<table><svg><html>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
487
lib/bs4/tests/test_css.py
Normal file
487
lib/bs4/tests/test_css.py
Normal file
|
@ -0,0 +1,487 @@
|
|||
import pytest
|
||||
import types
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from bs4 import (
|
||||
CSS,
|
||||
BeautifulSoup,
|
||||
ResultSet,
|
||||
)
|
||||
|
||||
from . import (
|
||||
SoupTest,
|
||||
SOUP_SIEVE_PRESENT,
|
||||
)
|
||||
|
||||
if SOUP_SIEVE_PRESENT:
|
||||
from soupsieve import SelectorSyntaxError
|
||||
|
||||
|
||||
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
    """Test basic CSS selector functionality.

    This functionality is implemented in soupsieve, which has a much
    more comprehensive test suite, so this is basically an extra check
    that soupsieve works as expected.
    """

    # Shared fixture document; every test selects against a fresh parse
    # of this markup (see setup_method).
    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setup_method(self):
        """Parse the fixture document with html.parser before each test."""
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assert_selects(self, selector, expected_ids, **kwargs):
        """Assert that `selector` matches exactly the tags with `expected_ids`.

        :param selector: A CSS selector passed to ``self.soup.select()``.
        :param expected_ids: The ``id`` values the matched tags should
            have; order does not matter.
        :param kwargs: Extra keyword arguments for ``select()``.
        """
        results = self.soup.select(selector, **kwargs)
        assert isinstance(results, ResultSet)
        el_ids = sorted(el['id'] for el in results)
        # Compare sorted copies so the caller's list is not reordered.
        expected = sorted(expected_ids)
        assert expected == el_ids, "Selector %s, expected [%s], got [%s]" % (
            selector, ', '.join(expected), ', '.join(el_ids)
        )

    assertSelect = assert_selects

    def assert_select_multiple(self, *tests):
        """Run assert_selects for each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assert_selects(selector, expected_ids)

    def test_precompiled(self):
        sel = self.soup.css.compile('div')

        els = self.soup.select(sel)
        assert len(els) == 4
        for div in els:
            assert div.name == 'div'

        el = self.soup.select_one(sel)
        assert 'main' == el['id']

    def test_one_tag_one(self):
        els = self.soup.select('title')
        assert len(els) == 1
        assert els[0].name == 'title'
        assert els[0].contents == ['The title']

    def test_one_tag_many(self):
        els = self.soup.select('div')
        assert len(els) == 4
        for div in els:
            assert div.name == 'div'

        el = self.soup.select_one('div')
        assert 'main' == el['id']

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        assert match is None

    def test_tag_in_tag_one(self):
        self.assert_selects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_limit(self):
        self.assert_selects('html div', ['main'], limit=1)
        self.assert_selects('html body div', ['inner', 'main'], limit=2)
        self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
                            limit=10)

    def test_tag_no_match(self):
        assert len(self.soup.select('del')) == 0

    def test_invalid_tag(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('tag%t')

    def test_select_dashed_tag_ids(self):
        self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
        assert dashed[0].name == 'custom-dashed-tag'
        assert dashed[0]['id'] == 'dash2'

    def test_dashed_tag_text(self):
        assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'

    def test_select_dashed_matches_find_all(self):
        assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')

    def test_header_tags(self):
        self.assert_select_multiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            assert len(els) == 1
            assert els[0].name == 'p'
            assert els[0]['class'] == ['onep']

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        assert len(els) == 0

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assert_selects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        assert len(els) == 0

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        assert len(els) == 3
        for el in els:
            assert el.name == 'p'
        assert els[1]['class'] == ['onep']
        assert not els[0].has_attr('class')

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            assert len(self.soup.select(selector)) == 0

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
                         '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assert_selects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assert_selects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
        self.assert_selects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assert_select_multiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assert_select_multiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assert_select_multiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        self.assert_select_multiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assert_select_multiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
        self.assert_select_multiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assert_select_multiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_quoted_space_in_selector_name(self):
        html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
        soup = BeautifulSoup(html, 'html.parser')
        [chosen] = soup.select('div[style="display: right"]')
        assert "yes" == chosen.string

    def test_unsupported_pseudoclass(self):
        with pytest.raises(NotImplementedError):
            self.soup.select("a:no-such-pseudoclass")

        with pytest.raises(SelectorSyntaxError):
            self.soup.select("a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        assert len(els) == 1
        assert els[0].string == 'Another'

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        assert len(els) == 0

        # Zero will select no tags.
        els = self.soup.select('div p:nth-of-type(0)')
        assert len(els) == 0

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

    def test_id_child_selector_nth_of_type(self):
        self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        # NOTE(review): assert_selects_ids is not defined in this class;
        # presumably inherited from SoupTest -- confirm.
        self.assert_selects_ids(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assert_selects(".fancy #inner", ['inner'])
        self.assert_selects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assert_selects('#p1 + h2', ['header2'])
        self.assert_selects('#p1 + h2 + p', ['pmulti'])
        self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
        assert [] == self.soup.select('#p1 + p')

    def test_general_sibling_selector(self):
        self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
        self.assert_selects('#p1 ~ #header2', ['header2'])
        self.assert_selects('#p1 ~ h2 + a', ['me'])
        self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
        assert [] == self.soup.select('#inner ~ h2')

    def test_dangling_combinator(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assert_selects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assert_selects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        self.assert_selects('x, y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assert_selects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assert_selects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select(',x, y')
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('x,,y')

    def test_multiple_select_attrs(self):
        self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])

    def test_select_duplicate_elements(self):
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, 'html.parser')
        selected = soup.select(".c1, .c2")
        assert 3 == len(selected)

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=['c1', 'c2']):
            assert element in selected

    def test_closest(self):
        inner = self.soup.find("div", id="inner")
        closest = inner.css.closest("div[id=main]")
        assert closest == self.soup.find("div", id="main")

    def test_match(self):
        inner = self.soup.find("div", id="inner")
        main = self.soup.find("div", id="main")
        assert not inner.css.match("div[id=main]")
        assert main.css.match("div[id=main]")

    def test_iselect(self):
        gen = self.soup.css.iselect("h2")
        assert isinstance(gen, types.GeneratorType)
        [header2, header3] = gen
        assert header2['id'] == 'header2'
        assert header3['id'] == 'header3'

    def test_filter(self):
        inner = self.soup.find("div", id="inner")
        results = inner.css.filter("h2")
        assert len(results) == 2

        results = inner.css.filter("h2[id=header3]")
        assert isinstance(results, ResultSet)
        [result] = results
        assert result['id'] == 'header3'

    def test_escape(self):
        m = self.soup.css.escape
        assert m(".foo#bar") == '\\.foo\\#bar'
        assert m("()[]{}") == '\\(\\)\\[\\]\\{\\}'
        assert m(".foo") == self.soup.css.escape(".foo")
|
|
@ -80,20 +80,20 @@ class TestFormatter(SoupTest):
|
|||
@pytest.mark.parametrize(
|
||||
"indent,expect",
|
||||
[
|
||||
(None, '<a>\n<b>\ntext\n</b>\n</a>'),
|
||||
(-1, '<a>\n<b>\ntext\n</b>\n</a>'),
|
||||
(0, '<a>\n<b>\ntext\n</b>\n</a>'),
|
||||
("", '<a>\n<b>\ntext\n</b>\n</a>'),
|
||||
(None, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||
(-1, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||
(0, '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||
("", '<a>\n<b>\ntext\n</b>\n</a>\n'),
|
||||
|
||||
(1, '<a>\n <b>\n text\n </b>\n</a>'),
|
||||
(2, '<a>\n <b>\n text\n </b>\n</a>'),
|
||||
(1, '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||
(2, '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||
|
||||
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
|
||||
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),
|
||||
("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>\n'),
|
||||
('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>\n'),
|
||||
|
||||
# Some invalid inputs -- the default behavior is used.
|
||||
(object(), '<a>\n <b>\n text\n </b>\n</a>'),
|
||||
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>'),
|
||||
(object(), '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||
(b'bytes', '<a>\n <b>\n text\n </b>\n</a>\n'),
|
||||
]
|
||||
)
|
||||
def test_indent(self, indent, expect):
|
||||
|
|
91
lib/bs4/tests/test_fuzz.py
Normal file
91
lib/bs4/tests/test_fuzz.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
"""This file contains test cases reported by third parties using
|
||||
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
|
||||
represent real problems with Beautiful Soup, but many are problems in
|
||||
libraries that Beautiful Soup depends on, and many of the test cases
|
||||
represent different ways of triggering the same problem.
|
||||
|
||||
Grouping these test cases together makes it easy to see which test
|
||||
cases represent the same problem, and puts the test cases in close
|
||||
proximity to code that can trigger the problems.
|
||||
"""
|
||||
import os
|
||||
import pytest
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
ParserRejectedMarkup,
|
||||
)
|
||||
|
||||
class TestFuzz(object):
    """Regression tests driven by markup files produced by fuzzers.

    Each parametrized filename names a file in the ``fuzz`` directory
    next to this module; see ``__markup`` for how it is loaded.
    """

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
        ]
    )
    def test_rejected_markup(self, filename):
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, 'html.parser')

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
        ]
    )
    def test_deeply_nested_document(self, filename):
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, 'html.parser').encode()

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    @pytest.mark.skip("html5lib problems")
    @pytest.mark.parametrize(
        "filename", [
            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",

            # b')<a><math><TR><a><mI><a><p><a>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",

            # b'-<math><sElect><mi><sElect><sElect>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",

            # b'ñ<table><svg><html>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",

            # <TABLE>, some ^@ characters, some <math> tags.
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",

            # Nested table
            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
        ]
    )
    def test_html5lib_parse_errors(self, filename):
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, 'html5lib').encode())

    def __markup(self, filename):
        """Return the raw bytes of a fuzz test case.

        :param filename: Name of a file in the ``fuzz`` directory beside
            this module; ``TESTCASE_SUFFIX`` is appended when missing.
        :return: The file's contents as bytes.
        """
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
        this_dir = os.path.split(__file__)[0]
        path = os.path.join(this_dir, 'fuzz', filename)
        # Close the handle deterministically instead of leaving it to
        # the garbage collector.
        with open(path, 'rb') as testcase:
            return testcase.read()
|
|
@ -3,9 +3,11 @@ trees."""
|
|||
|
||||
from pdb import set_trace
|
||||
import pickle
|
||||
import pytest
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
HTMLParserTreeBuilder,
|
||||
ParserRejectedMarkup,
|
||||
XMLParsedAsHTMLWarning,
|
||||
)
|
||||
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||
|
@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
|
|||
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
def test_rejected_input(self):
    """Each markup sample below must raise ParserRejectedMarkup."""
    # Python's html.parser will occasionally reject markup,
    # especially when there is a problem with the initial DOCTYPE
    # declaration. Different versions of Python sound the alarm in
    # different ways, but Beautiful Soup consistently raises
    # errors as ParserRejectedMarkup exceptions.
    rejected_samples = (
        # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
        # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
        # https://github.com/python/cpython/issues/81928
        b'\n<![\xff\xfe\xfe\xcd\x00',

        # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
        # https://github.com/python/cpython/issues/78661
        b'<![n\x00',
        b"<![UNKNOWN[]]>",
    )
    for sample in rejected_samples:
        with pytest.raises(ParserRejectedMarkup):
            self.soup(sample)
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
|
|
@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
|
|||
assert soup.find('prefix:tag3').name == 'tag3'
|
||||
assert soup.subtag.find('prefix:tag3').name == 'tag3'
|
||||
|
||||
def test_pickle_removes_builder(self):
|
||||
# The lxml TreeBuilder is not picklable, so it won't be
|
||||
# preserved in a pickle/unpickle operation.
|
||||
|
||||
def test_pickle_restores_builder(self):
|
||||
# The lxml TreeBuilder is not picklable, so when unpickling
|
||||
# a document created with it, a new TreeBuilder of the
|
||||
# appropriate class is created.
|
||||
soup = self.soup("<a>some markup</a>")
|
||||
assert isinstance(soup.builder, self.default_builder)
|
||||
pickled = pickle.dumps(soup)
|
||||
unpickled = pickle.loads(pickled)
|
||||
|
||||
assert "some markup" == unpickled.a.string
|
||||
assert unpickled.builder is None
|
||||
assert unpickled.builder != soup.builder
|
||||
assert isinstance(unpickled.builder, self.default_builder)
|
||||
|
|
|
@ -2,20 +2,18 @@
|
|||
import copy
|
||||
import pickle
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
)
|
||||
from . import (
|
||||
SoupTest,
|
||||
SOUP_SIEVE_PRESENT,
|
||||
)
|
||||
|
||||
if SOUP_SIEVE_PRESENT:
|
||||
from soupsieve import SelectorSyntaxError
|
||||
|
||||
class TestEncoding(SoupTest):
|
||||
"""Test the ability to encode objects into strings."""
|
||||
|
||||
|
@ -52,9 +50,20 @@ class TestEncoding(SoupTest):
|
|||
encoding="utf8"
|
||||
)
|
||||
|
||||
def test_encode_deeply_nested_document(self):
|
||||
# This test verifies that encoding a string doesn't involve
|
||||
# any recursive function calls. If it did, this test would
|
||||
# overflow the Python interpreter stack.
|
||||
limit = sys.getrecursionlimit() + 1
|
||||
markup = "<span>" * limit
|
||||
soup = self.soup(markup)
|
||||
encoded = soup.encode()
|
||||
assert limit == encoded.count(b"<span>")
|
||||
|
||||
def test_deprecated_renderContents(self):
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
soup.renderContents()
|
||||
assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
|
||||
|
||||
def test_repr(self):
|
||||
|
@ -159,7 +168,31 @@ class TestFormatters(SoupTest):
|
|||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
|
||||
# Everything outside the <pre> tag is reformatted, but everything
|
||||
# inside is left alone.
|
||||
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
|
||||
assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>\n' == soup.div.prettify()
|
||||
|
||||
def test_prettify_handles_nested_string_literal_tags(self):
|
||||
# Most of this markup is inside a <pre> tag, so prettify()
|
||||
# only does three things to it:
|
||||
# 1. Add a newline and a space between the <div> and the <pre>
|
||||
# 2. Add a newline after the </pre>
|
||||
# 3. Add a newline at the end.
|
||||
#
|
||||
# The contents of the <pre> tag are left completely alone. In
|
||||
# particular, we don't start adding whitespace again once we
|
||||
# encounter the first </pre> tag, because we know it's not
|
||||
# the one that put us into string literal mode.
|
||||
markup = """<div><pre><code>some
|
||||
<script><pre>code</pre></script> for you
|
||||
</code></pre></div>"""
|
||||
|
||||
expect = """<div>
|
||||
<pre><code>some
|
||||
<script><pre>code</pre></script> for you
|
||||
</code></pre>
|
||||
</div>
|
||||
"""
|
||||
soup = self.soup(markup)
|
||||
assert expect == soup.div.prettify()
|
||||
|
||||
def test_prettify_accepts_formatter_function(self):
|
||||
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
||||
|
@ -216,429 +249,6 @@ class TestFormatters(SoupTest):
|
|||
assert soup.contents[0].name == 'pre'
|
||||
|
||||
|
||||
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
|
||||
class TestCSSSelectors(SoupTest):
|
||||
"""Test basic CSS selector functionality.
|
||||
|
||||
This functionality is implemented in soupsieve, which has a much
|
||||
more comprehensive test suite, so this is basically an extra check
|
||||
that soupsieve works as expected.
|
||||
"""
|
||||
|
||||
HTML = """
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>The title</title>
|
||||
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
|
||||
</head>
|
||||
<body>
|
||||
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
|
||||
<div id="main" class="fancy">
|
||||
<div id="inner">
|
||||
<h1 id="header1">An H1</h1>
|
||||
<p>Some text</p>
|
||||
<p class="onep" id="p1">Some more text</p>
|
||||
<h2 id="header2">An H2</h2>
|
||||
<p class="class1 class2 class3" id="pmulti">Another</p>
|
||||
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
|
||||
<h2 id="header3">Another H2</h2>
|
||||
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
|
||||
<span class="s1">
|
||||
<a href="#" id="s1a1">span1a1</a>
|
||||
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
|
||||
<span class="span2">
|
||||
<a href="#" id="s2a1">span2a1</a>
|
||||
</span>
|
||||
<span class="span3"></span>
|
||||
<custom-dashed-tag class="dashed" id="dash2"/>
|
||||
<div data-tag="dashedvalue" id="data1"/>
|
||||
</span>
|
||||
</div>
|
||||
<x id="xid">
|
||||
<z id="zida"/>
|
||||
<z id="zidab"/>
|
||||
<z id="zidac"/>
|
||||
</x>
|
||||
<y id="yid">
|
||||
<z id="zidb"/>
|
||||
</y>
|
||||
<p lang="en" id="lang-en">English</p>
|
||||
<p lang="en-gb" id="lang-en-gb">English UK</p>
|
||||
<p lang="en-us" id="lang-en-us">English US</p>
|
||||
<p lang="fr" id="lang-fr">French</p>
|
||||
</div>
|
||||
|
||||
<div id="footer">
|
||||
</div>
|
||||
"""
|
||||
|
||||
def setup_method(self):
|
||||
self.soup = BeautifulSoup(self.HTML, 'html.parser')
|
||||
|
||||
def assert_selects(self, selector, expected_ids, **kwargs):
    """Assert that a CSS selector matches exactly the expected tags.

    The ``id`` attribute of every tag returned by
    ``self.soup.select(selector, **kwargs)`` is collected and compared,
    ignoring order, against ``expected_ids``.

    :param selector: CSS selector string handed to ``select()``.
    :param expected_ids: Iterable of ``id`` values that should match.
        It is not modified.
    :param kwargs: Extra keyword arguments forwarded to ``select()``.
    :raises AssertionError: If the matched ids differ from the
        expected ids.
    """
    found_ids = sorted(
        el['id'] for el in self.soup.select(selector, **kwargs)
    )
    # Sort a copy rather than calling .sort() on the argument, so the
    # caller's list is left exactly as it was passed in.
    wanted_ids = sorted(expected_ids)
    assert wanted_ids == found_ids, "Selector %s, expected [%s], got [%s]" % (
        selector, ', '.join(wanted_ids), ', '.join(found_ids)
    )
|
||||
|
||||
assertSelect = assert_selects
|
||||
|
||||
def assert_select_multiple(self, *tests):
|
||||
for selector, expected_ids in tests:
|
||||
self.assert_selects(selector, expected_ids)
|
||||
|
||||
def test_one_tag_one(self):
|
||||
els = self.soup.select('title')
|
||||
assert len(els) == 1
|
||||
assert els[0].name == 'title'
|
||||
assert els[0].contents == ['The title']
|
||||
|
||||
def test_one_tag_many(self):
|
||||
els = self.soup.select('div')
|
||||
assert len(els) == 4
|
||||
for div in els:
|
||||
assert div.name == 'div'
|
||||
|
||||
el = self.soup.select_one('div')
|
||||
assert 'main' == el['id']
|
||||
|
||||
def test_select_one_returns_none_if_no_match(self):
    """select_one() returns None when no tag matches the selector."""
    match = self.soup.select_one('nonexistenttag')
    # None is a singleton: compare by identity so that an object with a
    # permissive __eq__ cannot make this assertion pass by accident.
    assert match is None
|
||||
|
||||
|
||||
def test_tag_in_tag_one(self):
    """A descendant selector finds <div> tags nested inside a <div>."""
    # assert_selects() runs the select itself; a separate, unused
    # select() call beforehand added nothing to the check.
    self.assert_selects('div div', ['inner', 'data1'])
|
||||
|
||||
def test_tag_in_tag_many(self):
|
||||
for selector in ('html div', 'html body div', 'body div'):
|
||||
self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
|
||||
|
||||
|
||||
def test_limit(self):
|
||||
self.assert_selects('html div', ['main'], limit=1)
|
||||
self.assert_selects('html body div', ['inner', 'main'], limit=2)
|
||||
self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
|
||||
limit=10)
|
||||
|
||||
def test_tag_no_match(self):
|
||||
assert len(self.soup.select('del')) == 0
|
||||
|
||||
def test_invalid_tag(self):
|
||||
with pytest.raises(SelectorSyntaxError):
|
||||
self.soup.select('tag%t')
|
||||
|
||||
def test_select_dashed_tag_ids(self):
|
||||
self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])
|
||||
|
||||
def test_select_dashed_by_id(self):
|
||||
dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
|
||||
assert dashed[0].name == 'custom-dashed-tag'
|
||||
assert dashed[0]['id'] == 'dash2'
|
||||
|
||||
def test_dashed_tag_text(self):
|
||||
assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'
|
||||
|
||||
def test_select_dashed_matches_find_all(self):
|
||||
assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')
|
||||
|
||||
def test_header_tags(self):
|
||||
self.assert_select_multiple(
|
||||
('h1', ['header1']),
|
||||
('h2', ['header2', 'header3']),
|
||||
)
|
||||
|
||||
def test_class_one(self):
|
||||
for selector in ('.onep', 'p.onep', 'html p.onep'):
|
||||
els = self.soup.select(selector)
|
||||
assert len(els) == 1
|
||||
assert els[0].name == 'p'
|
||||
assert els[0]['class'] == ['onep']
|
||||
|
||||
def test_class_mismatched_tag(self):
|
||||
els = self.soup.select('div.onep')
|
||||
assert len(els) == 0
|
||||
|
||||
def test_one_id(self):
|
||||
for selector in ('div#inner', '#inner', 'div div#inner'):
|
||||
self.assert_selects(selector, ['inner'])
|
||||
|
||||
def test_bad_id(self):
|
||||
els = self.soup.select('#doesnotexist')
|
||||
assert len(els) == 0
|
||||
|
||||
def test_items_in_id(self):
|
||||
els = self.soup.select('div#inner p')
|
||||
assert len(els) == 3
|
||||
for el in els:
|
||||
assert el.name == 'p'
|
||||
assert els[1]['class'] == ['onep']
|
||||
assert not els[0].has_attr('class')
|
||||
|
||||
def test_a_bunch_of_emptys(self):
|
||||
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
|
||||
assert len(self.soup.select(selector)) == 0
|
||||
|
||||
def test_multi_class_support(self):
|
||||
for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
|
||||
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
|
||||
self.assert_selects(selector, ['pmulti'])
|
||||
|
||||
def test_multi_class_selection(self):
|
||||
for selector in ('.class1.class3', '.class3.class2',
|
||||
'.class1.class2.class3'):
|
||||
self.assert_selects(selector, ['pmulti'])
|
||||
|
||||
def test_child_selector(self):
|
||||
self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
|
||||
self.assert_selects('.s1 > a span', ['s1a2s1'])
|
||||
|
||||
def test_child_selector_id(self):
|
||||
self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])
|
||||
|
||||
def test_attribute_equals(self):
|
||||
self.assert_select_multiple(
|
||||
('p[class="onep"]', ['p1']),
|
||||
('p[id="p1"]', ['p1']),
|
||||
('[class="onep"]', ['p1']),
|
||||
('[id="p1"]', ['p1']),
|
||||
('link[rel="stylesheet"]', ['l1']),
|
||||
('link[type="text/css"]', ['l1']),
|
||||
('link[href="blah.css"]', ['l1']),
|
||||
('link[href="no-blah.css"]', []),
|
||||
('[rel="stylesheet"]', ['l1']),
|
||||
('[type="text/css"]', ['l1']),
|
||||
('[href="blah.css"]', ['l1']),
|
||||
('[href="no-blah.css"]', []),
|
||||
('p[href="no-blah.css"]', []),
|
||||
('[href="no-blah.css"]', []),
|
||||
)
|
||||
|
||||
def test_attribute_tilde(self):
|
||||
self.assert_select_multiple(
|
||||
('p[class~="class1"]', ['pmulti']),
|
||||
('p[class~="class2"]', ['pmulti']),
|
||||
('p[class~="class3"]', ['pmulti']),
|
||||
('[class~="class1"]', ['pmulti']),
|
||||
('[class~="class2"]', ['pmulti']),
|
||||
('[class~="class3"]', ['pmulti']),
|
||||
('a[rel~="friend"]', ['bob']),
|
||||
('a[rel~="met"]', ['bob']),
|
||||
('[rel~="friend"]', ['bob']),
|
||||
('[rel~="met"]', ['bob']),
|
||||
)
|
||||
|
||||
def test_attribute_startswith(self):
|
||||
self.assert_select_multiple(
|
||||
('[rel^="style"]', ['l1']),
|
||||
('link[rel^="style"]', ['l1']),
|
||||
('notlink[rel^="notstyle"]', []),
|
||||
('[rel^="notstyle"]', []),
|
||||
('link[rel^="notstyle"]', []),
|
||||
('link[href^="bla"]', ['l1']),
|
||||
('a[href^="http://"]', ['bob', 'me']),
|
||||
('[href^="http://"]', ['bob', 'me']),
|
||||
('[id^="p"]', ['pmulti', 'p1']),
|
||||
('[id^="m"]', ['me', 'main']),
|
||||
('div[id^="m"]', ['main']),
|
||||
('a[id^="m"]', ['me']),
|
||||
('div[data-tag^="dashed"]', ['data1'])
|
||||
)
|
||||
|
||||
def test_attribute_endswith(self):
|
||||
self.assert_select_multiple(
|
||||
('[href$=".css"]', ['l1']),
|
||||
('link[href$=".css"]', ['l1']),
|
||||
('link[id$="1"]', ['l1']),
|
||||
('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
|
||||
('div[id$="1"]', ['data1']),
|
||||
('[id$="noending"]', []),
|
||||
)
|
||||
|
||||
def test_attribute_contains(self):
|
||||
self.assert_select_multiple(
|
||||
# From test_attribute_startswith
|
||||
('[rel*="style"]', ['l1']),
|
||||
('link[rel*="style"]', ['l1']),
|
||||
('notlink[rel*="notstyle"]', []),
|
||||
('[rel*="notstyle"]', []),
|
||||
('link[rel*="notstyle"]', []),
|
||||
('link[href*="bla"]', ['l1']),
|
||||
('[href*="http://"]', ['bob', 'me']),
|
||||
('[id*="p"]', ['pmulti', 'p1']),
|
||||
('div[id*="m"]', ['main']),
|
||||
('a[id*="m"]', ['me']),
|
||||
# From test_attribute_endswith
|
||||
('[href*=".css"]', ['l1']),
|
||||
('link[href*=".css"]', ['l1']),
|
||||
('link[id*="1"]', ['l1']),
|
||||
('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
|
||||
('div[id*="1"]', ['data1']),
|
||||
('[id*="noending"]', []),
|
||||
# New for this test
|
||||
('[href*="."]', ['bob', 'me', 'l1']),
|
||||
('a[href*="."]', ['bob', 'me']),
|
||||
('link[href*="."]', ['l1']),
|
||||
('div[id*="n"]', ['main', 'inner']),
|
||||
('div[id*="nn"]', ['inner']),
|
||||
('div[data-tag*="edval"]', ['data1'])
|
||||
)
|
||||
|
||||
def test_attribute_exact_or_hypen(self):
|
||||
self.assert_select_multiple(
|
||||
('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
|
||||
('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
|
||||
('p[lang|="fr"]', ['lang-fr']),
|
||||
('p[lang|="gb"]', []),
|
||||
)
|
||||
|
||||
def test_attribute_exists(self):
|
||||
self.assert_select_multiple(
|
||||
('[rel]', ['l1', 'bob', 'me']),
|
||||
('link[rel]', ['l1']),
|
||||
('a[rel]', ['bob', 'me']),
|
||||
('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
|
||||
('p[class]', ['p1', 'pmulti']),
|
||||
('[blah]', []),
|
||||
('p[blah]', []),
|
||||
('div[data-tag]', ['data1'])
|
||||
)
|
||||
|
||||
def test_quoted_space_in_selector_name(self):
|
||||
html = """<div style="display: wrong">nope</div>
|
||||
<div style="display: right">yes</div>
|
||||
"""
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
[chosen] = soup.select('div[style="display: right"]')
|
||||
assert "yes" == chosen.string
|
||||
|
||||
def test_unsupported_pseudoclass(self):
|
||||
with pytest.raises(NotImplementedError):
|
||||
self.soup.select("a:no-such-pseudoclass")
|
||||
|
||||
with pytest.raises(SelectorSyntaxError):
|
||||
self.soup.select("a:nth-of-type(a)")
|
||||
|
||||
def test_nth_of_type(self):
|
||||
# Try to select first paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(1)')
|
||||
assert len(els) == 1
|
||||
assert els[0].string == 'Some text'
|
||||
|
||||
# Try to select third paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(3)')
|
||||
assert len(els) == 1
|
||||
assert els[0].string == 'Another'
|
||||
|
||||
# Try to select (non-existent!) fourth paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(4)')
|
||||
assert len(els) == 0
|
||||
|
||||
# Zero will select no tags.
|
||||
els = self.soup.select('div p:nth-of-type(0)')
|
||||
assert len(els) == 0
|
||||
|
||||
def test_nth_of_type_direct_descendant(self):
|
||||
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
||||
assert len(els) == 1
|
||||
assert els[0].string == 'Some text'
|
||||
|
||||
def test_id_child_selector_nth_of_type(self):
|
||||
self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])
|
||||
|
||||
def test_select_on_element(self):
|
||||
# Other tests operate on the tree; this operates on an element
|
||||
# within the tree.
|
||||
inner = self.soup.find("div", id="main")
|
||||
selected = inner.select("div")
|
||||
# The <div id="inner"> tag was selected. The <div id="footer">
|
||||
# tag was not.
|
||||
self.assert_selects_ids(selected, ['inner', 'data1'])
|
||||
|
||||
def test_overspecified_child_id(self):
|
||||
self.assert_selects(".fancy #inner", ['inner'])
|
||||
self.assert_selects(".normal #inner", [])
|
||||
|
||||
def test_adjacent_sibling_selector(self):
|
||||
self.assert_selects('#p1 + h2', ['header2'])
|
||||
self.assert_selects('#p1 + h2 + p', ['pmulti'])
|
||||
self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
|
||||
assert [] == self.soup.select('#p1 + p')
|
||||
|
||||
def test_general_sibling_selector(self):
|
||||
self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
|
||||
self.assert_selects('#p1 ~ #header2', ['header2'])
|
||||
self.assert_selects('#p1 ~ h2 + a', ['me'])
|
||||
self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
|
||||
assert [] == self.soup.select('#inner ~ h2')
|
||||
|
||||
def test_dangling_combinator(self):
|
||||
with pytest.raises(SelectorSyntaxError):
|
||||
self.soup.select('h1 >')
|
||||
|
||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||
self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||
|
||||
# Test the selector grouping operator (the comma)
|
||||
def test_multiple_select(self):
|
||||
self.assert_selects('x, y', ['xid', 'yid'])
|
||||
|
||||
def test_multiple_select_with_no_space(self):
|
||||
self.assert_selects('x,y', ['xid', 'yid'])
|
||||
|
||||
def test_multiple_select_with_more_space(self):
|
||||
self.assert_selects('x, y', ['xid', 'yid'])
|
||||
|
||||
def test_multiple_select_duplicated(self):
|
||||
self.assert_selects('x, x', ['xid'])
|
||||
|
||||
def test_multiple_select_sibling(self):
|
||||
self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
|
||||
|
||||
def test_multiple_select_tag_and_direct_descendant(self):
|
||||
self.assert_selects('x, y > z', ['xid', 'zidb'])
|
||||
|
||||
def test_multiple_select_direct_descendant_and_tags(self):
|
||||
self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
||||
|
||||
def test_multiple_select_indirect_descendant(self):
|
||||
self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
||||
|
||||
def test_invalid_multiple_select(self):
|
||||
with pytest.raises(SelectorSyntaxError):
|
||||
self.soup.select(',x, y')
|
||||
with pytest.raises(SelectorSyntaxError):
|
||||
self.soup.select('x,,y')
|
||||
|
||||
def test_multiple_select_attrs(self):
|
||||
self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
||||
|
||||
def test_multiple_select_ids(self):
|
||||
self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
|
||||
|
||||
def test_multiple_select_nested(self):
|
||||
self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])
|
||||
|
||||
def test_select_duplicate_elements(self):
|
||||
# When markup contains duplicate elements, a multiple select
|
||||
# will find all of them.
|
||||
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
|
||||
soup = BeautifulSoup(markup, 'html.parser')
|
||||
selected = soup.select(".c1, .c2")
|
||||
assert 3 == len(selected)
|
||||
|
||||
# Verify that find_all finds the same elements, though because
|
||||
# of an implementation detail it finds them in a different
|
||||
# order.
|
||||
for element in soup.find_all(class_=['c1', 'c2']):
|
||||
assert element in selected
|
||||
|
||||
|
||||
class TestPersistence(SoupTest):
|
||||
"Testing features like pickle and deepcopy."
|
||||
|
||||
|
@ -674,6 +284,18 @@ class TestPersistence(SoupTest):
|
|||
copied = copy.deepcopy(self.tree)
|
||||
assert copied.decode() == self.tree.decode()
|
||||
|
||||
def test_copy_deeply_nested_document(self):
    """copy() and deepcopy() must cope with very deep nesting.

    This test verifies that copy and deepcopy don't involve any
    recursive function calls. If they did, this test would overflow
    the Python interpreter stack.
    """
    limit = sys.getrecursionlimit() + 1
    markup = "<span>" * limit

    soup = self.soup(markup)

    # Completing without a RecursionError is the whole assertion, so
    # the copies themselves are not kept.
    copy.copy(soup)
    copy.deepcopy(soup)
|
||||
|
||||
def test_copy_preserves_encoding(self):
|
||||
soup = BeautifulSoup(b'<p> </p>', 'html.parser')
|
||||
encoding = soup.original_encoding
|
||||
|
|
|
@ -24,6 +24,7 @@ from bs4.builder import (
|
|||
from bs4.element import (
|
||||
Comment,
|
||||
SoupStrainer,
|
||||
PYTHON_SPECIFIC_ENCODINGS,
|
||||
Tag,
|
||||
NavigableString,
|
||||
)
|
||||
|
@ -210,6 +211,47 @@ class TestConstructor(SoupTest):
|
|||
assert [] == soup.string_container_stack
|
||||
|
||||
|
||||
class TestOutput(SoupTest):
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"eventual_encoding,actual_encoding", [
|
||||
("utf-8", "utf-8"),
|
||||
("utf-16", "utf-16"),
|
||||
]
|
||||
)
|
||||
def test_decode_xml_declaration(self, eventual_encoding, actual_encoding):
|
||||
# Most of the time, calling decode() on an XML document will
|
||||
# give you a document declaration that mentions the encoding
|
||||
# you intend to use when encoding the document as a
|
||||
# bytestring.
|
||||
soup = self.soup("<tag></tag>")
|
||||
soup.is_xml = True
|
||||
assert (f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>'
|
||||
== soup.decode(eventual_encoding=eventual_encoding))
|
||||
|
||||
@pytest.mark.parametrize(
    "eventual_encoding", list(PYTHON_SPECIFIC_ENCODINGS) + [None]
)
def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(self, eventual_encoding):
    """The XML declaration omits the encoding when none is meaningful.

    If you pass a Python internal encoding into decode(), or omit the
    eventual_encoding altogether, the document declaration won't
    mention any particular encoding.

    :param eventual_encoding: One of PYTHON_SPECIFIC_ENCODINGS, or
        None, supplied by the parametrize decorator.
    """
    soup = BeautifulSoup("<tag></tag>", "html.parser")
    soup.is_xml = True
    # Plain string literal: nothing is interpolated into the expected
    # output, so no f-string prefix is needed.
    assert ('<?xml version="1.0"?>\n<tag></tag>'
            == soup.decode(eventual_encoding=eventual_encoding))
|
||||
|
||||
def test(self):
|
||||
# BeautifulSoup subclasses Tag and extends the decode() method.
|
||||
# Make sure the other Tag methods which call decode() call
|
||||
# it correctly.
|
||||
soup = self.soup("<tag></tag>")
|
||||
assert b"<tag></tag>" == soup.encode(encoding="utf-8")
|
||||
assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8")
|
||||
assert "<tag></tag>" == soup.decode_contents()
|
||||
assert "<tag>\n</tag>\n" == soup.prettify()
|
||||
|
||||
|
||||
class TestWarnings(SoupTest):
|
||||
# Note that some of the tests in this class create BeautifulSoup
|
||||
# objects directly rather than using self.soup(). That's
|
||||
|
|
|
@ -32,7 +32,7 @@ from . import css_match as cm
|
|||
from . import css_types as ct
|
||||
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
||||
import bs4 # type: ignore[import]
|
||||
from typing import Optional, Any, Iterator, Iterable
|
||||
from typing import Any, Iterator, Iterable
|
||||
|
||||
__all__ = (
|
||||
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||
|
@ -45,10 +45,10 @@ SoupSieve = cm.SoupSieve
|
|||
|
||||
def compile( # noqa: A001
|
||||
pattern: str,
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> cm.SoupSieve:
|
||||
"""Compile CSS pattern."""
|
||||
|
@ -79,10 +79,10 @@ def purge() -> None:
|
|||
def closest(
|
||||
select: str,
|
||||
tag: 'bs4.Tag',
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> 'bs4.Tag':
|
||||
"""Match closest ancestor."""
|
||||
|
@ -93,10 +93,10 @@ def closest(
|
|||
def match(
|
||||
select: str,
|
||||
tag: 'bs4.Tag',
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> bool:
|
||||
"""Match node."""
|
||||
|
@ -107,10 +107,10 @@ def match(
|
|||
def filter( # noqa: A001
|
||||
select: str,
|
||||
iterable: Iterable['bs4.Tag'],
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> list['bs4.Tag']:
|
||||
"""Filter list of nodes."""
|
||||
|
@ -121,10 +121,10 @@ def filter( # noqa: A001
|
|||
def select_one(
|
||||
select: str,
|
||||
tag: 'bs4.Tag',
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> 'bs4.Tag':
|
||||
"""Select a single tag."""
|
||||
|
@ -135,11 +135,11 @@ def select_one(
|
|||
def select(
|
||||
select: str,
|
||||
tag: 'bs4.Tag',
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
limit: int = 0,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> list['bs4.Tag']:
|
||||
"""Select the specified tags."""
|
||||
|
@ -150,11 +150,11 @@ def select(
|
|||
def iselect(
|
||||
select: str,
|
||||
tag: 'bs4.Tag',
|
||||
namespaces: Optional[dict[str, str]] = None,
|
||||
namespaces: dict[str, str] | None = None,
|
||||
limit: int = 0,
|
||||
flags: int = 0,
|
||||
*,
|
||||
custom: Optional[dict[str, str]] = None,
|
||||
custom: dict[str, str] | None = None,
|
||||
**kwargs: Any
|
||||
) -> Iterator['bs4.Tag']:
|
||||
"""Iterate the specified tags."""
|
||||
|
|
|
@ -193,5 +193,5 @@ def parse_version(ver: str) -> Version:
|
|||
return Version(major, minor, micro, release, pre, post, dev)
|
||||
|
||||
|
||||
__version_info__ = Version(2, 4, 0, "final")
|
||||
__version_info__ = Version(2, 4, 1, "final")
|
||||
__version__ = __version_info__._get_canonical()
|
||||
|
|
|
@ -6,7 +6,7 @@ import re
|
|||
from . import css_types as ct
|
||||
import unicodedata
|
||||
import bs4 # type: ignore[import]
|
||||
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
|
||||
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
|
||||
|
||||
# Empty tag pattern (whitespace okay)
|
||||
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||||
|
@ -171,7 +171,7 @@ class _DocumentNav:
|
|||
def get_children(
|
||||
self,
|
||||
el: bs4.Tag,
|
||||
start: Optional[int] = None,
|
||||
start: int | None = None,
|
||||
reverse: bool = False,
|
||||
tags: bool = True,
|
||||
no_iframe: bool = False
|
||||
|
@ -239,22 +239,22 @@ class _DocumentNav:
|
|||
return parent
|
||||
|
||||
@staticmethod
|
||||
def get_tag_name(el: bs4.Tag) -> Optional[str]:
|
||||
def get_tag_name(el: bs4.Tag) -> str | None:
|
||||
"""Get tag."""
|
||||
|
||||
return cast(Optional[str], el.name)
|
||||
return cast('str | None', el.name)
|
||||
|
||||
@staticmethod
|
||||
def get_prefix_name(el: bs4.Tag) -> Optional[str]:
|
||||
def get_prefix_name(el: bs4.Tag) -> str | None:
|
||||
"""Get prefix."""
|
||||
|
||||
return cast(Optional[str], el.prefix)
|
||||
return cast('str | None', el.prefix)
|
||||
|
||||
@staticmethod
|
||||
def get_uri(el: bs4.Tag) -> Optional[str]:
|
||||
def get_uri(el: bs4.Tag) -> str | None:
|
||||
"""Get namespace `URI`."""
|
||||
|
||||
return cast(Optional[str], el.namespace)
|
||||
return cast('str | None', el.namespace)
|
||||
|
||||
@classmethod
|
||||
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
||||
|
@ -287,7 +287,7 @@ class _DocumentNav:
|
|||
return bool(ns and ns == NS_XHTML)
|
||||
|
||||
@staticmethod
|
||||
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
|
||||
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
|
||||
"""Return namespace and attribute name without the prefix."""
|
||||
|
||||
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
||||
|
@ -330,8 +330,8 @@ class _DocumentNav:
|
|||
cls,
|
||||
el: bs4.Tag,
|
||||
name: str,
|
||||
default: Optional[str | Sequence[str]] = None
|
||||
) -> Optional[str | Sequence[str]]:
|
||||
default: str | Sequence[str] | None = None
|
||||
) -> str | Sequence[str] | None:
|
||||
"""Get attribute by name."""
|
||||
|
||||
value = default
|
||||
|
@ -348,7 +348,7 @@ class _DocumentNav:
|
|||
return value
|
||||
|
||||
@classmethod
|
||||
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
|
||||
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
|
||||
"""Iterate attributes."""
|
||||
|
||||
for k, v in el.attrs.items():
|
||||
|
@ -424,10 +424,10 @@ class Inputs:
|
|||
return 0 <= minutes <= 59
|
||||
|
||||
@classmethod
|
||||
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
|
||||
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
|
||||
"""Parse the input value."""
|
||||
|
||||
parsed = None # type: Optional[tuple[float, ...]]
|
||||
parsed = None # type: tuple[float, ...] | None
|
||||
if value is None:
|
||||
return value
|
||||
if itype == "date":
|
||||
|
@ -486,7 +486,7 @@ class CSSMatch(_DocumentNav):
|
|||
self,
|
||||
selectors: ct.SelectorList,
|
||||
scope: bs4.Tag,
|
||||
namespaces: Optional[ct.Namespaces],
|
||||
namespaces: ct.Namespaces | None,
|
||||
flags: int
|
||||
) -> None:
|
||||
"""Initialize."""
|
||||
|
@ -545,19 +545,19 @@ class CSSMatch(_DocumentNav):
|
|||
|
||||
return self.get_tag_ns(el) == NS_XHTML
|
||||
|
||||
def get_tag(self, el: bs4.Tag) -> Optional[str]:
|
||||
def get_tag(self, el: bs4.Tag) -> str | None:
|
||||
"""Get tag."""
|
||||
|
||||
name = self.get_tag_name(el)
|
||||
return util.lower(name) if name is not None and not self.is_xml else name
|
||||
|
||||
def get_prefix(self, el: bs4.Tag) -> Optional[str]:
|
||||
def get_prefix(self, el: bs4.Tag) -> str | None:
|
||||
"""Get prefix."""
|
||||
|
||||
prefix = self.get_prefix_name(el)
|
||||
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
||||
|
||||
def find_bidi(self, el: bs4.Tag) -> Optional[int]:
|
||||
def find_bidi(self, el: bs4.Tag) -> int | None:
|
||||
"""Get directionality from element text."""
|
||||
|
||||
for node in self.get_children(el, tags=False):
|
||||
|
@ -653,8 +653,8 @@ class CSSMatch(_DocumentNav):
|
|||
self,
|
||||
el: bs4.Tag,
|
||||
attr: str,
|
||||
prefix: Optional[str]
|
||||
) -> Optional[str | Sequence[str]]:
|
||||
prefix: str | None
|
||||
) -> str | Sequence[str] | None:
|
||||
"""Match attribute name and return value if it exists."""
|
||||
|
||||
value = None
|
||||
|
@ -751,7 +751,7 @@ class CSSMatch(_DocumentNav):
|
|||
name not in (self.get_tag(el), '*')
|
||||
)
|
||||
|
||||
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
|
||||
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
|
||||
"""Match the tag."""
|
||||
|
||||
match = True
|
||||
|
@ -1030,7 +1030,7 @@ class CSSMatch(_DocumentNav):
|
|||
"""Match element if it contains text."""
|
||||
|
||||
match = True
|
||||
content = None # type: Optional[str | Sequence[str]]
|
||||
content = None # type: str | Sequence[str] | None
|
||||
for contain_list in contains:
|
||||
if content is None:
|
||||
if contain_list.own:
|
||||
|
@ -1099,7 +1099,7 @@ class CSSMatch(_DocumentNav):
|
|||
match = False
|
||||
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
||||
|
||||
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
|
||||
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
|
||||
"""Find this input's form."""
|
||||
form = None
|
||||
parent = self.get_parent(el, no_iframe=True)
|
||||
|
@ -1478,7 +1478,7 @@ class CSSMatch(_DocumentNav):
|
|||
if lim < 1:
|
||||
break
|
||||
|
||||
def closest(self) -> Optional[bs4.Tag]:
|
||||
def closest(self) -> bs4.Tag | None:
|
||||
"""Match closest ancestor."""
|
||||
|
||||
current = self.tag
|
||||
|
@ -1506,7 +1506,7 @@ class SoupSieve(ct.Immutable):
|
|||
|
||||
pattern: str
|
||||
selectors: ct.SelectorList
|
||||
namespaces: Optional[ct.Namespaces]
|
||||
namespaces: ct.Namespaces | None
|
||||
custom: dict[str, str]
|
||||
flags: int
|
||||
|
||||
|
@ -1516,8 +1516,8 @@ class SoupSieve(ct.Immutable):
|
|||
self,
|
||||
pattern: str,
|
||||
selectors: ct.SelectorList,
|
||||
namespaces: Optional[ct.Namespaces],
|
||||
custom: Optional[ct.CustomSelectors],
|
||||
namespaces: ct.Namespaces | None,
|
||||
custom: ct.CustomSelectors | None,
|
||||
flags: int
|
||||
):
|
||||
"""Initialize."""
|
||||
|
|
|
@ -7,7 +7,7 @@ from . import css_match as cm
|
|||
from . import css_types as ct
|
||||
from .util import SelectorSyntaxError
|
||||
import warnings
|
||||
from typing import Optional, Match, Any, Iterator, cast
|
||||
from typing import Match, Any, Iterator, cast
|
||||
|
||||
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||||
|
||||
|
@ -113,7 +113,7 @@ VALUE = r'''
|
|||
'''.format(nl=NEWLINE, ident=IDENTIFIER)
|
||||
# Attribute value comparison. `!=` is handled specially as it is non-standard.
|
||||
ATTR = r'''
|
||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}*(?P<case>[is]))?)?{ws}*\]
|
||||
'''.format(ws=WSC, value=VALUE)
|
||||
|
||||
# Selector patterns
|
||||
|
@ -207,8 +207,8 @@ _MAXCACHE = 500
|
|||
@lru_cache(maxsize=_MAXCACHE)
|
||||
def _cached_css_compile(
|
||||
pattern: str,
|
||||
namespaces: Optional[ct.Namespaces],
|
||||
custom: Optional[ct.CustomSelectors],
|
||||
namespaces: ct.Namespaces | None,
|
||||
custom: ct.CustomSelectors | None,
|
||||
flags: int
|
||||
) -> cm.SoupSieve:
|
||||
"""Cached CSS compile."""
|
||||
|
@ -233,7 +233,7 @@ def _purge_cache() -> None:
|
|||
_cached_css_compile.cache_clear()
|
||||
|
||||
|
||||
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
|
||||
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
|
||||
"""Process custom."""
|
||||
|
||||
custom_selectors = {}
|
||||
|
@ -317,7 +317,7 @@ class SelectorPattern:
|
|||
|
||||
return self.name
|
||||
|
||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
||||
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||
"""Match the selector."""
|
||||
|
||||
return self.re_pattern.match(selector, index)
|
||||
|
@ -336,7 +336,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
|||
for pseudo in p[1]:
|
||||
self.patterns[pseudo] = pattern
|
||||
|
||||
self.matched_name = None # type: Optional[SelectorPattern]
|
||||
self.matched_name = None # type: SelectorPattern | None
|
||||
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
|
||||
|
||||
def get_name(self) -> str:
|
||||
|
@ -344,7 +344,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
|||
|
||||
return '' if self.matched_name is None else self.matched_name.get_name()
|
||||
|
||||
def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
|
||||
def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
|
||||
"""Match the selector."""
|
||||
|
||||
pseudo = None
|
||||
|
@ -372,14 +372,14 @@ class _Selector:
|
|||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize."""
|
||||
|
||||
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
|
||||
self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None
|
||||
self.ids = kwargs.get('ids', []) # type: list[str]
|
||||
self.classes = kwargs.get('classes', []) # type: list[str]
|
||||
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
|
||||
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
|
||||
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
|
||||
self.relations = kwargs.get('relations', []) # type: list[_Selector]
|
||||
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
|
||||
self.rel_type = kwargs.get('rel_type', None) # type: str | None
|
||||
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
|
||||
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
|
||||
self.flags = kwargs.get('flags', 0) # type: int
|
||||
|
@ -462,7 +462,7 @@ class CSSParser:
|
|||
def __init__(
|
||||
self,
|
||||
selector: str,
|
||||
custom: Optional[dict[str, str | ct.SelectorList]] = None,
|
||||
custom: dict[str, str | ct.SelectorList] | None = None,
|
||||
flags: int = 0
|
||||
) -> None:
|
||||
"""Initialize."""
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import annotations
|
||||
import copyreg
|
||||
from .pretty import pretty
|
||||
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
|
||||
from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping
|
||||
|
||||
__all__ = (
|
||||
'Selector',
|
||||
|
@ -189,28 +189,28 @@ class Selector(Immutable):
|
|||
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
|
||||
)
|
||||
|
||||
tag: Optional[SelectorTag]
|
||||
tag: SelectorTag | None
|
||||
ids: tuple[str, ...]
|
||||
classes: tuple[str, ...]
|
||||
attributes: tuple[SelectorAttribute, ...]
|
||||
nth: tuple[SelectorNth, ...]
|
||||
selectors: tuple[SelectorList, ...]
|
||||
relation: SelectorList
|
||||
rel_type: Optional[str]
|
||||
rel_type: str | None
|
||||
contains: tuple[SelectorContains, ...]
|
||||
lang: tuple[SelectorLang, ...]
|
||||
flags: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tag: Optional[SelectorTag],
|
||||
tag: SelectorTag | None,
|
||||
ids: tuple[str, ...],
|
||||
classes: tuple[str, ...],
|
||||
attributes: tuple[SelectorAttribute, ...],
|
||||
nth: tuple[SelectorNth, ...],
|
||||
selectors: tuple[SelectorList, ...],
|
||||
relation: SelectorList,
|
||||
rel_type: Optional[str],
|
||||
rel_type: str | None,
|
||||
contains: tuple[SelectorContains, ...],
|
||||
lang: tuple[SelectorLang, ...],
|
||||
flags: int
|
||||
|
@ -247,9 +247,9 @@ class SelectorTag(Immutable):
|
|||
__slots__ = ("name", "prefix", "_hash")
|
||||
|
||||
name: str
|
||||
prefix: Optional[str]
|
||||
prefix: str | None
|
||||
|
||||
def __init__(self, name: str, prefix: Optional[str]) -> None:
|
||||
def __init__(self, name: str, prefix: str | None) -> None:
|
||||
"""Initialize."""
|
||||
|
||||
super().__init__(name=name, prefix=prefix)
|
||||
|
@ -262,15 +262,15 @@ class SelectorAttribute(Immutable):
|
|||
|
||||
attribute: str
|
||||
prefix: str
|
||||
pattern: Optional[Pattern[str]]
|
||||
xml_type_pattern: Optional[Pattern[str]]
|
||||
pattern: Pattern[str] | None
|
||||
xml_type_pattern: Pattern[str] | None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attribute: str,
|
||||
prefix: str,
|
||||
pattern: Optional[Pattern[str]],
|
||||
xml_type_pattern: Optional[Pattern[str]]
|
||||
pattern: Pattern[str] | None,
|
||||
xml_type_pattern: Pattern[str] | None
|
||||
) -> None:
|
||||
"""Initialize."""
|
||||
|
||||
|
@ -360,7 +360,7 @@ class SelectorList(Immutable):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
|
||||
selectors: Iterable[Selector | SelectorNull] | None = None,
|
||||
is_not: bool = False,
|
||||
is_html: bool = False
|
||||
) -> None:
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
|||
from functools import wraps, lru_cache
|
||||
import warnings
|
||||
import re
|
||||
from typing import Callable, Any, Optional
|
||||
from typing import Callable, Any
|
||||
|
||||
DEBUG = 0x00001
|
||||
|
||||
|
@ -27,7 +27,7 @@ def lower(string: str) -> str:
|
|||
class SelectorSyntaxError(Exception):
|
||||
"""Syntax error in a CSS selector."""
|
||||
|
||||
def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None:
|
||||
def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None:
|
||||
"""Initialize."""
|
||||
|
||||
self.line = None
|
||||
|
@ -84,7 +84,7 @@ def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
|
|||
col = 1
|
||||
text = [] # type: list[str]
|
||||
line = 1
|
||||
offset = None # type: Optional[int]
|
||||
offset = None # type: int | None
|
||||
|
||||
# Split pattern by newline and handle the text before the newline
|
||||
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
|
||||
|
|
|
@ -4,7 +4,7 @@ arrow==1.2.3
|
|||
backports.csv==1.0.7
|
||||
backports.functools-lru-cache==1.6.4
|
||||
backports.zoneinfo==0.2.1;python_version<"3.9"
|
||||
beautifulsoup4==4.11.2
|
||||
beautifulsoup4==4.12.2
|
||||
bleach==6.0.0
|
||||
certifi==2022.12.7
|
||||
cheroot==9.0.0
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue