Bump beautifulsoup4 from 4.11.1 to 4.11.2 (#1987)

* Bump beautifulsoup4 from 4.11.1 to 4.11.2

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.1 to 4.11.2.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.11.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
dependabot[bot] 2023-03-02 20:56:24 -08:00 committed by GitHub
parent ded93ef2f5
commit 8e42757b2d
23 changed files with 449 additions and 537 deletions


@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
Beautiful Soup works with Python 3.5 and up. It works better if lxml
Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.11.1"
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
__version__ = "4.11.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@ -211,7 +211,7 @@ class BeautifulSoup(Tag):
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name),
DeprecationWarning
DeprecationWarning, stacklevel=3
)
return kwargs.pop(old_name)
return None
@ -405,7 +405,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning,
stacklevel=3
)
return True
return False
@ -436,7 +437,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into'
' Beautiful Soup.',
MarkupResemblesLocatorWarning
MarkupResemblesLocatorWarning, stacklevel=3
)
return True
return False
@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
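
The recurring change in this file is the addition of a stacklevel= argument to the warnings.warn() calls, so deprecation and "markup resembles a locator" warnings are attributed to the code that called Beautiful Soup rather than to bs4 internals. A minimal standalone sketch of the effect (the helper names are illustrative, not bs4 API):

    import warnings

    def deprecated_helper():
        # stacklevel=2 attributes the warning to the frame that called
        # deprecated_helper(), not to this line inside the library.
        warnings.warn("use new_helper() instead", DeprecationWarning, stacklevel=2)

    def user_code():
        deprecated_helper()

    warnings.simplefilter("always")
    user_code()  # the reported filename/lineno point at the call inside user_code()
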


@ -122,7 +122,7 @@ class TreeBuilder(object):
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
# Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()


@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
warnings.warn(
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
stacklevel=3
)
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4
)
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
@ -249,9 +255,9 @@ class AttrList(object):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = self.element.cdata_list_attributes or {}
if (name in list_attr.get('*')
if (name in list_attr.get('*', [])
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
and name in list_attr.get(self.element.name, []))):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
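
This lookup change, together with the defaultdict(list) default for DEFAULT_CDATA_LIST_ATTRIBUTES a few hunks earlier, guards against a missing '*' key so membership tests never run against None. A small sketch of the difference:

    from collections import defaultdict

    plain = {}
    safe = defaultdict(list)

    name = "class"
    # plain.get("*") returns None, and `name in None` raises TypeError;
    # both the .get("*", []) lookup and the defaultdict default avoid that.
    print(name in plain.get("*", []))  # False
    print(name in safe["*"])           # False (and "*" now maps to [])
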


@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import (
CData,
Comment,
@ -91,19 +70,6 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self._initialize_xml_detector()
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() by raising an exception,
which we don't want to do.
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal.
"""
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
# TODO: This was originally a workaround for a bug in
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
# been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@ -333,9 +300,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True


@ -496,13 +496,16 @@ class PageElement(object):
def extend(self, tags):
"""Appends the given PageElements to this one's contents.
:param tags: A list of PageElements.
:param tags: A list of PageElements. If a single Tag is
provided instead, this PageElement's contents will be extended
with that Tag's contents.
"""
if isinstance(tags, Tag):
# Calling self.append() on another tag's contents will change
# the list we're iterating over. Make a list that won't
# change.
tags = list(tags.contents)
tags = tags.contents
if isinstance(tags, list):
# Moving items around the tree may change their position in
# the original list. Make a list that won't change.
tags = list(tags)
for tag in tags:
self.append(tag)
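
The reworked extend() now accepts either a list of PageElements or a single Tag; in the latter case that tag's children are moved into the caller. A short usage sketch, mirroring the parametrized test later in this diff:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div id="d1"><a>1</a><a>2</a></div><div id="d2"></div>', "html.parser"
    )
    d1 = soup.find("div", id="d1")
    d2 = soup.find("div", id="d2")

    d2.extend(d1)   # same net effect as d2.extend(d1.contents)
    print(d1)       # <div id="d1"></div>
    print(d2)       # <div id="d2"><a>1</a><a>2</a></div>
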
@ -586,8 +589,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements.
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +628,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.next_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
@ -663,8 +670,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit, self.previous_elements,
**kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit, self.previous_elements,
_stacklevel=_stacklevel+1, **kwargs
)
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
@ -702,8 +712,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
return self._find_all(name, attrs, string, limit,
self.previous_siblings, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(
name, attrs, string, limit,
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
@ -724,7 +737,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
l = self.find_parents(name, attrs, 1, **kwargs)
l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
@ -744,8 +757,9 @@ class PageElement(object):
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
_stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@ -771,19 +785,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs):
r = None
l = method(name, attrs, string, 1, **kwargs)
l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
_stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=_stacklevel
)
if isinstance(name, SoupStrainer):
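
The _stacklevel bookkeeping threaded through the find_* family exists so that the warning for the deprecated text= argument is reported against the caller's file no matter how many internal layers the call passes through. Roughly what that buys, as a hedged sketch:

    import warnings
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>some</a><b>markup</b>", "html.parser")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        soup.find_all(text="markup")   # deprecated spelling of string="markup"

    # With the stacklevel plumbing above, the warning points at this script
    # rather than at bs4's own source.
    print(caught[0].category.__name__)  # DeprecationWarning
    print(caught[0].filename)           # path of this script
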
@ -1306,7 +1321,8 @@ class Tag(PageElement):
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
preserve_whitespace_tags=self.preserve_whitespace_tags,
interesting_string_types=self.interesting_string_types
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
@ -1558,7 +1574,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.find(tag_name)
# We special case contents to avoid recursion.
@ -1862,7 +1878,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
**kwargs)
if l:
r = l[0]
return r
@ -1889,7 +1906,9 @@ class Tag(PageElement):
generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs)
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
@ -1993,7 +2012,7 @@ class Tag(PageElement):
"""
warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning
DeprecationWarning, stacklevel=2
)
return self.has_attr(key)
@ -2024,7 +2043,7 @@ class SoupStrainer(object):
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning
DeprecationWarning, stacklevel=2
)
self.name = self._normalize_search_value(name)


@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.


@ -29,6 +29,29 @@ from bs4.builder import (
)
default_builder = HTMLParserTreeBuilder
# Some tests depend on specific third-party libraries. We use
# @pytest.mark.skipIf on the following conditionals to skip them
# if the libraries are not installed.
try:
from soupsieve import SelectorSyntaxError
SOUP_SIEVE_PRESENT = True
except ImportError:
SOUP_SIEVE_PRESENT = False
try:
import html5lib
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
LXML_PRESENT = False
LXML_VERSION = (0,)
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object):
@pytest.mark.parametrize(
"multi_valued_attributes",
[None, dict(b=['class']), {'*': ['notclass']}]
[None, {}, dict(b=['class']), {'*': ['notclass']}]
)
def test_attribute_not_multi_valued(self, multi_valued_attributes):
markup = '<a class="a b c">'
markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>'
soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
assert soup.a['class'] == 'a b c'
@ -820,26 +843,27 @@ Hello, world!
soup = self.soup(text)
assert soup.p.encode("utf-8") == expected
def test_real_iso_latin_document(self):
def test_real_iso_8859_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# Here it is in Unicode. Note that it claims to be in ISO-8859-1.
unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
# That's because we're going to encode it into ISO-8859-1,
# and use that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
# Parse the ISO-8859-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# UTF-8 instead of ISO-8859-1.
expected = unicode_html.replace("ISO-8859-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
assert isinstance(soup.contents[0], Comment)
assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?'
assert "html" == soup.contents[0].next_element.name
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator


@ -10,22 +10,23 @@ from bs4.builder import (
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
from . import (
HTML5LIB_PRESENT,
LXML_PRESENT,
)
try:
if HTML5LIB_PRESENT:
from bs4.builder import HTML5TreeBuilder
if LXML_PRESENT:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
# TODO: Split out the lxml and html5lib tests into their own classes
# and gate with pytest.mark.skipIf.
class TestBuiltInRegistry(object):
"""Test the built-in registry with the default builders registered."""


@ -17,25 +17,23 @@ class TestUnicodeDammit(object):
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == markup
def test_smart_quotes_to_unicode(self):
@pytest.mark.parametrize(
"smart_quotes_to,expect_converted",
[(None, "\u2018\u2019\u201c\u201d"),
("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
("ascii", "''" + '""'),
]
)
def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
"""Verify the functionality of the smart_quotes_to argument
to the UnicodeDammit constructor."""
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
assert dammit.unicode_markup == """<foo>''""</foo>"""
converted = UnicodeDammit(
markup, known_definite_encodings=["windows-1252"],
smart_quotes_to=smart_quotes_to
).unicode_markup
assert converted == "<foo>{}</foo>".format(expect_converted)
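
The four separate smart-quote tests collapse into a single pytest.mark.parametrize table, so each (smart_quotes_to, expect_converted) pair runs and is reported as its own test case. For reference, the \x91-\x94 bytes in the fixture are the Windows-1252 curly quotes:

    assert b"\x91\x92\x93\x94".decode("windows-1252") == "\u2018\u2019\u201c\u201d"  # ‘’“”
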
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
def setup_method(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites
# are substituted, and no others.
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
def test_smart_quote_substitution(self):
@pytest.mark.parametrize(
"original,substituted",
[
# Basic case. Unicode characters corresponding to named
# HTML entites are substituted; others are not.
("foo\u2200\N{SNOWMAN}\u00f5bar",
"foo&forall;\N{SNOWMAN}&otilde;bar"),
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
('foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
]
)
def test_substitute_html(self, original, substituted):
assert self.sub.substitute_html(original) == substituted
def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them


@ -1,27 +1,26 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import pytest
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError as e:
HTML5LIB_PRESENT = False
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from . import (
HTML5LIB_PRESENT,
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
@pytest.mark.skipif(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
reason="html5lib seems not to be present, not testing its tree builder."
)
class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
from bs4.builder import HTML5TreeBuilder
return HTML5TreeBuilder
def test_soupstrainer(self):
@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
soup = BeautifulSoup(markup, "html5lib", parse_only=strainer)
assert soup.decode() == self.document_for(markup)
assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message)
[warning] = w
assert warning.filename == __file__
assert "the html5lib tree builder doesn't support parse_only" in str(warning.message)
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""


@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element
assert with_element == expect
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)


@ -1,16 +1,10 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import pickle
import pytest
import re
import warnings
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
from . import LXML_PRESENT, LXML_VERSION
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer
from . import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SOUP_SIEVE_PRESENT,
SoupTest,
skipIf,
)
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
reason="Skipping doctype test for old version of lxml to avoid segfault."
)
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
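
Throughout the test files, the home-grown skipIf decorator is replaced with pytest.mark.skipif, driven by the presence flags (LXML_PRESENT, HTML5LIB_PRESENT, SOUP_SIEVE_PRESENT) that now live in the shared test package. A minimal self-contained sketch of the same pattern (module layout illustrative):

    import pytest

    try:
        import lxml.etree  # noqa: F401
        LXML_PRESENT = True
    except ImportError:
        LXML_PRESENT = False

    @pytest.mark.skipif(not LXML_PRESENT, reason="lxml not installed")
    def test_needs_lxml():
        # Reported as 'skipped' when lxml is missing, instead of silently
        # passing like the old skipIf shim (which returned a no-op function).
        assert LXML_PRESENT
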
@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
assert "<b/>" == str(soup.b)
assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)
[warning] = w
assert warning.filename == __file__
assert "BeautifulStoneSoup class is deprecated" in str(warning.message)
def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from
@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
assert "sourceline" == soup.p.sourceline.name
assert "sourcepos" == soup.p.sourcepos.name
@skipIf(
@pytest.mark.skipif(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
}
@pytest.mark.skipif(
not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
)
def test_namespace_interaction_with_select_and_find(self):
# Demonstrate how namespaces interact with select* and
# find* methods.


@ -3,15 +3,18 @@ import copy
import pickle
import pytest
from soupsieve import SelectorSyntaxError
from bs4 import BeautifulSoup
from bs4.element import (
Comment,
SoupStrainer,
)
from . import SoupTest
from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
if SOUP_SIEVE_PRESENT:
from soupsieve import SelectorSyntaxError
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
@ -213,6 +216,7 @@ class TestFormatters(SoupTest):
assert soup.contents[0].name == 'pre'
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
@ -694,6 +698,7 @@ class TestPersistence(SoupTest):
assert tag.can_be_empty_element == copied.can_be_empty_element
assert tag.cdata_list_attributes == copied.cdata_list_attributes
assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
assert tag.interesting_string_types == copied.interesting_string_types
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.


@ -30,19 +30,11 @@ from bs4.element import (
from . import (
default_builder,
LXML_PRESENT,
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
@ -139,7 +131,7 @@ class TestConstructor(SoupTest):
assert " an id " == a['id']
assert ["a", "class"] == a['class']
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
# TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
@ -219,10 +211,17 @@ class TestConstructor(SoupTest):
class TestWarnings(SoupTest):
# Note that some of the tests in this class create BeautifulSoup
# objects directly rather than using self.soup(). That's
# because SoupTest.soup is defined in a different file,
# which will throw off the assertion in _assert_warning
# that the code that triggered the warning is in the same
# file as the test.
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
assert w.filename == __file__
return w
raise Exception("%s warning not found in %r" % (cls, warnings))
@ -243,13 +242,17 @@ class TestWarnings(SoupTest):
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
soup = self.soup("<a><b></b></a>")
assert [] == w
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
soup = BeautifulSoup(
"<a><b></b></a>", "html.parser",
parseOnlyThese=SoupStrainer("b"),
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "parseOnlyThese" in msg
assert "parse_only" in msg
assert b"<b></b>" == soup.encode()
@ -257,8 +260,11 @@ class TestWarnings(SoupTest):
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
soup = BeautifulSoup(
utf8, "html.parser", fromEncoding="utf8"
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "fromEncoding" in msg
assert "from_encoding" in msg
assert "utf8" == soup.original_encoding
@ -276,7 +282,7 @@ class TestWarnings(SoupTest):
# A warning is issued if the "markup" looks like the name of
# an HTML or text file, or a full path to a file on disk.
with warnings.catch_warnings(record=True) as w:
soup = self.soup("markup" + extension)
soup = BeautifulSoup("markup" + extension, "html.parser")
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
assert "looks more like a filename" in str(warning.message)
@ -295,7 +301,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_bytes_url(self):
url = b"http://www.crummybytes.com/"
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(url)
soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@ -307,7 +313,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(url)
soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@ -348,9 +354,12 @@ class TestNewTag(SoupTest):
assert dict(bar="baz", name="a name") == new_tag.attrs
assert None == new_tag.parent
def test_tag_inherits_self_closing_rules_from_builder(self):
if LXML_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
@pytest.mark.skipif(
not LXML_PRESENT,
reason="lxml not installed, cannot parse XML document"
)
def test_xml_tag_inherits_self_closing_rules_from_builder(self):
xml_soup = BeautifulSoup("", "xml")
xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p")
@ -359,6 +368,7 @@ class TestNewTag(SoupTest):
assert b"<br/>" == xml_br.encode()
assert b"<p/>" == xml_p.encode()
def test_tag_inherits_self_closing_rules_from_builder(self):
html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p")
@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
assert soup_from_unicode.encode('utf-8') == self.utf8_data
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")


@ -33,7 +33,6 @@ from bs4.element import (
)
from . import (
SoupTest,
skipIf,
)
class TestFind(SoupTest):
@ -910,12 +909,16 @@ class TestTreeModification(SoupTest):
soup.a.extend(l)
assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode()
def test_extend_with_another_tags_contents(self):
@pytest.mark.parametrize(
"get_tags", [lambda tag: tag, lambda tag: tag.contents]
)
def test_extend_with_another_tags_contents(self, get_tags):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
d2.extend(d1)
tags = get_tags(d1)
d2.extend(tags)
assert '<div id="d1"></div>' == d1.decode()
assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode()
@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest):
class TestDeprecatedArguments(SoupTest):
def test_find_type_method_string(self):
@pytest.mark.parametrize(
"method_name", [
"find", "find_all", "find_parent", "find_parents",
"find_next", "find_all_next", "find_previous",
"find_all_previous", "find_next_sibling", "find_next_siblings",
"find_previous_sibling", "find_previous_siblings",
]
)
def test_find_type_method_string(self, method_name):
soup = self.soup("<a>some</a><b>markup</b>")
method = getattr(soup.b, method_name)
with warnings.catch_warnings(record=True) as w:
[result] = soup.find_all(text='markup')
assert result == 'markup'
assert result.parent.name == 'b'
msg = str(w[0].message)
method(text='markup')
[warning] = w
assert warning.filename == __file__
msg = str(warning.message)
assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."
def test_soupstrainer_constructor_string(self):
with warnings.catch_warnings(record=True) as w:
strainer = SoupStrainer(text="text")
assert strainer.text == 'text'
msg = str(w[0].message)
[warning] = w
msg = str(warning.message)
assert warning.filename == __file__
assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."


@ -25,13 +25,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
from __future__ import annotations
from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp
from . import css_match as cm
from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import]
from typing import Dict, Optional, Any, List, Iterator, Iterable
from typing import Optional, Any, Iterator, Iterable
__all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@ -44,17 +45,14 @@ SoupSieve = cm.SoupSieve
def compile( # noqa: A001
pattern: str,
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> cm.SoupSieve:
"""Compile CSS pattern."""
ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
if isinstance(pattern, SoupSieve):
if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
@ -64,7 +62,12 @@ def compile( # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern
return cp._cached_css_compile(pattern, ns, cs, flags)
return cp._cached_css_compile(
pattern,
ct.Namespaces(namespaces) if namespaces is not None else namespaces,
ct.CustomSelectors(custom) if custom is not None else custom,
flags
)
def purge() -> None:
@ -76,10 +79,10 @@ def purge() -> None:
def closest(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Match closest ancestor."""
@ -90,10 +93,10 @@ def closest(
def match(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> bool:
"""Match node."""
@ -104,12 +107,12 @@ def match(
def filter( # noqa: A001
select: str,
iterable: Iterable['bs4.Tag'],
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> List['bs4.Tag']:
) -> list['bs4.Tag']:
"""Filter list of nodes."""
return compile(select, namespaces, flags, **kwargs).filter(iterable)
@ -118,10 +121,10 @@ def filter( # noqa: A001
def select_one(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Select a single tag."""
@ -132,13 +135,13 @@ def select_one(
def select(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> List['bs4.Tag']:
) -> list['bs4.Tag']:
"""Select the specified tags."""
return compile(select, namespaces, flags, **kwargs).select(tag, limit)
@ -147,11 +150,11 @@ def select(
def iselect(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None,
namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[Dict[str, str]] = None,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> Iterator['bs4.Tag']:
"""Iterate the specified tags."""


@ -1,4 +1,5 @@
"""Meta related things."""
from __future__ import annotations
from collections import namedtuple
import re
@ -83,7 +84,7 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
cls,
major: int, minor: int, micro: int, release: str = "final",
pre: int = 0, post: int = 0, dev: int = 0
) -> "Version":
) -> Version:
"""Validate version info."""
# Ensure all parts are positive integers.
@ -192,5 +193,5 @@ def parse_version(ver: str) -> Version:
return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 3, 2, "final", post=1)
__version_info__ = Version(2, 4, 0, "final")
__version__ = __version_info__._get_canonical()


@ -1,11 +1,12 @@
"""CSS matcher."""
from __future__ import annotations
from datetime import datetime
from . import util
import re
from . import css_types as ct
import unicodedata
import bs4 # type: ignore[import]
from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast
from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -64,12 +65,12 @@ class _FakeParent:
fake parent so we can traverse the root element as a child.
"""
def __init__(self, element: 'bs4.Tag') -> None:
def __init__(self, element: bs4.Tag) -> None:
"""Initialize."""
self.contents = [element]
def __len__(self) -> 'bs4.PageElement':
def __len__(self) -> bs4.PageElement:
"""Length."""
return len(self.contents)
@ -87,59 +88,59 @@ class _DocumentNav:
raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
@staticmethod
def is_doc(obj: 'bs4.Tag') -> bool:
def is_doc(obj: bs4.Tag) -> bool:
"""Is `BeautifulSoup` object."""
return isinstance(obj, bs4.BeautifulSoup)
@staticmethod
def is_tag(obj: 'bs4.PageElement') -> bool:
def is_tag(obj: bs4.PageElement) -> bool:
"""Is tag."""
return isinstance(obj, bs4.Tag)
@staticmethod
def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover
def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is declaration."""
return isinstance(obj, bs4.Declaration)
@staticmethod
def is_cdata(obj: 'bs4.PageElement') -> bool:
def is_cdata(obj: bs4.PageElement) -> bool:
"""Is CDATA."""
return isinstance(obj, bs4.CData)
@staticmethod
def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover
def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is processing instruction."""
return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod
def is_navigable_string(obj: 'bs4.PageElement') -> bool:
def is_navigable_string(obj: bs4.PageElement) -> bool:
"""Is navigable string."""
return isinstance(obj, bs4.NavigableString)
@staticmethod
def is_special_string(obj: 'bs4.PageElement') -> bool:
def is_special_string(obj: bs4.PageElement) -> bool:
"""Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
@classmethod
def is_content_string(cls, obj: 'bs4.PageElement') -> bool:
def is_content_string(cls, obj: bs4.PageElement) -> bool:
"""Check if node is content string."""
return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
@staticmethod
def create_fake_parent(el: 'bs4.Tag') -> _FakeParent:
def create_fake_parent(el: bs4.Tag) -> _FakeParent:
"""Create fake parent for a given element."""
return _FakeParent(el)
@staticmethod
def is_xml_tree(el: 'bs4.Tag') -> bool:
def is_xml_tree(el: bs4.Tag) -> bool:
"""Check if element (or document) is from a XML tree."""
return bool(el._is_xml)
def is_iframe(self, el: 'bs4.Tag') -> bool:
def is_iframe(self, el: bs4.Tag) -> bool:
"""Check if element is an `iframe`."""
return bool(
@ -147,7 +148,7 @@ class _DocumentNav:
self.is_html_tag(el) # type: ignore[attr-defined]
)
def is_root(self, el: 'bs4.Tag') -> bool:
def is_root(self, el: bs4.Tag) -> bool:
"""
Return whether element is a root element.
@ -161,7 +162,7 @@ class _DocumentNav:
root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
return root
def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']:
def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
"""Get contents or contents in reverse."""
if not no_iframe or not self.is_iframe(el):
for content in el.contents:
@ -169,12 +170,12 @@ class _DocumentNav:
def get_children(
self,
el: 'bs4.Tag',
el: bs4.Tag,
start: Optional[int] = None,
reverse: bool = False,
tags: bool = True,
no_iframe: bool = False
) -> Iterator['bs4.PageElement']:
) -> Iterator[bs4.PageElement]:
"""Get children."""
if not no_iframe or not self.is_iframe(el):
@ -195,10 +196,10 @@ class _DocumentNav:
def get_descendants(
self,
el: 'bs4.Tag',
el: bs4.Tag,
tags: bool = True,
no_iframe: bool = False
) -> Iterator['bs4.PageElement']:
) -> Iterator[bs4.PageElement]:
"""Get descendants."""
if not no_iframe or not self.is_iframe(el):
@ -229,7 +230,7 @@ class _DocumentNav:
if not tags or is_tag:
yield child
def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag':
def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
"""Get parent."""
parent = el.parent
@ -238,25 +239,25 @@ class _DocumentNav:
return parent
@staticmethod
def get_tag_name(el: 'bs4.Tag') -> Optional[str]:
def get_tag_name(el: bs4.Tag) -> Optional[str]:
"""Get tag."""
return cast(Optional[str], el.name)
@staticmethod
def get_prefix_name(el: 'bs4.Tag') -> Optional[str]:
def get_prefix_name(el: bs4.Tag) -> Optional[str]:
"""Get prefix."""
return cast(Optional[str], el.prefix)
@staticmethod
def get_uri(el: 'bs4.Tag') -> Optional[str]:
def get_uri(el: bs4.Tag) -> Optional[str]:
"""Get namespace `URI`."""
return cast(Optional[str], el.namespace)
@classmethod
def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get next sibling tag."""
sibling = el.next_sibling
@ -265,7 +266,7 @@ class _DocumentNav:
return sibling
@classmethod
def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get previous sibling tag."""
sibling = el.previous_sibling
@ -274,7 +275,7 @@ class _DocumentNav:
return sibling
@staticmethod
def has_html_ns(el: 'bs4.Tag') -> bool:
def has_html_ns(el: bs4.Tag) -> bool:
"""
Check if element has an HTML namespace.
@ -286,13 +287,13 @@ class _DocumentNav:
return bool(ns and ns == NS_XHTML)
@staticmethod
def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]:
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
"""Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@classmethod
def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]:
def normalize_value(cls, value: Any) -> str | Sequence[str]:
"""Normalize the value to be a string or list of strings."""
# Treat `None` as empty string.
@ -327,10 +328,10 @@ class _DocumentNav:
@classmethod
def get_attribute_by_name(
cls,
el: 'bs4.Tag',
el: bs4.Tag,
name: str,
default: Optional[Union[str, Sequence[str]]] = None
) -> Optional[Union[str, Sequence[str]]]:
default: Optional[str | Sequence[str]] = None
) -> Optional[str | Sequence[str]]:
"""Get attribute by name."""
value = default
@ -347,14 +348,14 @@ class _DocumentNav:
return value
@classmethod
def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]:
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
"""Iterate attributes."""
for k, v in el.attrs.items():
yield k, cls.normalize_value(v)
@classmethod
def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]:
def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
"""Get classes."""
classes = cls.get_attribute_by_name(el, 'class', [])
@ -362,14 +363,14 @@ class _DocumentNav:
classes = RE_NOT_WS.findall(classes)
return cast(Sequence[str], classes)
def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str:
def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
"""Get text."""
return ''.join(
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
)
def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]:
def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
"""Get Own Text."""
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
@ -423,10 +424,10 @@ class Inputs:
return 0 <= minutes <= 59
@classmethod
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
"""Parse the input value."""
parsed = None # type: Optional[Tuple[float, ...]]
parsed = None # type: Optional[tuple[float, ...]]
if value is None:
return value
if itype == "date":
@ -484,7 +485,7 @@ class CSSMatch(_DocumentNav):
def __init__(
self,
selectors: ct.SelectorList,
scope: 'bs4.Tag',
scope: bs4.Tag,
namespaces: Optional[ct.Namespaces],
flags: int
) -> None:
@ -492,11 +493,11 @@ class CSSMatch(_DocumentNav):
self.assert_valid_input(scope)
self.tag = scope
self.cached_meta_lang = [] # type: List[Tuple[str, str]]
self.cached_default_forms = [] # type: List[Tuple['bs4.Tag', 'bs4.Tag']]
self.cached_indeterminate_forms = [] # type: List[Tuple['bs4.Tag', str, bool]]
self.cached_meta_lang = [] # type: list[tuple[str, str]]
self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
self.selectors = selectors
self.namespaces = {} if namespaces is None else namespaces # type: Union[ct.Namespaces, Dict[str, str]]
self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
self.flags = flags
self.iframe_restrict = False
@ -527,7 +528,7 @@ class CSSMatch(_DocumentNav):
return self.is_xml or self.has_html_namespace
def get_tag_ns(self, el: 'bs4.Tag') -> str:
def get_tag_ns(self, el: bs4.Tag) -> str:
"""Get tag namespace."""
if self.supports_namespaces():
@ -539,24 +540,24 @@ class CSSMatch(_DocumentNav):
namespace = NS_XHTML
return namespace
def is_html_tag(self, el: 'bs4.Tag') -> bool:
def is_html_tag(self, el: bs4.Tag) -> bool:
"""Check if tag is in HTML namespace."""
return self.get_tag_ns(el) == NS_XHTML
def get_tag(self, el: 'bs4.Tag') -> Optional[str]:
def get_tag(self, el: bs4.Tag) -> Optional[str]:
"""Get tag."""
name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name
def get_prefix(self, el: 'bs4.Tag') -> Optional[str]:
def get_prefix(self, el: bs4.Tag) -> Optional[str]:
"""Get prefix."""
prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: 'bs4.Tag') -> Optional[int]:
def find_bidi(self, el: bs4.Tag) -> Optional[int]:
"""Get directionality from element text."""
for node in self.get_children(el, tags=False):
@ -600,13 +601,18 @@ class CSSMatch(_DocumentNav):
ranges = lang_range.split('-')
subtags = lang_tag.lower().split('-')
length = len(ranges)
slength = len(subtags)
rindex = 0
sindex = 0
r = ranges[rindex]
s = subtags[sindex]
# Empty specified language should match unspecified language attributes
if length == 1 and slength == 1 and not r and r == s:
return True
# Primary tag needs to match
if r != '*' and r != s:
if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
match = False
rindex += 1
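
This hunk tightens extended-language-range matching: an empty range now matches only an element whose lang attribute is itself empty/unspecified, and a bare '*' no longer matches an empty language subtag. A condensed sketch of just that primary-subtag guard, extracted from the logic above:

    def primary_subtag_matches(lang_range: str, lang_tag: str) -> bool:
        ranges = lang_range.split("-")
        subtags = lang_tag.lower().split("-")
        r, s = ranges[0], subtags[0]
        # Empty specified language should match unspecified language attributes.
        if len(ranges) == 1 and len(subtags) == 1 and not r and r == s:
            return True
        # Primary tag needs to match, and '*' must not match an empty subtag.
        return not ((r != "*" and r != s) or (r == "*" and len(subtags) == 1 and not s))

    print(primary_subtag_matches("", ""))         # True
    print(primary_subtag_matches("*", ""))        # False
    print(primary_subtag_matches("en", "en-US"))  # True
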
@ -645,10 +651,10 @@ class CSSMatch(_DocumentNav):
def match_attribute_name(
self,
el: 'bs4.Tag',
el: bs4.Tag,
attr: str,
prefix: Optional[str]
) -> Optional[Union[str, Sequence[str]]]:
) -> Optional[str | Sequence[str]]:
"""Match attribute name and return value if it exists."""
value = None
@ -696,7 +702,7 @@ class CSSMatch(_DocumentNav):
break
return value
def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match the namespace of the element."""
match = True
@ -717,7 +723,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
"""Match attributes."""
match = True
@ -736,7 +742,7 @@ class CSSMatch(_DocumentNav):
break
return match
def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match tag name."""
name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
@ -745,7 +751,7 @@ class CSSMatch(_DocumentNav):
name not in (self.get_tag(el), '*')
)
def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool:
def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
"""Match the tag."""
match = True
@ -757,7 +763,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match past relationship."""
found = False
@ -785,12 +791,12 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation)
return found
def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
"""Match future child."""
match = False
if recursive:
children = self.get_descendants # type: Callable[..., Iterator['bs4.Tag']]
children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
else:
children = self.get_children
for child in children(parent, no_iframe=self.iframe_restrict):
@ -799,7 +805,7 @@ class CSSMatch(_DocumentNav):
break
return match
def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match future relationship."""
found = False
@ -822,7 +828,7 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation)
return found
def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match relationship to other elements."""
found = False
@ -837,7 +843,7 @@ class CSSMatch(_DocumentNav):
return found
def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
"""Match element's ID."""
found = True
@ -847,7 +853,7 @@ class CSSMatch(_DocumentNav):
break
return found
def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool:
def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
"""Match element's classes."""
current_classes = self.get_classes(el)
@ -858,7 +864,7 @@ class CSSMatch(_DocumentNav):
break
return found
def match_root(self, el: 'bs4.Tag') -> bool:
def match_root(self, el: bs4.Tag) -> bool:
"""Match element as root."""
is_root = self.is_root(el)
@ -884,20 +890,20 @@ class CSSMatch(_DocumentNav):
sibling = self.get_next(sibling, tags=False)
return is_root
def match_scope(self, el: 'bs4.Tag') -> bool:
def match_scope(self, el: bs4.Tag) -> bool:
"""Match element as scope."""
return self.scope is el
def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool:
def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
"""Match tag type for `nth` matches."""
return(
return (
(self.get_tag(child) == self.get_tag(el)) and
(self.get_tag_ns(child) == self.get_tag_ns(el))
)
def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
"""Match `nth` elements."""
matched = True
@ -998,7 +1004,7 @@ class CSSMatch(_DocumentNav):
break
return matched
def match_empty(self, el: 'bs4.Tag') -> bool:
def match_empty(self, el: bs4.Tag) -> bool:
"""Check if element is empty (if requested)."""
is_empty = True
@ -1011,7 +1017,7 @@ class CSSMatch(_DocumentNav):
break
return is_empty
def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool:
def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
"""Match selectors."""
match = True
@ -1020,11 +1026,11 @@ class CSSMatch(_DocumentNav):
match = False
return match
def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool:
def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
"""Match element if it contains text."""
match = True
content = None # type: Optional[Union[str, Sequence[str]]]
content = None # type: Optional[str | Sequence[str]]
for contain_list in contains:
if content is None:
if contain_list.own:
@ -1048,7 +1054,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
def match_default(self, el: 'bs4.Tag') -> bool:
def match_default(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
@ -1087,13 +1093,13 @@ class CSSMatch(_DocumentNav):
break
return match
def match_indeterminate(self, el: 'bs4.Tag') -> bool:
def match_indeterminate(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
name = cast(str, self.get_attribute_by_name(el, 'name'))
def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']:
def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
"""Find this input's form."""
form = None
parent = self.get_parent(el, no_iframe=True)
@ -1148,7 +1154,7 @@ class CSSMatch(_DocumentNav):
return match
def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool:
def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
"""Match languages."""
match = False
@ -1183,7 +1189,7 @@ class CSSMatch(_DocumentNav):
break
# Use cached meta language.
if not found_lang and self.cached_meta_lang:
if found_lang is None and self.cached_meta_lang:
for cache in self.cached_meta_lang:
if root is cache[0]:
found_lang = cache[1]
@ -1217,13 +1223,13 @@ class CSSMatch(_DocumentNav):
found_lang = content
self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
break
if found_lang:
if found_lang is not None:
break
if not found_lang:
if found_lang is None:
self.cached_meta_lang.append((cast(str, root), ''))
# If we determined a language, compare.
if found_lang:
if found_lang is not None:
for patterns in langs:
match = False
for pattern in patterns:
@ -1234,7 +1240,7 @@ class CSSMatch(_DocumentNav):
return match
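
The `:lang()` handling above switches from truthiness tests to explicit `is None` checks, presumably so that an explicit empty `lang=""` value counts as a determined (empty) language instead of falling through to the cached `<meta>` fallback. A hedged sketch of the distinction, independent of soupsieve:

```python
# Sketch of the falsy-vs-None distinction behind the change above.
found_lang = ""  # an explicit lang="" attribute was seen

# Old-style check: conflates "" with "nothing found" and keeps falling back.
if not found_lang:
    print("old check: treats lang='' as undetermined")

# New-style check: only None means "nothing found"; "" is a real (empty) value.
if found_lang is None:
    print("new check: undetermined")
else:
    print("new check: language determined as %r" % found_lang)
```
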
def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool:
def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
"""Check directionality."""
# If we have to match both left and right, we can't match either.
@ -1297,7 +1303,7 @@ class CSSMatch(_DocumentNav):
# Match parents direction
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
def match_range(self, el: 'bs4.Tag', condition: int) -> bool:
def match_range(self, el: bs4.Tag, condition: int) -> bool:
"""
Match range.
@ -1337,7 +1343,7 @@ class CSSMatch(_DocumentNav):
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
def match_defined(self, el: 'bs4.Tag') -> bool:
def match_defined(self, el: bs4.Tag) -> bool:
"""
Match defined.
@ -1360,7 +1366,7 @@ class CSSMatch(_DocumentNav):
)
)
def match_placeholder_shown(self, el: 'bs4.Tag') -> bool:
def match_placeholder_shown(self, el: bs4.Tag) -> bool:
"""
Match placeholder shown according to HTML spec.
@ -1375,7 +1381,7 @@ class CSSMatch(_DocumentNav):
return match
def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool:
def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
"""Check if element matches one of the selectors."""
match = False
@ -1459,7 +1465,7 @@ class CSSMatch(_DocumentNav):
return match
def select(self, limit: int = 0) -> Iterator['bs4.Tag']:
def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
"""Match all tags under the targeted tag."""
lim = None if limit < 1 else limit
@ -1472,7 +1478,7 @@ class CSSMatch(_DocumentNav):
if lim < 1:
break
def closest(self) -> Optional['bs4.Tag']:
def closest(self) -> Optional[bs4.Tag]:
"""Match closest ancestor."""
current = self.tag
@ -1484,12 +1490,12 @@ class CSSMatch(_DocumentNav):
current = self.get_parent(current)
return closest
def filter(self) -> List['bs4.Tag']: # noqa A001
def filter(self) -> list[bs4.Tag]: # noqa A001
"""Filter tag's children."""
return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
def match(self, el: 'bs4.Tag') -> bool:
def match(self, el: bs4.Tag) -> bool:
"""Match."""
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
@ -1501,7 +1507,7 @@ class SoupSieve(ct.Immutable):
pattern: str
selectors: ct.SelectorList
namespaces: Optional[ct.Namespaces]
custom: Dict[str, str]
custom: dict[str, str]
flags: int
__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
@ -1524,17 +1530,17 @@ class SoupSieve(ct.Immutable):
flags=flags
)
def match(self, tag: 'bs4.Tag') -> bool:
def match(self, tag: bs4.Tag) -> bool:
"""Match."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag':
def closest(self, tag: bs4.Tag) -> bs4.Tag:
"""Match closest ancestor."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001
def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
"""
Filter.
@ -1551,18 +1557,18 @@ class SoupSieve(ct.Immutable):
else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag':
def select_one(self, tag: bs4.Tag) -> bs4.Tag:
"""Select a single tag."""
tags = self.select(tag, limit=1)
return tags[0] if tags else None
def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']:
def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
"""Select the specified tags."""
return list(self.iselect(tag, limit))
def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']:
def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
"""Iterate the specified tags."""
for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
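
For context, a short usage sketch of the precompiled `SoupSieve` object whose method signatures are retyped above; the calls follow the public soupsieve API (`compile`, `match`, `select`, `select_one`, `iselect`, `filter`):

```python
# Usage sketch of the SoupSieve methods retyped above (public soupsieve API).
import soupsieve as sv
from bs4 import BeautifulSoup

soup = BeautifulSoup("<ul><li class='a'>1</li><li>2</li></ul>", "html.parser")
pattern = sv.compile("li.a")

print(pattern.match(soup.li))                         # True: first <li> matches
print(pattern.select(soup))                           # list of matching tags
print(pattern.select_one(soup))                       # first match or None
print([t.get_text() for t in pattern.iselect(soup)])  # lazy iteration
print(pattern.filter(soup.ul))                        # matching direct children of <ul>
```
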

View file

@ -1,4 +1,5 @@
"""CSS selector parser."""
from __future__ import annotations
import re
from functools import lru_cache
from . import util
@ -6,7 +7,7 @@ from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast
from typing import Optional, Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -232,7 +233,7 @@ def _purge_cache() -> None:
_cached_css_compile.cache_clear()
def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]:
def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
"""Process custom."""
custom_selectors = {}
@ -325,7 +326,7 @@ class SelectorPattern:
class SpecialPseudoPattern(SelectorPattern):
"""Selector pattern."""
def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None:
def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
"""Initialize."""
self.patterns = {}
@ -372,19 +373,19 @@ class _Selector:
"""Initialize."""
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
self.ids = kwargs.get('ids', []) # type: List[str]
self.classes = kwargs.get('classes', []) # type: List[str]
self.attributes = kwargs.get('attributes', []) # type: List[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) # type: List[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) # type: List[ct.SelectorList]
self.relations = kwargs.get('relations', []) # type: List[_Selector]
self.ids = kwargs.get('ids', []) # type: list[str]
self.classes = kwargs.get('classes', []) # type: list[str]
self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
self.contains = kwargs.get('contains', []) # type: List[ct.SelectorContains]
self.lang = kwargs.get('lang', []) # type: List[ct.SelectorLang]
self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) # type: int
self.no_match = kwargs.get('no_match', False) # type: bool
def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList:
def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
"""Freeze relation."""
if relations:
@ -394,7 +395,7 @@ class _Selector:
else:
return ct.SelectorList()
def freeze(self) -> Union[ct.Selector, ct.SelectorNull]:
def freeze(self) -> ct.Selector | ct.SelectorNull:
"""Freeze self."""
if self.no_match:
@ -461,7 +462,7 @@ class CSSParser:
def __init__(
self,
selector: str,
custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None,
custom: Optional[dict[str, str | ct.SelectorList]] = None,
flags: int = 0
) -> None:
"""Initialize."""
@ -583,9 +584,9 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]],
iselector: Iterator[tuple[str, Match[str]]],
is_html: bool
) -> Tuple[bool, bool]:
) -> tuple[bool, bool]:
"""Parse pseudo class."""
complex_pseudo = False
@ -678,7 +679,7 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]]
iselector: Iterator[tuple[str, Match[str]]]
) -> bool:
"""Parse `nth` pseudo."""
@ -743,7 +744,7 @@ class CSSParser:
sel: _Selector,
name: str,
has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]],
iselector: Iterator[tuple[str, Match[str]]],
index: int
) -> bool:
"""Parse pseudo with opening bracket."""
@ -752,7 +753,7 @@ class CSSParser:
if name == ':not':
flags |= FLG_NOT
elif name == ':has':
flags |= FLG_RELATIVE | FLG_FORGIVE
flags |= FLG_RELATIVE
elif name in (':where', ':is'):
flags |= FLG_FORGIVE
@ -766,21 +767,16 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
selectors: List[_Selector],
selectors: list[_Selector],
rel_type: str,
index: int
) -> Tuple[bool, _Selector, str]:
) -> tuple[bool, _Selector, str]:
"""Parse combinator tokens."""
combinator = m.group('relation').strip()
if not combinator:
combinator = WS_COMBINATOR
if combinator == COMMA_COMBINATOR:
if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
sel.no_match = True
sel.rel_type = rel_type
selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR
@ -814,12 +810,12 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
selectors: List[_Selector],
relations: List[_Selector],
selectors: list[_Selector],
relations: list[_Selector],
is_pseudo: bool,
is_forgive: bool,
index: int
) -> Tuple[bool, _Selector]:
) -> tuple[bool, _Selector]:
"""Parse combinator tokens."""
combinator = m.group('relation').strip()
@ -924,7 +920,7 @@ class CSSParser:
def parse_selectors(
self,
iselector: Iterator[Tuple[str, Match[str]]],
iselector: Iterator[tuple[str, Match[str]]],
index: int = 0,
flags: int = 0
) -> ct.SelectorList:
@ -935,7 +931,7 @@ class CSSParser:
selectors = []
has_selector = False
closed = False
relations = [] # type: List[_Selector]
relations = [] # type: list[_Selector]
rel_type = ":" + WS_COMBINATOR
# Setup various flags
@ -1069,18 +1065,8 @@ class CSSParser:
selectors.append(sel)
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive:
if is_relative:
# Handle relative selectors pseudo-classes with empty slots like `:has()`
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
sel.rel_type = rel_type
sel.no_match = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
elif is_forgive and (not selectors or not relations):
# Handle normal pseudo-classes with empty slots like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
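
With `FLG_FORGIVE` no longer applied to `:has()` and the relative-selector branch that forgave empty slots removed, an empty `:has()` should now be rejected at compile time like any other syntax error, while list-style pseudo-classes such as `:is()`/`:where()` keep their forgiving path. A hedged sketch, assuming soupsieve re-exports `SelectorSyntaxError` at the package level:

```python
# Hedged sketch of the behavior change: empty :has() is no longer forgiven.
import soupsieve as sv
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>hi</p></div>", "html.parser")
print(sv.select("div:has(p)", soup))   # a normal :has() still matches the <div>

try:
    sv.compile("div:has()")            # empty slot: now assumed to be a syntax error
except sv.SelectorSyntaxError as exc:
    print("rejected:", exc)
```
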
@ -1112,7 +1098,7 @@ class CSSParser:
# Return selector list
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]:
def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
"""Iterate selector tokens."""
# Ignore whitespace and comments at start and end of pattern

View file

@ -1,7 +1,8 @@
"""CSS selector structure items."""
from __future__ import annotations
import copyreg
from .pretty import pretty
from typing import Any, Type, Tuple, Union, Dict, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
__all__ = (
'Selector',
@ -33,7 +34,7 @@ SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable:
"""Immutable."""
__slots__: Tuple[str, ...] = ('_hash',)
__slots__: tuple[str, ...] = ('_hash',)
_hash: int
@ -48,7 +49,7 @@ class Immutable:
super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
@classmethod
def __base__(cls) -> "Type[Immutable]":
def __base__(cls) -> "type[Immutable]":
"""Get base class."""
return cls
@ -99,7 +100,7 @@ class ImmutableDict(Mapping[Any, Any]):
def __init__(
self,
arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]
arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
) -> None:
"""Initialize."""
@ -107,7 +108,7 @@ class ImmutableDict(Mapping[Any, Any]):
self._d = dict(arg)
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
def _validate(self, arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]) -> None:
def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
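
`ImmutableDict` derives its hash from the sorted items, tagging each key and value with its type, so dictionaries with equal content hash equally regardless of insertion order while `1` and `'1'` stay distinct. A minimal standalone sketch of that hashing scheme (not the soupsieve class):

```python
# Standalone sketch of the hashing scheme used above.
def dict_hash(d: dict) -> int:
    # Sort items for order independence; include types so 1 and "1" differ.
    return hash(tuple((type(k), k, type(v), v) for k, v in sorted(d.items())))

print(dict_hash({"a": 1, "b": 2}) == dict_hash({"b": 2, "a": 1}))  # True
print(dict_hash({"a": 1}) == dict_hash({"a": "1"}))                # False
```
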
@ -147,12 +148,12 @@ class ImmutableDict(Mapping[Any, Any]):
class Namespaces(ImmutableDict):
"""Namespaces."""
def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize."""
super().__init__(arg)
def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
@ -165,12 +166,12 @@ class Namespaces(ImmutableDict):
class CustomSelectors(ImmutableDict):
"""Custom selectors."""
def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize."""
super().__init__(arg)
def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
@ -188,30 +189,30 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
)
tag: Optional['SelectorTag']
ids: Tuple[str, ...]
classes: Tuple[str, ...]
attributes: Tuple['SelectorAttribute', ...]
nth: Tuple['SelectorNth', ...]
selectors: Tuple['SelectorList', ...]
relation: 'SelectorList'
tag: Optional[SelectorTag]
ids: tuple[str, ...]
classes: tuple[str, ...]
attributes: tuple[SelectorAttribute, ...]
nth: tuple[SelectorNth, ...]
selectors: tuple[SelectorList, ...]
relation: SelectorList
rel_type: Optional[str]
contains: Tuple['SelectorContains', ...]
lang: Tuple['SelectorLang', ...]
contains: tuple[SelectorContains, ...]
lang: tuple[SelectorLang, ...]
flags: int
def __init__(
self,
tag: Optional['SelectorTag'],
ids: Tuple[str, ...],
classes: Tuple[str, ...],
attributes: Tuple['SelectorAttribute', ...],
nth: Tuple['SelectorNth', ...],
selectors: Tuple['SelectorList', ...],
relation: 'SelectorList',
tag: Optional[SelectorTag],
ids: tuple[str, ...],
classes: tuple[str, ...],
attributes: tuple[SelectorAttribute, ...],
nth: tuple[SelectorNth, ...],
selectors: tuple[SelectorList, ...],
relation: SelectorList,
rel_type: Optional[str],
contains: Tuple['SelectorContains', ...],
lang: Tuple['SelectorLang', ...],
contains: tuple[SelectorContains, ...],
lang: tuple[SelectorLang, ...],
flags: int
):
"""Initialize."""
@ -286,7 +287,7 @@ class SelectorContains(Immutable):
__slots__ = ("text", "own", "_hash")
text: Tuple[str, ...]
text: tuple[str, ...]
own: bool
def __init__(self, text: Iterable[str], own: bool) -> None:
@ -305,9 +306,9 @@ class SelectorNth(Immutable):
b: int
of_type: bool
last: bool
selectors: 'SelectorList'
selectors: SelectorList
def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: 'SelectorList') -> None:
def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None:
"""Initialize."""
super().__init__(
@ -325,7 +326,7 @@ class SelectorLang(Immutable):
__slots__ = ("languages", "_hash",)
languages: Tuple[str, ...]
languages: tuple[str, ...]
def __init__(self, languages: Iterable[str]):
"""Initialize."""
@ -353,13 +354,13 @@ class SelectorList(Immutable):
__slots__ = ("selectors", "is_not", "is_html", "_hash")
selectors: Tuple[Union['Selector', 'SelectorNull'], ...]
selectors: tuple[Selector | SelectorNull, ...]
is_not: bool
is_html: bool
def __init__(
self,
selectors: Optional[Iterable[Union['Selector', 'SelectorNull']]] = None,
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
is_not: bool = False,
is_html: bool = False
) -> None:
@ -371,7 +372,7 @@ class SelectorList(Immutable):
is_html=is_html
)
def __iter__(self) -> Iterator[Union['Selector', 'SelectorNull']]:
def __iter__(self) -> Iterator[Selector | SelectorNull]:
"""Iterator."""
return iter(self.selectors)
@ -381,7 +382,7 @@ class SelectorList(Immutable):
return len(self.selectors)
def __getitem__(self, index: int) -> Union['Selector', 'SelectorNull']:
def __getitem__(self, index: int) -> Selector | SelectorNull:
"""Get item."""
return self.selectors[index]
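
Because the structures above are immutable and hashable, a compiled pattern can be indexed, measured, and reused; a hedged sketch of what that enables, assuming the public `selectors` attribute shown earlier in this diff:

```python
# Hedged sketch: compiled selector structures are hashable and sequence-like.
import soupsieve as sv

pattern = sv.compile("a.external, a[rel=nofollow]")
print(len(pattern.selectors))    # one frozen Selector per comma-separated part
print(pattern.selectors[0])      # indexable like a tuple
cache = {pattern: "link rules"}  # the compiled object itself can key a dict
print(cache[pattern])
```
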

View file

@ -65,6 +65,7 @@ SelectorList(
is_html=False)
```
"""
from __future__ import annotations
import re
from typing import Any

View file

@ -1,8 +1,9 @@
"""Utility."""
from __future__ import annotations
from functools import wraps, lru_cache
import warnings
import re
from typing import Callable, Any, Optional, Tuple, List
from typing import Callable, Any, Optional
DEBUG = 0x00001
@ -75,13 +76,13 @@ def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no co
)
def get_pattern_context(pattern: str, index: int) -> Tuple[str, int, int]:
def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
"""Get the pattern context."""
last = 0
current_line = 1
col = 1
text = [] # type: List[str]
text = [] # type: list[str]
line = 1
offset = None # type: Optional[int]

View file

@ -4,7 +4,7 @@ arrow==1.2.3
backports.csv==1.0.7
backports.functools-lru-cache==1.6.4
backports.zoneinfo==0.2.1;python_version<"3.9"
beautifulsoup4==4.11.1
beautifulsoup4==4.11.2
bleach==6.0.0
certifi==2022.12.7
cheroot==9.0.0