diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index b3c9feb8..db71cc7c 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
-Beautiful Soup works with Python 3.5 and up. It works better if lxml
+Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.11.1"
-__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
+__version__ = "4.11.2"
+__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@@ -211,7 +211,7 @@ class BeautifulSoup(Tag):
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name),
- DeprecationWarning
+ DeprecationWarning, stacklevel=3
)
return kwargs.pop(old_name)
return None
@@ -405,7 +405,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.',
- MarkupResemblesLocatorWarning
+ MarkupResemblesLocatorWarning,
+ stacklevel=3
)
return True
return False
@@ -436,7 +437,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into'
' Beautiful Soup.',
- MarkupResemblesLocatorWarning
+ MarkupResemblesLocatorWarning, stacklevel=3
)
return True
return False
@@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.',
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
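The stacklevel values in this file are chosen so each warning is reported at the line of user code that called into BeautifulSoup, rather than at a line inside the library. A minimal sketch of the mechanism (hypothetical helper names, not code from this patch):

    import warnings

    def _warn_about_markup():
        # stacklevel counts frames upward from this call:
        # 1 = this function, 2 = parse(), 3 = parse()'s caller.
        warnings.warn("input looks like a filename", stacklevel=3)

    def parse(markup):
        _warn_about_markup()
        return markup

    parse("index.html")  # the warning is attributed to this line
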
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index 9f789f3e..2e397458 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -122,7 +122,7 @@ class TreeBuilder(object):
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
- DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
# Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index 58bc176e..dac21732 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
- warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+ warnings.warn(
+ "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
+ stacklevel=3
+ )
# html5lib only parses HTML, so if it's given XML that's worth
# noting.
@@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
- warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+ warnings.warn(
+ "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+ stacklevel=4
+ )
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
@@ -249,9 +255,9 @@ class AttrList(object):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = self.element.cdata_list_attributes or {}
- if (name in list_attr.get('*')
+ if (name in list_attr.get('*', [])
or (self.element.name in list_attr
- and name in list_attr[self.element.name])):
+ and name in list_attr.get(self.element.name, []))):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
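Both the defaultdict(list) default above and the .get(..., []) lookups guard the same pattern: querying the multi-valued-attribute table for a wildcard or tag key that may be absent. A small sketch of the failure mode being avoided (toy table, not the real defaults):

    from collections import defaultdict

    table = {'a': ['rel']}          # no '*' entry, no 'td' entry
    name = 'headers'

    # name in table.get('*') raises TypeError because .get('*') is None;
    # the patched forms fall back to an empty list and simply return False.
    print(name in table.get('*', []))                  # False
    print(name in defaultdict(list, table)['td'])      # False
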
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index fae4d0f2..e48b6a0e 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser
-try:
- from html.parser import HTMLParseError
-except ImportError as e:
- # HTMLParseError is removed in Python 3.5. Since it can never be
- # thrown in 3.5, we can just define our own class as a placeholder.
- class HTMLParseError(Exception):
- pass
-
import sys
import warnings
-# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
-# argument, which we'd like to set to False. Unfortunately,
-# http://bugs.python.org/issue13273 makes strict=True a better bet
-# before Python 3.2.3.
-#
-# At the end of this file, we monkeypatch HTMLParser so that
-# strict=True works well on Python 3.2.2.
-major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
-CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
-CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
-
-
from bs4.element import (
CData,
Comment,
@@ -90,20 +69,7 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self.already_closed_empty_element = []
self._initialize_xml_detector()
-
- def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although
- this requirement doesn't appear to be documented.
- In Python 2, HTMLParser implements error() by raising an exception,
- which we don't want to do.
-
- In any event, this method is called only on very strange
- markup and our best strategy is to pretend it didn't happen
- and keep going.
- """
- warnings.warn(msg)
-
def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
@@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal.
"""
- # XXX workaround for a bug in HTMLParser. Remove this once
- # it's fixed in all supported versions.
- # http://bugs.python.org/issue13633
+ # TODO: This was originally a workaround for a bug in
+ # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+ # been fixed, but removing this code still makes some
+ # Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@@ -333,10 +300,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs)
- if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- parser_kwargs['strict'] = False
- if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- parser_kwargs['convert_charrefs'] = False
+ parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
@@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
- try:
- parser.feed(markup)
- parser.close()
- except HTMLParseError as e:
- warnings.warn(RuntimeWarning(
- "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
- raise e
+ parser.feed(markup)
+ parser.close()
parser.already_closed_empty_element = []
-
-# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
-# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
-# string.
-#
-# XXX This code can be removed once most Python 3 users are on 3.2.3.
-if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
- import re
- attrfind_tolerant = re.compile(
- r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
- HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
-
- locatestarttagend = re.compile(r"""
- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
- (?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
- )
- )?
- )
- )*
- \s* # trailing whitespace
-""", re.VERBOSE)
- BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
-
- from html.parser import tagfind, attrfind
-
- def parse_starttag(self, i):
- self.__starttag_text = None
- endpos = self.check_for_whole_start_tag(i)
- if endpos < 0:
- return endpos
- rawdata = self.rawdata
- self.__starttag_text = rawdata[i:endpos]
-
- # Now parse the data between i+1 and j into a tag and attrs
- attrs = []
- match = tagfind.match(rawdata, i+1)
- assert match, 'unexpected call to parse_starttag()'
- k = match.end()
- self.lasttag = tag = rawdata[i+1:k].lower()
- while k < endpos:
- if self.strict:
- m = attrfind.match(rawdata, k)
- else:
- m = attrfind_tolerant.match(rawdata, k)
- if not m:
- break
- attrname, rest, attrvalue = m.group(1, 2, 3)
- if not rest:
- attrvalue = None
- elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
- attrvalue[:1] == '"' == attrvalue[-1:]:
- attrvalue = attrvalue[1:-1]
- if attrvalue:
- attrvalue = self.unescape(attrvalue)
- attrs.append((attrname.lower(), attrvalue))
- k = m.end()
-
- end = rawdata[k:endpos].strip()
- if end not in (">", "/>"):
- lineno, offset = self.getpos()
- if "\n" in self.__starttag_text:
- lineno = lineno + self.__starttag_text.count("\n")
- offset = len(self.__starttag_text) \
- - self.__starttag_text.rfind("\n")
- else:
- offset = offset + len(self.__starttag_text)
- if self.strict:
- self.error("junk characters in start tag: %r"
- % (rawdata[k:endpos][:20],))
- self.handle_data(rawdata[i:endpos])
- return endpos
- if end.endswith('/>'):
- # XHTML-style empty tag:
- self.handle_startendtag(tag, attrs)
- else:
- self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
- return endpos
-
- def set_cdata_mode(self, elem):
- self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
-
- BeautifulSoupHTMLParser.parse_starttag = parse_starttag
- BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
-
- CONSTRUCTOR_TAKES_STRICT = True
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index 74b1dc0f..583d0e8a 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -496,13 +496,16 @@ class PageElement(object):
def extend(self, tags):
"""Appends the given PageElements to this one's contents.
- :param tags: A list of PageElements.
+ :param tags: A list of PageElements. If a single Tag is
+ provided instead, this PageElement's contents will be extended
+ with that Tag's contents.
"""
if isinstance(tags, Tag):
- # Calling self.append() on another tag's contents will change
- # the list we're iterating over. Make a list that won't
- # change.
- tags = list(tags.contents)
+ tags = tags.contents
+ if isinstance(tags, list):
+ # Moving items around the tree may change their position in
+ # the original list. Make a list that won't change.
+ tags = list(tags)
for tag in tags:
self.append(tag)
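For context, the behavior the new extend() docstring describes: passing a Tag moves that tag's children into the target element, equivalent to passing tag.contents. A quick sketch against the patched API (markup invented for illustration):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div id="d1"><a>1</a><a>2</a></div><div id="d2"></div>', 'html.parser'
    )
    d1, d2 = soup.find(id='d1'), soup.find(id='d2')
    d2.extend(d1)          # same effect as d2.extend(d1.contents)
    print(d1)              # <div id="d1"></div>
    print(d2)              # <div id="d2"><a>1</a><a>2</a></div>
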
@@ -586,8 +589,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements.
"""
+ _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements,
- **kwargs)
+ _stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@@ -624,8 +628,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit,
- self.next_siblings, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
@@ -663,8 +670,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit, self.previous_elements,
- **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit, self.previous_elements,
+ _stacklevel=_stacklevel+1, **kwargs
+ )
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
@@ -702,8 +712,11 @@ class PageElement(object):
:return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet
"""
- return self._find_all(name, attrs, string, limit,
- self.previous_siblings, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
@@ -724,7 +737,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
- l = self.find_parents(name, attrs, 1, **kwargs)
+ l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
@@ -744,8 +757,9 @@ class PageElement(object):
:return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
+ _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
- **kwargs)
+ _stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@@ -771,19 +785,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs):
r = None
- l = method(name, attrs, string, 1, **kwargs)
+ l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
+ _stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
- DeprecationWarning
+ DeprecationWarning, stacklevel=_stacklevel
)
if isinstance(name, SoupStrainer):
@@ -1306,7 +1321,8 @@ class Tag(PageElement):
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
- preserve_whitespace_tags=self.preserve_whitespace_tags
+ preserve_whitespace_tags=self.preserve_whitespace_tags,
+ interesting_string_types=self.interesting_string_types
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
@@ -1558,7 +1574,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name
),
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
return self.find(tag_name)
# We special case contents to avoid recursion.
@@ -1862,7 +1878,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString
"""
r = None
- l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
+ l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
+ **kwargs)
if l:
r = l[0]
return r
@@ -1889,7 +1906,9 @@ class Tag(PageElement):
generator = self.descendants
if not recursive:
generator = self.children
- return self._find_all(name, attrs, string, limit, generator, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(name, attrs, string, limit, generator,
+ _stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
@@ -1993,7 +2012,7 @@ class Tag(PageElement):
"""
warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.',
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
return self.has_attr(key)
@@ -2024,7 +2043,7 @@ class SoupStrainer(object):
string = kwargs.pop('text')
warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
- DeprecationWarning
+ DeprecationWarning, stacklevel=2
)
self.name = self._normalize_search_value(name)
diff --git a/lib/bs4/formatter.py b/lib/bs4/formatter.py
index 65e57b57..83cc1c5c 100644
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
- return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
+ super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
"""A generic Formatter for XML."""
REGISTRY = {}
def __init__(self, *args, **kwargs):
- return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
+ super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py
index 4af4b0ce..f4d62db9 100644
--- a/lib/bs4/tests/__init__.py
+++ b/lib/bs4/tests/__init__.py
@@ -29,6 +29,29 @@ from bs4.builder import (
)
default_builder = HTMLParserTreeBuilder
+# Some tests depend on specific third-party libraries. We use
+# @pytest.mark.skipif with the following conditionals to skip those
+# tests if the libraries are not installed.
+try:
+ from soupsieve import SelectorSyntaxError
+ SOUP_SIEVE_PRESENT = True
+except ImportError:
+ SOUP_SIEVE_PRESENT = False
+
+try:
+ import html5lib
+ HTML5LIB_PRESENT = True
+except ImportError:
+ HTML5LIB_PRESENT = False
+
+try:
+ import lxml.etree
+ LXML_PRESENT = True
+ LXML_VERSION = lxml.etree.LXML_VERSION
+except ImportError:
+ LXML_PRESENT = False
+ LXML_VERSION = (0,)
+
BAD_DOCUMENT = """A bare string
@@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object):
@pytest.mark.parametrize(
"multi_valued_attributes",
- [None, dict(b=['class']), {'*': ['notclass']}]
+ [None, {}, dict(b=['class']), {'*': ['notclass']}]
)
def test_attribute_not_multi_valued(self, multi_valued_attributes):
- markup = ''
+ markup = ''
soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
assert soup.a['class'] == 'a b c'
@@ -820,26 +843,27 @@ Hello, world!
soup = self.soup(text)
assert soup.p.encode("utf-8") == expected
- def test_real_iso_latin_document(self):
+ def test_real_iso_8859_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
- # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        # Here it is in Unicode. Note that it claims to be in ISO-8859-1.
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
- # That's because we're going to encode it into ISO-Latin-1, and use
- # that to test.
+ # That's because we're going to encode it into ISO-8859-1,
+ # and use that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
- # Parse the ISO-Latin-1 HTML.
+ # Parse the ISO-8859-1 HTML.
soup = self.soup(iso_latin_html)
+
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
- # UTF-8 instead of ISO-Latin-1.
- expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+ # UTF-8 instead of ISO-8859-1.
+ expected = unicode_html.replace("ISO-8859-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
@@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
assert isinstance(soup.contents[0], Comment)
assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?'
assert "html" == soup.contents[0].next_element.name
-
-def skipIf(condition, reason):
- def nothing(test, *args, **kwargs):
- return None
-
- def decorator(test_item):
- if condition:
- return nothing
- else:
- return test_item
-
- return decorator
diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py
index 5fa874c8..9327174f 100644
--- a/lib/bs4/tests/test_builder_registry.py
+++ b/lib/bs4/tests/test_builder_registry.py
@@ -10,22 +10,23 @@ from bs4.builder import (
TreeBuilderRegistry,
)
-try:
- from bs4.builder import HTML5TreeBuilder
- HTML5LIB_PRESENT = True
-except ImportError:
- HTML5LIB_PRESENT = False
+from . import (
+ HTML5LIB_PRESENT,
+ LXML_PRESENT,
+)
-try:
+if HTML5LIB_PRESENT:
+ from bs4.builder import HTML5TreeBuilder
+
+if LXML_PRESENT:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
- LXML_PRESENT = True
-except ImportError:
- LXML_PRESENT = False
+# TODO: Split out the lxml and html5lib tests into their own classes
+# and gate with @pytest.mark.skipif.
class TestBuiltInRegistry(object):
"""Test the built-in registry with the default builders registered."""
diff --git a/lib/bs4/tests/test_dammit.py b/lib/bs4/tests/test_dammit.py
index 9971234e..9aad0ac6 100644
--- a/lib/bs4/tests/test_dammit.py
+++ b/lib/bs4/tests/test_dammit.py
@@ -17,26 +17,24 @@ class TestUnicodeDammit(object):
dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == markup
- def test_smart_quotes_to_unicode(self):
+ @pytest.mark.parametrize(
+ "smart_quotes_to,expect_converted",
+ [(None, "\u2018\u2019\u201c\u201d"),
+         ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
+         ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
+ ("ascii", "''" + '""'),
+ ]
+ )
+ def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
+ """Verify the functionality of the smart_quotes_to argument
+ to the UnicodeDammit constructor."""
markup = b"\x91\x92\x93\x94"
- dammit = UnicodeDammit(markup)
- assert dammit.unicode_markup == "\u2018\u2019\u201c\u201d"
-
- def test_smart_quotes_to_xml_entities(self):
- markup = b"\x91\x92\x93\x94"
- dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        assert dammit.unicode_markup == "&#x2018;&#x2019;&#x201C;&#x201D;"
-
- def test_smart_quotes_to_html_entities(self):
- markup = b"\x91\x92\x93\x94"
- dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        assert dammit.unicode_markup == "&lsquo;&rsquo;&ldquo;&rdquo;"
-
- def test_smart_quotes_to_ascii(self):
- markup = b"\x91\x92\x93\x94"
- dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
- assert dammit.unicode_markup == """''"""""
-
+ converted = UnicodeDammit(
+ markup, known_definite_encodings=["windows-1252"],
+ smart_quotes_to=smart_quotes_to
+ ).unicode_markup
+ assert converted == "{}".format(expect_converted)
+
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
@@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
def setup_method(self):
self.sub = EntitySubstitution
- def test_simple_html_substitution(self):
- # Unicode characters corresponding to named HTML entites
- # are substituted, and no others.
- s = "foo\u2200\N{SNOWMAN}\u00f5bar"
-        assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
- def test_smart_quote_substitution(self):
- # MS smart quotes are a common source of frustration, so we
- # give them a special test.
- quotes = b"\x91\x92foo\x93\x94"
- dammit = UnicodeDammit(quotes)
-        assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"
+ @pytest.mark.parametrize(
+ "original,substituted",
+ [
+ # Basic case. Unicode characters corresponding to named
+            # HTML entities are substituted; others are not.
+            ("foo\u2200\N{SNOWMAN}\u00f5bar",
+             "foo&forall;\N{SNOWMAN}&otilde;bar"),
+            # MS smart quotes are a common source of frustration, so we
+            # give them a special test.
+            ('‘’foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
+ ]
+ )
+ def test_substitute_html(self, original, substituted):
+ assert self.sub.substitute_html(original) == substituted
+
def test_html5_entity(self):
- # Some HTML5 entities correspond to single- or multi-character
- # Unicode sequences.
-
for entity, u in (
# A few spot checks of our ability to recognize
# special character sequences and convert them
diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py
index b32ab304..4197720f 100644
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@@ -1,27 +1,26 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
+import pytest
import warnings
-try:
- from bs4.builder import HTML5TreeBuilder
- HTML5LIB_PRESENT = True
-except ImportError as e:
- HTML5LIB_PRESENT = False
+from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from . import (
+ HTML5LIB_PRESENT,
HTML5TreeBuilderSmokeTest,
SoupTest,
- skipIf,
)
-@skipIf(
+@pytest.mark.skipif(
not HTML5LIB_PRESENT,
- "html5lib seems not to be present, not testing its tree builder.")
+ reason="html5lib seems not to be present, not testing its tree builder."
+)
class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
+ from bs4.builder import HTML5TreeBuilder
return HTML5TreeBuilder
def test_soupstrainer(self):
@@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
- soup = self.soup(markup, parse_only=strainer)
+ soup = BeautifulSoup(markup, "html5lib", parse_only=strainer)
assert soup.decode() == self.document_for(markup)
- assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message)
+ [warning] = w
+ assert warning.filename == __file__
+ assert "the html5lib tree builder doesn't support parse_only" in str(warning.message)
def test_correctly_nested_tables(self):
"""html5lib inserts tags where other parsers don't."""
diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py
index bfcfa1f3..470d3936 100644
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with_element = div.encode(formatter="html")
        expect = b"<div>%s</div>" % output_element
assert with_element == expect
-
-class TestHTMLParserSubclass(SoupTest):
- def test_error(self):
- """Verify that our HTMLParser subclass implements error() in a way
- that doesn't cause a crash.
- """
- parser = BeautifulSoupHTMLParser()
- with warnings.catch_warnings(record=True) as warns:
- parser.error("don't crash")
- [warning] = warns
- assert "don't crash" == str(warning.message)
-
diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py
index 396ca0ef..c7bf45d3 100644
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@@ -1,16 +1,10 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import pickle
+import pytest
import re
import warnings
-
-try:
- import lxml.etree
- LXML_PRESENT = True
- LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError as e:
- LXML_PRESENT = False
- LXML_VERSION = (0,)
+from . import LXML_PRESENT, LXML_VERSION
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
@@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer
from . import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
+ SOUP_SIEVE_PRESENT,
SoupTest,
- skipIf,
)
-@skipIf(
+@pytest.mark.skipif(
not LXML_PRESENT,
- "lxml seems not to be present, not testing its tree builder.")
+ reason="lxml seems not to be present, not testing its tree builder."
+)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
- @skipIf(
+ @pytest.mark.skipif(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
- "Skipping doctype test for old version of lxml to avoid segfault.")
+ reason="Skipping doctype test for old version of lxml to avoid segfault."
+ )
def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
@@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
        assert "<b/>" == str(soup.b)
- assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)
+ [warning] = w
+ assert warning.filename == __file__
+ assert "BeautifulStoneSoup class is deprecated" in str(warning.message)
def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from
@@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
assert "sourceline" == soup.p.sourceline.name
assert "sourcepos" == soup.p.sourcepos.name
-@skipIf(
+@pytest.mark.skipif(
not LXML_PRESENT,
- "lxml seems not to be present, not testing its XML tree builder.")
+ reason="lxml seems not to be present, not testing its XML tree builder."
+)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
}
+ @pytest.mark.skipif(
+ not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
+ )
def test_namespace_interaction_with_select_and_find(self):
# Demonstrate how namespaces interact with select* and
# find* methods.
diff --git a/lib/bs4/tests/test_pageelement.py b/lib/bs4/tests/test_pageelement.py
index 26783f2c..6674dadf 100644
--- a/lib/bs4/tests/test_pageelement.py
+++ b/lib/bs4/tests/test_pageelement.py
@@ -3,15 +3,18 @@ import copy
import pickle
import pytest
-from soupsieve import SelectorSyntaxError
-
from bs4 import BeautifulSoup
from bs4.element import (
Comment,
SoupStrainer,
)
-from . import SoupTest
+from . import (
+ SoupTest,
+ SOUP_SIEVE_PRESENT,
+)
+if SOUP_SIEVE_PRESENT:
+ from soupsieve import SelectorSyntaxError
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
@@ -213,6 +216,7 @@ class TestFormatters(SoupTest):
assert soup.contents[0].name == 'pre'
+@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality.
@@ -694,6 +698,7 @@ class TestPersistence(SoupTest):
assert tag.can_be_empty_element == copied.can_be_empty_element
assert tag.cdata_list_attributes == copied.cdata_list_attributes
assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
+ assert tag.interesting_string_types == copied.interesting_string_types
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.
diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py
index 445f74da..64b8cf12 100644
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@@ -30,19 +30,11 @@ from bs4.element import (
from . import (
default_builder,
+ LXML_PRESENT,
SoupTest,
- skipIf,
)
import warnings
-
-try:
- from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
- LXML_PRESENT = True
-except ImportError as e:
- LXML_PRESENT = False
-PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
-
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
@@ -139,7 +131,7 @@ class TestConstructor(SoupTest):
assert " an id " == a['id']
assert ["a", "class"] == a['class']
- # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
+ # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
@@ -219,10 +211,17 @@ class TestConstructor(SoupTest):
class TestWarnings(SoupTest):
+ # Note that some of the tests in this class create BeautifulSoup
+ # objects directly rather than using self.soup(). That's
+ # because SoupTest.soup is defined in a different file,
+ # which will throw off the assertion in _assert_warning
+ # that the code that triggered the warning is in the same
+ # file as the test.
def _assert_warning(self, warnings, cls):
for w in warnings:
if isinstance(w.message, cls):
+ assert w.filename == __file__
return w
raise Exception("%s warning not found in %r" % (cls, warnings))
@@ -243,13 +242,17 @@ class TestWarnings(SoupTest):
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
-            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
+            soup = self.soup("<a><b></b></a>")
assert [] == w
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
- msg = str(w[0].message)
+ soup = BeautifulSoup(
+ "", "html.parser",
+                "<a><b></b></a>", "html.parser",
+ )
+ warning = self._assert_warning(w, DeprecationWarning)
+ msg = str(warning.message)
assert "parseOnlyThese" in msg
assert "parse_only" in msg
            assert b"<b></b>" == soup.encode()
@@ -257,8 +260,11 @@ class TestWarnings(SoupTest):
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
            utf8 = b"<html>\xc3\xa9</html>"
- soup = self.soup(utf8, fromEncoding="utf8")
- msg = str(w[0].message)
+ soup = BeautifulSoup(
+ utf8, "html.parser", fromEncoding="utf8"
+ )
+ warning = self._assert_warning(w, DeprecationWarning)
+ msg = str(warning.message)
assert "fromEncoding" in msg
assert "from_encoding" in msg
assert "utf8" == soup.original_encoding
@@ -276,7 +282,7 @@ class TestWarnings(SoupTest):
# A warning is issued if the "markup" looks like the name of
# an HTML or text file, or a full path to a file on disk.
with warnings.catch_warnings(record=True) as w:
- soup = self.soup("markup" + extension)
+ soup = BeautifulSoup("markup" + extension, "html.parser")
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
assert "looks more like a filename" in str(warning.message)
@@ -291,11 +297,11 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("markup" + extension)
assert [] == w
-
+
def test_url_warning_with_bytes_url(self):
url = b"http://www.crummybytes.com/"
with warnings.catch_warnings(record=True) as warning_list:
- soup = self.soup(url)
+ soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@@ -307,7 +313,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
- soup = self.soup(url)
+ soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning
)
@@ -347,18 +353,22 @@ class TestNewTag(SoupTest):
assert "foo" == new_tag.name
assert dict(bar="baz", name="a name") == new_tag.attrs
assert None == new_tag.parent
-
+
+ @pytest.mark.skipif(
+ not LXML_PRESENT,
+ reason="lxml not installed, cannot parse XML document"
+ )
+ def test_xml_tag_inherits_self_closing_rules_from_builder(self):
+ xml_soup = BeautifulSoup("", "xml")
+ xml_br = xml_soup.new_tag("br")
+ xml_p = xml_soup.new_tag("p")
+
+ # Both the
and tag are empty-element, just because
+ # they have no contents.
+ assert b"
" == xml_br.encode()
+ assert b"
" == xml_p.encode()
+
def test_tag_inherits_self_closing_rules_from_builder(self):
- if LXML_PRESENT:
- xml_soup = BeautifulSoup("", "lxml-xml")
- xml_br = xml_soup.new_tag("br")
- xml_p = xml_soup.new_tag("p")
-
-            # Both the <br> and <p> tag are empty-element, just because
-            # they have no contents.
-            assert b"<br/>" == xml_br.encode()
-            assert b"<p/>" == xml_p.encode()
-
html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p")
@@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
assert soup_from_unicode.encode('utf-8') == self.utf8_data
-
- @skipIf(
- PYTHON_3_PRE_3_2,
- "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
- def test_attribute_name_containing_unicode_characters(self):
- markup = ''
- assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")
-
-
-
diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py
index bfd6826e..26995f95 100644
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@@ -33,7 +33,6 @@ from bs4.element import (
)
from . import (
SoupTest,
- skipIf,
)
class TestFind(SoupTest):
@@ -910,12 +909,16 @@ class TestTreeModification(SoupTest):
soup.a.extend(l)
assert "" == soup.decode()
- def test_extend_with_another_tags_contents(self):
+ @pytest.mark.parametrize(
+ "get_tags", [lambda tag: tag, lambda tag: tag.contents]
+ )
+ def test_extend_with_another_tags_contents(self, get_tags):
data = ''
soup = self.soup(data)
d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2')
- d2.extend(d1)
+ tags = get_tags(d1)
+ d2.extend(tags)
assert '' == d1.decode()
assert '' == d2.decode()
@@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest):
class TestDeprecatedArguments(SoupTest):
- def test_find_type_method_string(self):
+ @pytest.mark.parametrize(
+ "method_name", [
+ "find", "find_all", "find_parent", "find_parents",
+ "find_next", "find_all_next", "find_previous",
+ "find_all_previous", "find_next_sibling", "find_next_siblings",
+ "find_previous_sibling", "find_previous_siblings",
+ ]
+ )
+ def test_find_type_method_string(self, method_name):
soup = self.soup("somemarkup")
+ method = getattr(soup.b, method_name)
with warnings.catch_warnings(record=True) as w:
- [result] = soup.find_all(text='markup')
- assert result == 'markup'
- assert result.parent.name == 'b'
- msg = str(w[0].message)
+ method(text='markup')
+ [warning] = w
+ assert warning.filename == __file__
+ msg = str(warning.message)
assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."
def test_soupstrainer_constructor_string(self):
with warnings.catch_warnings(record=True) as w:
strainer = SoupStrainer(text="text")
assert strainer.text == 'text'
- msg = str(w[0].message)
+ [warning] = w
+ msg = str(warning.message)
+ assert warning.filename == __file__
assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."
diff --git a/lib/soupsieve/__init__.py b/lib/soupsieve/__init__.py
index c89b7002..591a4f4f 100644
--- a/lib/soupsieve/__init__.py
+++ b/lib/soupsieve/__init__.py
@@ -25,13 +25,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
+from __future__ import annotations
from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp
from . import css_match as cm
from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import]
-from typing import Dict, Optional, Any, List, Iterator, Iterable
+from typing import Optional, Any, Iterator, Iterable
__all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@@ -44,17 +45,14 @@ SoupSieve = cm.SoupSieve
def compile( # noqa: A001
pattern: str,
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> cm.SoupSieve:
"""Compile CSS pattern."""
- ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
- cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
-
if isinstance(pattern, SoupSieve):
if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
@@ -64,7 +62,12 @@ def compile( # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern
- return cp._cached_css_compile(pattern, ns, cs, flags)
+ return cp._cached_css_compile(
+ pattern,
+ ct.Namespaces(namespaces) if namespaces is not None else namespaces,
+ ct.CustomSelectors(custom) if custom is not None else custom,
+ flags
+ )
def purge() -> None:
@@ -76,10 +79,10 @@ def purge() -> None:
def closest(
select: str,
tag: 'bs4.Tag',
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Match closest ancestor."""
@@ -90,10 +93,10 @@ def closest(
def match(
select: str,
tag: 'bs4.Tag',
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> bool:
"""Match node."""
@@ -104,12 +107,12 @@ def match(
def filter( # noqa: A001
select: str,
iterable: Iterable['bs4.Tag'],
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
-) -> List['bs4.Tag']:
+) -> list['bs4.Tag']:
"""Filter list of nodes."""
return compile(select, namespaces, flags, **kwargs).filter(iterable)
@@ -118,10 +121,10 @@ def filter( # noqa: A001
def select_one(
select: str,
tag: 'bs4.Tag',
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Select a single tag."""
@@ -132,13 +135,13 @@ def select_one(
def select(
select: str,
tag: 'bs4.Tag',
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
-) -> List['bs4.Tag']:
+) -> list['bs4.Tag']:
"""Select the specified tags."""
return compile(select, namespaces, flags, **kwargs).select(tag, limit)
@@ -147,11 +150,11 @@ def select(
def iselect(
select: str,
tag: 'bs4.Tag',
- namespaces: Optional[Dict[str, str]] = None,
+ namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
- custom: Optional[Dict[str, str]] = None,
+ custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> Iterator['bs4.Tag']:
"""Iterate the specified tags."""
diff --git a/lib/soupsieve/__meta__.py b/lib/soupsieve/__meta__.py
index 3bd6607f..5369314e 100644
--- a/lib/soupsieve/__meta__.py
+++ b/lib/soupsieve/__meta__.py
@@ -1,4 +1,5 @@
"""Meta related things."""
+from __future__ import annotations
from collections import namedtuple
import re
@@ -83,7 +84,7 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
cls,
major: int, minor: int, micro: int, release: str = "final",
pre: int = 0, post: int = 0, dev: int = 0
- ) -> "Version":
+ ) -> Version:
"""Validate version info."""
# Ensure all parts are positive integers.
@@ -192,5 +193,5 @@ def parse_version(ver: str) -> Version:
return Version(major, minor, micro, release, pre, post, dev)
-__version_info__ = Version(2, 3, 2, "final", post=1)
+__version_info__ = Version(2, 4, 0, "final")
__version__ = __version_info__._get_canonical()
diff --git a/lib/soupsieve/css_match.py b/lib/soupsieve/css_match.py
index 49e5f070..65752829 100644
--- a/lib/soupsieve/css_match.py
+++ b/lib/soupsieve/css_match.py
@@ -1,11 +1,12 @@
"""CSS matcher."""
+from __future__ import annotations
from datetime import datetime
from . import util
import re
from . import css_types as ct
import unicodedata
import bs4 # type: ignore[import]
-from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast
+from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@@ -64,12 +65,12 @@ class _FakeParent:
fake parent so we can traverse the root element as a child.
"""
- def __init__(self, element: 'bs4.Tag') -> None:
+ def __init__(self, element: bs4.Tag) -> None:
"""Initialize."""
self.contents = [element]
- def __len__(self) -> 'bs4.PageElement':
+ def __len__(self) -> bs4.PageElement:
"""Length."""
return len(self.contents)
@@ -87,59 +88,59 @@ class _DocumentNav:
raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
@staticmethod
- def is_doc(obj: 'bs4.Tag') -> bool:
+ def is_doc(obj: bs4.Tag) -> bool:
"""Is `BeautifulSoup` object."""
return isinstance(obj, bs4.BeautifulSoup)
@staticmethod
- def is_tag(obj: 'bs4.PageElement') -> bool:
+ def is_tag(obj: bs4.PageElement) -> bool:
"""Is tag."""
return isinstance(obj, bs4.Tag)
@staticmethod
- def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover
+ def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is declaration."""
return isinstance(obj, bs4.Declaration)
@staticmethod
- def is_cdata(obj: 'bs4.PageElement') -> bool:
+ def is_cdata(obj: bs4.PageElement) -> bool:
"""Is CDATA."""
return isinstance(obj, bs4.CData)
@staticmethod
- def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover
+ def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is processing instruction."""
return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod
- def is_navigable_string(obj: 'bs4.PageElement') -> bool:
+ def is_navigable_string(obj: bs4.PageElement) -> bool:
"""Is navigable string."""
return isinstance(obj, bs4.NavigableString)
@staticmethod
- def is_special_string(obj: 'bs4.PageElement') -> bool:
+ def is_special_string(obj: bs4.PageElement) -> bool:
"""Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
@classmethod
- def is_content_string(cls, obj: 'bs4.PageElement') -> bool:
+ def is_content_string(cls, obj: bs4.PageElement) -> bool:
"""Check if node is content string."""
return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
@staticmethod
- def create_fake_parent(el: 'bs4.Tag') -> _FakeParent:
+ def create_fake_parent(el: bs4.Tag) -> _FakeParent:
"""Create fake parent for a given element."""
return _FakeParent(el)
@staticmethod
- def is_xml_tree(el: 'bs4.Tag') -> bool:
+ def is_xml_tree(el: bs4.Tag) -> bool:
"""Check if element (or document) is from a XML tree."""
return bool(el._is_xml)
- def is_iframe(self, el: 'bs4.Tag') -> bool:
+ def is_iframe(self, el: bs4.Tag) -> bool:
"""Check if element is an `iframe`."""
return bool(
@@ -147,7 +148,7 @@ class _DocumentNav:
self.is_html_tag(el) # type: ignore[attr-defined]
)
- def is_root(self, el: 'bs4.Tag') -> bool:
+ def is_root(self, el: bs4.Tag) -> bool:
"""
Return whether element is a root element.
@@ -161,7 +162,7 @@ class _DocumentNav:
root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
return root
- def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']:
+ def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
"""Get contents or contents in reverse."""
if not no_iframe or not self.is_iframe(el):
for content in el.contents:
@@ -169,12 +170,12 @@ class _DocumentNav:
def get_children(
self,
- el: 'bs4.Tag',
+ el: bs4.Tag,
start: Optional[int] = None,
reverse: bool = False,
tags: bool = True,
no_iframe: bool = False
- ) -> Iterator['bs4.PageElement']:
+ ) -> Iterator[bs4.PageElement]:
"""Get children."""
if not no_iframe or not self.is_iframe(el):
@@ -195,10 +196,10 @@ class _DocumentNav:
def get_descendants(
self,
- el: 'bs4.Tag',
+ el: bs4.Tag,
tags: bool = True,
no_iframe: bool = False
- ) -> Iterator['bs4.PageElement']:
+ ) -> Iterator[bs4.PageElement]:
"""Get descendants."""
if not no_iframe or not self.is_iframe(el):
@@ -229,7 +230,7 @@ class _DocumentNav:
if not tags or is_tag:
yield child
- def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag':
+ def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
"""Get parent."""
parent = el.parent
@@ -238,25 +239,25 @@ class _DocumentNav:
return parent
@staticmethod
- def get_tag_name(el: 'bs4.Tag') -> Optional[str]:
+ def get_tag_name(el: bs4.Tag) -> Optional[str]:
"""Get tag."""
return cast(Optional[str], el.name)
@staticmethod
- def get_prefix_name(el: 'bs4.Tag') -> Optional[str]:
+ def get_prefix_name(el: bs4.Tag) -> Optional[str]:
"""Get prefix."""
return cast(Optional[str], el.prefix)
@staticmethod
- def get_uri(el: 'bs4.Tag') -> Optional[str]:
+ def get_uri(el: bs4.Tag) -> Optional[str]:
"""Get namespace `URI`."""
return cast(Optional[str], el.namespace)
@classmethod
- def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
+ def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get next sibling tag."""
sibling = el.next_sibling
@@ -265,7 +266,7 @@ class _DocumentNav:
return sibling
@classmethod
- def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
+ def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get previous sibling tag."""
sibling = el.previous_sibling
@@ -274,7 +275,7 @@ class _DocumentNav:
return sibling
@staticmethod
- def has_html_ns(el: 'bs4.Tag') -> bool:
+ def has_html_ns(el: bs4.Tag) -> bool:
"""
Check if element has an HTML namespace.
@@ -286,13 +287,13 @@ class _DocumentNav:
return bool(ns and ns == NS_XHTML)
@staticmethod
- def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]:
+ def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
"""Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@classmethod
- def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]:
+ def normalize_value(cls, value: Any) -> str | Sequence[str]:
"""Normalize the value to be a string or list of strings."""
# Treat `None` as empty string.
@@ -327,10 +328,10 @@ class _DocumentNav:
@classmethod
def get_attribute_by_name(
cls,
- el: 'bs4.Tag',
+ el: bs4.Tag,
name: str,
- default: Optional[Union[str, Sequence[str]]] = None
- ) -> Optional[Union[str, Sequence[str]]]:
+ default: Optional[str | Sequence[str]] = None
+ ) -> Optional[str | Sequence[str]]:
"""Get attribute by name."""
value = default
@@ -347,14 +348,14 @@ class _DocumentNav:
return value
@classmethod
- def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]:
+ def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
"""Iterate attributes."""
for k, v in el.attrs.items():
yield k, cls.normalize_value(v)
@classmethod
- def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]:
+ def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
"""Get classes."""
classes = cls.get_attribute_by_name(el, 'class', [])
@@ -362,14 +363,14 @@ class _DocumentNav:
classes = RE_NOT_WS.findall(classes)
return cast(Sequence[str], classes)
- def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str:
+ def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
"""Get text."""
return ''.join(
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
)
- def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]:
+ def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
"""Get Own Text."""
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
@@ -423,10 +424,10 @@ class Inputs:
return 0 <= minutes <= 59
@classmethod
- def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
+ def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
"""Parse the input value."""
- parsed = None # type: Optional[Tuple[float, ...]]
+ parsed = None # type: Optional[tuple[float, ...]]
if value is None:
return value
if itype == "date":
@@ -484,7 +485,7 @@ class CSSMatch(_DocumentNav):
def __init__(
self,
selectors: ct.SelectorList,
- scope: 'bs4.Tag',
+ scope: bs4.Tag,
namespaces: Optional[ct.Namespaces],
flags: int
) -> None:
@@ -492,11 +493,11 @@ class CSSMatch(_DocumentNav):
self.assert_valid_input(scope)
self.tag = scope
- self.cached_meta_lang = [] # type: List[Tuple[str, str]]
- self.cached_default_forms = [] # type: List[Tuple['bs4.Tag', 'bs4.Tag']]
- self.cached_indeterminate_forms = [] # type: List[Tuple['bs4.Tag', str, bool]]
+ self.cached_meta_lang = [] # type: list[tuple[str, str]]
+ self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
+ self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
self.selectors = selectors
- self.namespaces = {} if namespaces is None else namespaces # type: Union[ct.Namespaces, Dict[str, str]]
+ self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
self.flags = flags
self.iframe_restrict = False
@@ -527,7 +528,7 @@ class CSSMatch(_DocumentNav):
return self.is_xml or self.has_html_namespace
- def get_tag_ns(self, el: 'bs4.Tag') -> str:
+ def get_tag_ns(self, el: bs4.Tag) -> str:
"""Get tag namespace."""
if self.supports_namespaces():
@@ -539,24 +540,24 @@ class CSSMatch(_DocumentNav):
namespace = NS_XHTML
return namespace
- def is_html_tag(self, el: 'bs4.Tag') -> bool:
+ def is_html_tag(self, el: bs4.Tag) -> bool:
"""Check if tag is in HTML namespace."""
return self.get_tag_ns(el) == NS_XHTML
- def get_tag(self, el: 'bs4.Tag') -> Optional[str]:
+ def get_tag(self, el: bs4.Tag) -> Optional[str]:
"""Get tag."""
name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name
- def get_prefix(self, el: 'bs4.Tag') -> Optional[str]:
+ def get_prefix(self, el: bs4.Tag) -> Optional[str]:
"""Get prefix."""
prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
- def find_bidi(self, el: 'bs4.Tag') -> Optional[int]:
+ def find_bidi(self, el: bs4.Tag) -> Optional[int]:
"""Get directionality from element text."""
for node in self.get_children(el, tags=False):
@@ -600,13 +601,18 @@ class CSSMatch(_DocumentNav):
ranges = lang_range.split('-')
subtags = lang_tag.lower().split('-')
length = len(ranges)
+ slength = len(subtags)
rindex = 0
sindex = 0
r = ranges[rindex]
s = subtags[sindex]
+ # Empty specified language should match unspecified language attributes
+ if length == 1 and slength == 1 and not r and r == s:
+ return True
+
# Primary tag needs to match
- if r != '*' and r != s:
+ if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
match = False
rindex += 1
@@ -645,10 +651,10 @@ class CSSMatch(_DocumentNav):
def match_attribute_name(
self,
- el: 'bs4.Tag',
+ el: bs4.Tag,
attr: str,
prefix: Optional[str]
- ) -> Optional[Union[str, Sequence[str]]]:
+ ) -> Optional[str | Sequence[str]]:
"""Match attribute name and return value if it exists."""
value = None
@@ -696,7 +702,7 @@ class CSSMatch(_DocumentNav):
break
return value
- def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
+ def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match the namespace of the element."""
match = True
@@ -717,7 +723,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
- def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
+ def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
"""Match attributes."""
match = True
@@ -736,7 +742,7 @@ class CSSMatch(_DocumentNav):
break
return match
- def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
+ def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match tag name."""
name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
@@ -745,7 +751,7 @@ class CSSMatch(_DocumentNav):
name not in (self.get_tag(el), '*')
)
- def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool:
+ def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
"""Match the tag."""
match = True
@@ -757,7 +763,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
- def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+ def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match past relationship."""
found = False
@@ -785,12 +791,12 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation)
return found
- def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
+ def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
"""Match future child."""
match = False
if recursive:
- children = self.get_descendants # type: Callable[..., Iterator['bs4.Tag']]
+ children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
else:
children = self.get_children
for child in children(parent, no_iframe=self.iframe_restrict):
@@ -799,7 +805,7 @@ class CSSMatch(_DocumentNav):
break
return match
- def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+ def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match future relationship."""
found = False
@@ -822,7 +828,7 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation)
return found
- def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+ def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match relationship to other elements."""
found = False
@@ -837,7 +843,7 @@ class CSSMatch(_DocumentNav):
return found
- def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
+ def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
"""Match element's ID."""
found = True
@@ -847,7 +853,7 @@ class CSSMatch(_DocumentNav):
break
return found
- def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool:
+ def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
"""Match element's classes."""
current_classes = self.get_classes(el)
@@ -858,7 +864,7 @@ class CSSMatch(_DocumentNav):
break
return found
- def match_root(self, el: 'bs4.Tag') -> bool:
+ def match_root(self, el: bs4.Tag) -> bool:
"""Match element as root."""
is_root = self.is_root(el)
@@ -884,20 +890,20 @@ class CSSMatch(_DocumentNav):
sibling = self.get_next(sibling, tags=False)
return is_root
- def match_scope(self, el: 'bs4.Tag') -> bool:
+ def match_scope(self, el: bs4.Tag) -> bool:
"""Match element as scope."""
return self.scope is el
- def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool:
+ def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
"""Match tag type for `nth` matches."""
- return(
+ return (
(self.get_tag(child) == self.get_tag(el)) and
(self.get_tag_ns(child) == self.get_tag_ns(el))
)
- def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
+ def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
"""Match `nth` elements."""
matched = True
@@ -998,7 +1004,7 @@ class CSSMatch(_DocumentNav):
break
return matched
- def match_empty(self, el: 'bs4.Tag') -> bool:
+ def match_empty(self, el: bs4.Tag) -> bool:
"""Check if element is empty (if requested)."""
is_empty = True
@@ -1011,7 +1017,7 @@ class CSSMatch(_DocumentNav):
break
return is_empty
- def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool:
+ def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
"""Match selectors."""
match = True
@@ -1020,11 +1026,11 @@ class CSSMatch(_DocumentNav):
match = False
return match
- def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool:
+ def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
"""Match element if it contains text."""
match = True
- content = None # type: Optional[Union[str, Sequence[str]]]
+ content = None # type: Optional[str | Sequence[str]]
for contain_list in contains:
if content is None:
if contain_list.own:
@@ -1048,7 +1054,7 @@ class CSSMatch(_DocumentNav):
match = False
return match
- def match_default(self, el: 'bs4.Tag') -> bool:
+ def match_default(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
@@ -1087,13 +1093,13 @@ class CSSMatch(_DocumentNav):
break
return match
- def match_indeterminate(self, el: 'bs4.Tag') -> bool:
+ def match_indeterminate(self, el: bs4.Tag) -> bool:
"""Match default."""
match = False
name = cast(str, self.get_attribute_by_name(el, 'name'))
- def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']:
+ def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
"""Find this input's form."""
form = None
parent = self.get_parent(el, no_iframe=True)
@@ -1148,7 +1154,7 @@ class CSSMatch(_DocumentNav):
return match
- def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool:
+ def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
"""Match languages."""
match = False
@@ -1183,7 +1189,7 @@ class CSSMatch(_DocumentNav):
break
# Use cached meta language.
- if not found_lang and self.cached_meta_lang:
+ if found_lang is None and self.cached_meta_lang:
for cache in self.cached_meta_lang:
if root is cache[0]:
found_lang = cache[1]
@@ -1217,13 +1223,13 @@ class CSSMatch(_DocumentNav):
found_lang = content
self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
break
- if found_lang:
+ if found_lang is not None:
break
- if not found_lang:
+ if found_lang is None:
self.cached_meta_lang.append((cast(str, root), ''))
# If we determined a language, compare.
- if found_lang:
+ if found_lang is not None:
for patterns in langs:
match = False
for pattern in patterns:
@@ -1234,7 +1240,7 @@ class CSSMatch(_DocumentNav):
return match
- def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool:
+ def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
"""Check directionality."""
# If we have to match both left and right, we can't match either.
@@ -1297,7 +1303,7 @@ class CSSMatch(_DocumentNav):
# Match parents direction
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
- def match_range(self, el: 'bs4.Tag', condition: int) -> bool:
+ def match_range(self, el: bs4.Tag, condition: int) -> bool:
"""
Match range.
@@ -1337,7 +1343,7 @@ class CSSMatch(_DocumentNav):
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
- def match_defined(self, el: 'bs4.Tag') -> bool:
+ def match_defined(self, el: bs4.Tag) -> bool:
"""
Match defined.
@@ -1360,7 +1366,7 @@ class CSSMatch(_DocumentNav):
)
)
- def match_placeholder_shown(self, el: 'bs4.Tag') -> bool:
+ def match_placeholder_shown(self, el: bs4.Tag) -> bool:
"""
Match placeholder shown according to HTML spec.
@@ -1375,7 +1381,7 @@ class CSSMatch(_DocumentNav):
return match
- def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool:
+ def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
"""Check if element matches one of the selectors."""
match = False
@@ -1459,7 +1465,7 @@ class CSSMatch(_DocumentNav):
return match
- def select(self, limit: int = 0) -> Iterator['bs4.Tag']:
+ def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
"""Match all tags under the targeted tag."""
lim = None if limit < 1 else limit
@@ -1472,7 +1478,7 @@ class CSSMatch(_DocumentNav):
if lim < 1:
break
- def closest(self) -> Optional['bs4.Tag']:
+ def closest(self) -> Optional[bs4.Tag]:
"""Match closest ancestor."""
current = self.tag
@@ -1484,12 +1490,12 @@ class CSSMatch(_DocumentNav):
current = self.get_parent(current)
return closest
- def filter(self) -> List['bs4.Tag']: # noqa A001
+ def filter(self) -> list[bs4.Tag]: # noqa A001
"""Filter tag's children."""
return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
- def match(self, el: 'bs4.Tag') -> bool:
+ def match(self, el: bs4.Tag) -> bool:
"""Match."""
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
@@ -1501,7 +1507,7 @@ class SoupSieve(ct.Immutable):
pattern: str
selectors: ct.SelectorList
namespaces: Optional[ct.Namespaces]
- custom: Dict[str, str]
+ custom: dict[str, str]
flags: int
__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
@@ -1524,17 +1530,17 @@ class SoupSieve(ct.Immutable):
flags=flags
)
- def match(self, tag: 'bs4.Tag') -> bool:
+ def match(self, tag: bs4.Tag) -> bool:
"""Match."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
- def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag':
+ def closest(self, tag: bs4.Tag) -> bs4.Tag:
"""Match closest ancestor."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
- def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001
+ def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
"""
Filter.
@@ -1551,18 +1557,18 @@ class SoupSieve(ct.Immutable):
else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
- def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag':
+ def select_one(self, tag: bs4.Tag) -> bs4.Tag:
"""Select a single tag."""
tags = self.select(tag, limit=1)
return tags[0] if tags else None
- def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']:
+ def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
"""Select the specified tags."""
return list(self.iselect(tag, limit))
- def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']:
+ def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
"""Iterate the specified tags."""
for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
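
The `:lang()` hunks above make two behavioral changes: an explicitly empty language range now matches elements whose language attribute is empty, and the meta-language cache lookups now distinguish "not yet determined" (`None`) from an empty-string language. A minimal sketch of the intended effect via soupsieve's public `select()`; the markup and expected results are illustrative assumptions, not taken from the project's tests:

# Illustrative sketch of the ':lang()' behavior targeted by the hunks above.
from bs4 import BeautifulSoup
import soupsieve as sv

html = '<div lang=""><p>empty lang</p></div><div lang="en"><p>english</p></div>'
soup = BeautifulSoup(html, 'html.parser')

# An empty language range should now match the explicitly empty lang=""...
print(sv.select('div:lang("")', soup))   # expected: the lang="" div only
# ...while a concrete range keeps matching as before.
print(sv.select('div:lang(en)', soup))   # expected: the lang="en" div only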
diff --git a/lib/soupsieve/css_parser.py b/lib/soupsieve/css_parser.py
index d77084d4..4b8db186 100644
--- a/lib/soupsieve/css_parser.py
+++ b/lib/soupsieve/css_parser.py
@@ -1,4 +1,5 @@
"""CSS selector parser."""
+from __future__ import annotations
import re
from functools import lru_cache
from . import util
@@ -6,7 +7,7 @@ from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
-from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast
+from typing import Optional, Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD
@@ -232,7 +233,7 @@ def _purge_cache() -> None:
_cached_css_compile.cache_clear()
-def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]:
+def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
"""Process custom."""
custom_selectors = {}
@@ -325,7 +326,7 @@ class SelectorPattern:
class SpecialPseudoPattern(SelectorPattern):
"""Selector pattern."""
- def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None:
+ def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
"""Initialize."""
self.patterns = {}
@@ -372,19 +373,19 @@ class _Selector:
"""Initialize."""
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
- self.ids = kwargs.get('ids', []) # type: List[str]
- self.classes = kwargs.get('classes', []) # type: List[str]
- self.attributes = kwargs.get('attributes', []) # type: List[ct.SelectorAttribute]
- self.nth = kwargs.get('nth', []) # type: List[ct.SelectorNth]
- self.selectors = kwargs.get('selectors', []) # type: List[ct.SelectorList]
- self.relations = kwargs.get('relations', []) # type: List[_Selector]
+ self.ids = kwargs.get('ids', []) # type: list[str]
+ self.classes = kwargs.get('classes', []) # type: list[str]
+ self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
+ self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
+ self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
+ self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
- self.contains = kwargs.get('contains', []) # type: List[ct.SelectorContains]
- self.lang = kwargs.get('lang', []) # type: List[ct.SelectorLang]
+ self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
+ self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) # type: int
self.no_match = kwargs.get('no_match', False) # type: bool
- def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList:
+ def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
"""Freeze relation."""
if relations:
@@ -394,7 +395,7 @@ class _Selector:
else:
return ct.SelectorList()
- def freeze(self) -> Union[ct.Selector, ct.SelectorNull]:
+ def freeze(self) -> ct.Selector | ct.SelectorNull:
"""Freeze self."""
if self.no_match:
@@ -461,7 +462,7 @@ class CSSParser:
def __init__(
self,
selector: str,
- custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None,
+ custom: Optional[dict[str, str | ct.SelectorList]] = None,
flags: int = 0
) -> None:
"""Initialize."""
@@ -583,9 +584,9 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
- iselector: Iterator[Tuple[str, Match[str]]],
+ iselector: Iterator[tuple[str, Match[str]]],
is_html: bool
- ) -> Tuple[bool, bool]:
+ ) -> tuple[bool, bool]:
"""Parse pseudo class."""
complex_pseudo = False
@@ -678,7 +679,7 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
- iselector: Iterator[Tuple[str, Match[str]]]
+ iselector: Iterator[tuple[str, Match[str]]]
) -> bool:
"""Parse `nth` pseudo."""
@@ -743,7 +744,7 @@ class CSSParser:
sel: _Selector,
name: str,
has_selector: bool,
- iselector: Iterator[Tuple[str, Match[str]]],
+ iselector: Iterator[tuple[str, Match[str]]],
index: int
) -> bool:
"""Parse pseudo with opening bracket."""
@@ -752,7 +753,7 @@ class CSSParser:
if name == ':not':
flags |= FLG_NOT
elif name == ':has':
- flags |= FLG_RELATIVE | FLG_FORGIVE
+ flags |= FLG_RELATIVE
elif name in (':where', ':is'):
flags |= FLG_FORGIVE
@@ -766,21 +767,16 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
- selectors: List[_Selector],
+ selectors: list[_Selector],
rel_type: str,
index: int
- ) -> Tuple[bool, _Selector, str]:
+ ) -> tuple[bool, _Selector, str]:
"""Parse combinator tokens."""
combinator = m.group('relation').strip()
if not combinator:
combinator = WS_COMBINATOR
if combinator == COMMA_COMBINATOR:
- if not has_selector:
- # If we've not captured any selector parts, the comma is either at the beginning of the pattern
- # or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
- sel.no_match = True
-
sel.rel_type = rel_type
selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR
@@ -814,12 +810,12 @@ class CSSParser:
sel: _Selector,
m: Match[str],
has_selector: bool,
- selectors: List[_Selector],
- relations: List[_Selector],
+ selectors: list[_Selector],
+ relations: list[_Selector],
is_pseudo: bool,
is_forgive: bool,
index: int
- ) -> Tuple[bool, _Selector]:
+ ) -> tuple[bool, _Selector]:
"""Parse combinator tokens."""
combinator = m.group('relation').strip()
@@ -924,7 +920,7 @@ class CSSParser:
def parse_selectors(
self,
- iselector: Iterator[Tuple[str, Match[str]]],
+ iselector: Iterator[tuple[str, Match[str]]],
index: int = 0,
flags: int = 0
) -> ct.SelectorList:
@@ -935,7 +931,7 @@ class CSSParser:
selectors = []
has_selector = False
closed = False
- relations = [] # type: List[_Selector]
+ relations = [] # type: list[_Selector]
rel_type = ":" + WS_COMBINATOR
# Setup various flags
@@ -1069,22 +1065,12 @@ class CSSParser:
selectors.append(sel)
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
- elif is_forgive:
- if is_relative:
- # Handle relative selectors pseudo-classes with empty slots like `:has()`
- if selectors and selectors[-1].rel_type is None and rel_type == ': ':
- sel.rel_type = rel_type
- sel.no_match = True
- selectors[-1].relations.append(sel)
- has_selector = True
- else:
- # Handle normal pseudo-classes with empty slots
- if not selectors or not relations:
- # Others like `:is()` etc.
- sel.no_match = True
- del relations[:]
- selectors.append(sel)
- has_selector = True
+ elif is_forgive and (not selectors or not relations):
+ # Handle normal pseudo-classes with empty slots like `:is()` etc.
+ sel.no_match = True
+ del relations[:]
+ selectors.append(sel)
+ has_selector = True
if not has_selector:
# We will always need to finish a selector when `:has()` is used as it leads with combining.
@@ -1112,7 +1098,7 @@ class CSSParser:
# Return selector list
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
- def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]:
+ def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
"""Iterate selector tokens."""
# Ignore whitespace and comments at start and end of pattern
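
The css_parser.py hunks above drop `FLG_RELATIVE | FLG_FORGIVE` down to `FLG_RELATIVE` for `:has` and remove the special-casing of empty slots and dangling commas inside it, so `:has()` is no longer treated as a forgiving selector list; only `:is()` and `:where()` keep that behavior. A rough sketch of what that means for callers; the exact error raised is an assumption based on the parser's existing `SelectorSyntaxError`:

# Illustrative sketch of the ':has()' forgiveness removal above.
from bs4 import BeautifulSoup
import soupsieve as sv
from soupsieve.util import SelectorSyntaxError

soup = BeautifulSoup('<div><p>text</p></div>', 'html.parser')

# ':is()' and ':where()' stay forgiving: an empty slot simply matches nothing.
print(sv.select('div:is()', soup))        # expected: []

# ':has()' should now reject an empty selector list at parse time
# instead of silently matching nothing.
try:
    sv.select('div:has()', soup)
except SelectorSyntaxError as e:
    print('rejected:', e)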
diff --git a/lib/soupsieve/css_types.py b/lib/soupsieve/css_types.py
index e5a6e49c..a97d5f4b 100644
--- a/lib/soupsieve/css_types.py
+++ b/lib/soupsieve/css_types.py
@@ -1,7 +1,8 @@
"""CSS selector structure items."""
+from __future__ import annotations
import copyreg
from .pretty import pretty
-from typing import Any, Type, Tuple, Union, Dict, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
+from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
__all__ = (
'Selector',
@@ -33,7 +34,7 @@ SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable:
"""Immutable."""
- __slots__: Tuple[str, ...] = ('_hash',)
+ __slots__: tuple[str, ...] = ('_hash',)
_hash: int
@@ -48,7 +49,7 @@ class Immutable:
super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
@classmethod
- def __base__(cls) -> "Type[Immutable]":
+ def __base__(cls) -> "type[Immutable]":
"""Get base class."""
return cls
@@ -99,7 +100,7 @@ class ImmutableDict(Mapping[Any, Any]):
def __init__(
self,
- arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]
+ arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
) -> None:
"""Initialize."""
@@ -107,7 +108,7 @@ class ImmutableDict(Mapping[Any, Any]):
self._d = dict(arg)
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
- def _validate(self, arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]) -> None:
+ def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
@@ -147,12 +148,12 @@ class ImmutableDict(Mapping[Any, Any]):
class Namespaces(ImmutableDict):
"""Namespaces."""
- def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+ def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize."""
super().__init__(arg)
- def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+ def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
@@ -165,12 +166,12 @@ class Namespaces(ImmutableDict):
class CustomSelectors(ImmutableDict):
"""Custom selectors."""
- def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+ def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize."""
super().__init__(arg)
- def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+ def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
@@ -188,30 +189,30 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
)
- tag: Optional['SelectorTag']
- ids: Tuple[str, ...]
- classes: Tuple[str, ...]
- attributes: Tuple['SelectorAttribute', ...]
- nth: Tuple['SelectorNth', ...]
- selectors: Tuple['SelectorList', ...]
- relation: 'SelectorList'
+ tag: Optional[SelectorTag]
+ ids: tuple[str, ...]
+ classes: tuple[str, ...]
+ attributes: tuple[SelectorAttribute, ...]
+ nth: tuple[SelectorNth, ...]
+ selectors: tuple[SelectorList, ...]
+ relation: SelectorList
rel_type: Optional[str]
- contains: Tuple['SelectorContains', ...]
- lang: Tuple['SelectorLang', ...]
+ contains: tuple[SelectorContains, ...]
+ lang: tuple[SelectorLang, ...]
flags: int
def __init__(
self,
- tag: Optional['SelectorTag'],
- ids: Tuple[str, ...],
- classes: Tuple[str, ...],
- attributes: Tuple['SelectorAttribute', ...],
- nth: Tuple['SelectorNth', ...],
- selectors: Tuple['SelectorList', ...],
- relation: 'SelectorList',
+ tag: Optional[SelectorTag],
+ ids: tuple[str, ...],
+ classes: tuple[str, ...],
+ attributes: tuple[SelectorAttribute, ...],
+ nth: tuple[SelectorNth, ...],
+ selectors: tuple[SelectorList, ...],
+ relation: SelectorList,
rel_type: Optional[str],
- contains: Tuple['SelectorContains', ...],
- lang: Tuple['SelectorLang', ...],
+ contains: tuple[SelectorContains, ...],
+ lang: tuple[SelectorLang, ...],
flags: int
):
"""Initialize."""
@@ -286,7 +287,7 @@ class SelectorContains(Immutable):
__slots__ = ("text", "own", "_hash")
- text: Tuple[str, ...]
+ text: tuple[str, ...]
own: bool
def __init__(self, text: Iterable[str], own: bool) -> None:
@@ -305,9 +306,9 @@ class SelectorNth(Immutable):
b: int
of_type: bool
last: bool
- selectors: 'SelectorList'
+ selectors: SelectorList
- def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: 'SelectorList') -> None:
+ def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None:
"""Initialize."""
super().__init__(
@@ -325,7 +326,7 @@ class SelectorLang(Immutable):
__slots__ = ("languages", "_hash",)
- languages: Tuple[str, ...]
+ languages: tuple[str, ...]
def __init__(self, languages: Iterable[str]):
"""Initialize."""
@@ -353,13 +354,13 @@ class SelectorList(Immutable):
__slots__ = ("selectors", "is_not", "is_html", "_hash")
- selectors: Tuple[Union['Selector', 'SelectorNull'], ...]
+ selectors: tuple[Selector | SelectorNull, ...]
is_not: bool
is_html: bool
def __init__(
self,
- selectors: Optional[Iterable[Union['Selector', 'SelectorNull']]] = None,
+ selectors: Optional[Iterable[Selector | SelectorNull]] = None,
is_not: bool = False,
is_html: bool = False
) -> None:
@@ -371,7 +372,7 @@ class SelectorList(Immutable):
is_html=is_html
)
- def __iter__(self) -> Iterator[Union['Selector', 'SelectorNull']]:
+ def __iter__(self) -> Iterator[Selector | SelectorNull]:
"""Iterator."""
return iter(self.selectors)
@@ -381,7 +382,7 @@ class SelectorList(Immutable):
return len(self.selectors)
- def __getitem__(self, index: int) -> Union['Selector', 'SelectorNull']:
+ def __getitem__(self, index: int) -> Selector | SelectorNull:
"""Get item."""
return self.selectors[index]
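
The css_types.py changes are purely typing modernization: with `from __future__ import annotations`, annotations are no longer evaluated at runtime, so PEP 604 unions (`X | Y`) and PEP 585 builtin generics (`tuple[str, ...]`, `dict[str, str]`) can replace `typing.Union`, `Tuple`, and `Dict` while still supporting Python 3.7+. A tiny standalone illustration of the pattern; the names here are hypothetical, not from soupsieve:

# Hypothetical example of the annotation style adopted above; runs on Python 3.7+
# because postponed evaluation keeps the new syntax out of runtime evaluation.
from __future__ import annotations
from typing import Optional

def lookup(ns: dict[str, str] | None, prefix: Optional[str]) -> tuple[str, ...]:
    """Return (prefix, uri) if the prefix is known, else an empty tuple."""
    if ns is None or prefix is None or prefix not in ns:
        return ()
    return (prefix, ns[prefix])

print(lookup({'xhtml': 'http://www.w3.org/1999/xhtml'}, 'xhtml'))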
diff --git a/lib/soupsieve/pretty.py b/lib/soupsieve/pretty.py
index 57d16c97..f848d5e2 100644
--- a/lib/soupsieve/pretty.py
+++ b/lib/soupsieve/pretty.py
@@ -65,6 +65,7 @@ SelectorList(
is_html=False)
```
"""
+from __future__ import annotations
import re
from typing import Any
diff --git a/lib/soupsieve/util.py b/lib/soupsieve/util.py
index 2b1ed24b..cf4dc5cc 100644
--- a/lib/soupsieve/util.py
+++ b/lib/soupsieve/util.py
@@ -1,8 +1,9 @@
"""Utility."""
+from __future__ import annotations
from functools import wraps, lru_cache
import warnings
import re
-from typing import Callable, Any, Optional, Tuple, List
+from typing import Callable, Any, Optional
DEBUG = 0x00001
@@ -75,13 +76,13 @@ def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no co
)
-def get_pattern_context(pattern: str, index: int) -> Tuple[str, int, int]:
+def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
"""Get the pattern context."""
last = 0
current_line = 1
col = 1
- text = [] # type: List[str]
+ text = [] # type: list[str]
line = 1
offset = None # type: Optional[int]
diff --git a/requirements.txt b/requirements.txt
index 050b9add..b5e6dd57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ arrow==1.2.3
backports.csv==1.0.7
backports.functools-lru-cache==1.6.4
backports.zoneinfo==0.2.1;python_version<"3.9"
-beautifulsoup4==4.11.1
+beautifulsoup4==4.11.2
bleach==6.0.0
certifi==2022.12.7
cheroot==9.0.0