Bump beautifulsoup4 from 4.11.1 to 4.11.2 (#1987)

* Bump beautifulsoup4 from 4.11.1 to 4.11.2

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.1 to 4.11.2.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.11.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in:
dependabot[bot] 2023-03-02 20:56:24 -08:00 committed by GitHub
parent ded93ef2f5
commit 8e42757b2d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 449 additions and 537 deletions

View file

@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
provides methods and Pythonic idioms that make it easy to navigate, provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree. search, and modify the parse tree.
Beautiful Soup works with Python 3.5 and up. It works better if lxml Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed. and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the For more than you ever wanted to know about Beautiful Soup, see the
@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.11.1" __version__ = "4.11.2"
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license. # Use of this source code is governed by the MIT license.
__license__ = "MIT" __license__ = "MIT"
@ -211,7 +211,7 @@ class BeautifulSoup(Tag):
warnings.warn( warnings.warn(
'The "%s" argument to the BeautifulSoup constructor ' 'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name), 'has been renamed to "%s."' % (old_name, new_name),
DeprecationWarning DeprecationWarning, stacklevel=3
) )
return kwargs.pop(old_name) return kwargs.pop(old_name)
return None return None
@ -405,7 +405,8 @@ class BeautifulSoup(Tag):
'The input looks more like a URL than markup. You may want to use' 'The input looks more like a URL than markup. You may want to use'
' an HTTP client like requests to get the document behind' ' an HTTP client like requests to get the document behind'
' the URL, and feed that document to Beautiful Soup.', ' the URL, and feed that document to Beautiful Soup.',
MarkupResemblesLocatorWarning MarkupResemblesLocatorWarning,
stacklevel=3
) )
return True return True
return False return False
@ -436,7 +437,7 @@ class BeautifulSoup(Tag):
'The input looks more like a filename than markup. You may' 'The input looks more like a filename than markup. You may'
' want to open this file and pass the filehandle into' ' want to open this file and pass the filehandle into'
' Beautiful Soup.', ' Beautiful Soup.',
MarkupResemblesLocatorWarning MarkupResemblesLocatorWarning, stacklevel=3
) )
return True return True
return False return False
@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup):
warnings.warn( warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using ' 'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.', 'it, pass features="xml" into the BeautifulSoup constructor.',
DeprecationWarning DeprecationWarning, stacklevel=2
) )
super(BeautifulStoneSoup, self).__init__(*args, **kwargs) super(BeautifulStoneSoup, self).__init__(*args, **kwargs)

View file

@ -122,7 +122,7 @@ class TreeBuilder(object):
# A value for these tag/attribute combinations is a space- or # A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA. # comma-separated list of CDATA, rather than a single CDATA.
DEFAULT_CDATA_LIST_ATTRIBUTES = {} DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)
# Whitespace should be preserved inside these tags. # Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set() DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

View file

@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# ATM because the html5lib TreeBuilder doesn't use # ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit. # UnicodeDammit.
if exclude_encodings: if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") warnings.warn(
"You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
stacklevel=3
)
# html5lib only parses HTML, so if it's given XML that's worth # html5lib only parses HTML, so if it's given XML that's worth
# noting. # noting.
@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# These methods are defined by Beautiful Soup. # These methods are defined by Beautiful Soup.
def feed(self, markup): def feed(self, markup):
if self.soup.parse_only is not None: if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") warnings.warn(
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
stacklevel=4
)
parser = html5lib.HTMLParser(tree=self.create_treebuilder) parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser self.underlying_builder.parser = parser
extra_kwargs = dict() extra_kwargs = dict()
@ -249,9 +255,9 @@ class AttrList(object):
# If this attribute is a multi-valued attribute for this element, # If this attribute is a multi-valued attribute for this element,
# turn its value into a list. # turn its value into a list.
list_attr = self.element.cdata_list_attributes or {} list_attr = self.element.cdata_list_attributes or {}
if (name in list_attr.get('*') if (name in list_attr.get('*', [])
or (self.element.name in list_attr or (self.element.name in list_attr
and name in list_attr[self.element.name])): and name in list_attr.get(self.element.name, []))):
# A node that is being cloned may have already undergone # A node that is being cloned may have already undergone
# this procedure. # this procedure.
if not isinstance(value, list): if not isinstance(value, list):

View file

@ -10,30 +10,9 @@ __all__ = [
from html.parser import HTMLParser from html.parser import HTMLParser
try:
from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys import sys
import warnings import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import ( from bs4.element import (
CData, CData,
Comment, Comment,
@ -91,19 +70,6 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
self._initialize_xml_detector() self._initialize_xml_detector()
def error(self, msg):
"""In Python 3, HTMLParser subclasses must implement error(), although
this requirement doesn't appear to be documented.
In Python 2, HTMLParser implements error() by raising an exception,
which we don't want to do.
In any event, this method is called only on very strange
markup and our best strategy is to pretend it didn't happen
and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs): def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag. """Handle an incoming empty-element tag.
@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
:param name: Character number, possibly in hexadecimal. :param name: Character number, possibly in hexadecimal.
""" """
# XXX workaround for a bug in HTMLParser. Remove this once # TODO: This was originally a workaround for a bug in
# it's fixed in all supported versions. # HTMLParser. (http://bugs.python.org/issue13633) The bug has
# http://bugs.python.org/issue13633 # been fixed, but removing this code still makes some
# Beautiful Soup tests fail. This needs investigation.
if name.startswith('x'): if name.startswith('x'):
real_name = int(name.lstrip('x'), 16) real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'): elif name.startswith('X'):
@ -333,9 +300,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser_args = parser_args or [] parser_args = parser_args or []
parser_kwargs = parser_kwargs or {} parser_kwargs = parser_kwargs or {}
parser_kwargs.update(extra_parser_kwargs) parser_kwargs.update(extra_parser_kwargs)
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs) self.parser_args = (parser_args, parser_kwargs)
@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
args, kwargs = self.parser_args args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs) parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup parser.soup = self.soup
try:
parser.feed(markup) parser.feed(markup)
parser.close() parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
parser.already_closed_empty_element = [] parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

View file

@ -496,13 +496,16 @@ class PageElement(object):
def extend(self, tags): def extend(self, tags):
"""Appends the given PageElements to this one's contents. """Appends the given PageElements to this one's contents.
:param tags: A list of PageElements. :param tags: A list of PageElements. If a single Tag is
provided instead, this PageElement's contents will be extended
with that Tag's contents.
""" """
if isinstance(tags, Tag): if isinstance(tags, Tag):
# Calling self.append() on another tag's contents will change tags = tags.contents
# the list we're iterating over. Make a list that won't if isinstance(tags, list):
# change. # Moving items around the tree may change their position in
tags = list(tags.contents) # the original list. Make a list that won't change.
tags = list(tags)
for tag in tags: for tag in tags:
self.append(tag) self.append(tag)
@ -586,8 +589,9 @@ class PageElement(object):
:kwargs: A dictionary of filters on attribute values. :kwargs: A dictionary of filters on attribute values.
:return: A ResultSet containing PageElements. :return: A ResultSet containing PageElements.
""" """
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, self.next_elements, return self._find_all(name, attrs, string, limit, self.next_elements,
**kwargs) _stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3 findAllNext = find_all_next # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
@ -624,8 +628,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, _stacklevel = kwargs.pop('_stacklevel', 2)
self.next_siblings, **kwargs) return self._find_all(
name, attrs, string, limit,
self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findNextSiblings = find_next_siblings # BS3 findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2 fetchNextSiblings = find_next_siblings # BS2
@ -663,8 +670,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, self.previous_elements, _stacklevel = kwargs.pop('_stacklevel', 2)
**kwargs) return self._find_all(
name, attrs, string, limit, self.previous_elements,
_stacklevel=_stacklevel+1, **kwargs
)
findAllPrevious = find_all_previous # BS3 findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2 fetchPrevious = find_all_previous # BS2
@ -702,8 +712,11 @@ class PageElement(object):
:return: A ResultSet of PageElements. :return: A ResultSet of PageElements.
:rtype: bs4.element.ResultSet :rtype: bs4.element.ResultSet
""" """
return self._find_all(name, attrs, string, limit, _stacklevel = kwargs.pop('_stacklevel', 2)
self.previous_siblings, **kwargs) return self._find_all(
name, attrs, string, limit,
self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
)
findPreviousSiblings = find_previous_siblings # BS3 findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2 fetchPreviousSiblings = find_previous_siblings # BS2
@ -724,7 +737,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different # NOTE: We can't use _find_one because findParents takes a different
# set of arguments. # set of arguments.
r = None r = None
l = self.find_parents(name, attrs, 1, **kwargs) l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -744,8 +757,9 @@ class PageElement(object):
:return: A PageElement. :return: A PageElement.
:rtype: bs4.element.Tag | bs4.element.NavigableString :rtype: bs4.element.Tag | bs4.element.NavigableString
""" """
_stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents, return self._find_all(name, attrs, None, limit, self.parents,
**kwargs) _stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3 findParents = find_parents # BS3
fetchParents = find_parents # BS2 fetchParents = find_parents # BS2
@ -771,19 +785,20 @@ class PageElement(object):
def _find_one(self, method, name, attrs, string, **kwargs): def _find_one(self, method, name, attrs, string, **kwargs):
r = None r = None
l = method(name, attrs, string, 1, **kwargs) l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
def _find_all(self, name, attrs, string, limit, generator, **kwargs): def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match." "Iterates over a generator looking for things that match."
_stacklevel = kwargs.pop('_stacklevel', 3)
if string is None and 'text' in kwargs: if string is None and 'text' in kwargs:
string = kwargs.pop('text') string = kwargs.pop('text')
warnings.warn( warnings.warn(
"The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
DeprecationWarning DeprecationWarning, stacklevel=_stacklevel
) )
if isinstance(name, SoupStrainer): if isinstance(name, SoupStrainer):
@ -1306,7 +1321,8 @@ class Tag(PageElement):
sourceline=self.sourceline, sourcepos=self.sourcepos, sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element, can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes, cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags preserve_whitespace_tags=self.preserve_whitespace_tags,
interesting_string_types=self.interesting_string_types
) )
for attr in ('can_be_empty_element', 'hidden'): for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr)) setattr(clone, attr, getattr(self, attr))
@ -1558,7 +1574,7 @@ class Tag(PageElement):
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
name=tag_name name=tag_name
), ),
DeprecationWarning DeprecationWarning, stacklevel=2
) )
return self.find(tag_name) return self.find(tag_name)
# We special case contents to avoid recursion. # We special case contents to avoid recursion.
@ -1862,7 +1878,8 @@ class Tag(PageElement):
:rtype: bs4.element.Tag | bs4.element.NavigableString :rtype: bs4.element.Tag | bs4.element.NavigableString
""" """
r = None r = None
l = self.find_all(name, attrs, recursive, string, 1, **kwargs) l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
**kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -1889,7 +1906,9 @@ class Tag(PageElement):
generator = self.descendants generator = self.descendants
if not recursive: if not recursive:
generator = self.children generator = self.children
return self._find_all(name, attrs, string, limit, generator, **kwargs) _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3 findAll = find_all # BS3
findChildren = find_all # BS2 findChildren = find_all # BS2
@ -1993,7 +2012,7 @@ class Tag(PageElement):
""" """
warnings.warn( warnings.warn(
'has_key is deprecated. Use has_attr(key) instead.', 'has_key is deprecated. Use has_attr(key) instead.',
DeprecationWarning DeprecationWarning, stacklevel=2
) )
return self.has_attr(key) return self.has_attr(key)
@ -2024,7 +2043,7 @@ class SoupStrainer(object):
string = kwargs.pop('text') string = kwargs.pop('text')
warnings.warn( warnings.warn(
"The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
DeprecationWarning DeprecationWarning, stacklevel=2
) )
self.name = self._normalize_search_value(name) self.name = self._normalize_search_value(name)

View file

@ -149,14 +149,14 @@ class HTMLFormatter(Formatter):
"""A generic Formatter for HTML.""" """A generic Formatter for HTML."""
REGISTRY = {} REGISTRY = {}
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter): class XMLFormatter(Formatter):
"""A generic Formatter for XML.""" """A generic Formatter for XML."""
REGISTRY = {} REGISTRY = {}
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters. # Set up aliases for the default formatters.

View file

@ -29,6 +29,29 @@ from bs4.builder import (
) )
default_builder = HTMLParserTreeBuilder default_builder = HTMLParserTreeBuilder
# Some tests depend on specific third-party libraries. We use
# @pytest.mark.skipif on the following conditionals to skip them
# if the libraries are not installed.
try:
from soupsieve import SelectorSyntaxError
SOUP_SIEVE_PRESENT = True
except ImportError:
SOUP_SIEVE_PRESENT = False
try:
import html5lib
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
LXML_PRESENT = False
LXML_VERSION = (0,)
BAD_DOCUMENT = """A bare string BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"> <!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd"> <!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"multi_valued_attributes", "multi_valued_attributes",
[None, dict(b=['class']), {'*': ['notclass']}] [None, {}, dict(b=['class']), {'*': ['notclass']}]
) )
def test_attribute_not_multi_valued(self, multi_valued_attributes): def test_attribute_not_multi_valued(self, multi_valued_attributes):
markup = '<a class="a b c">' markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>'
soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes) soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
assert soup.a['class'] == 'a b c' assert soup.a['class'] == 'a b c'
@ -820,26 +843,27 @@ Hello, world!
soup = self.soup(text) soup = self.soup(text)
assert soup.p.encode("utf-8") == expected assert soup.p.encode("utf-8") == expected
def test_real_iso_latin_document(self): def test_real_iso_8859_document(self):
# Smoke test of interrelated functionality, using an # Smoke test of interrelated functionality, using an
# easy-to-understand document. # easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. # Here it is in Unicode. Note that it claims to be in ISO-8859-1.
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' unicode_html = '<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use # That's because we're going to encode it into ISO-8859-1,
# that to test. # and use that to test.
iso_latin_html = unicode_html.encode("iso-8859-1") iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML. # Parse the ISO-8859-1 HTML.
soup = self.soup(iso_latin_html) soup = self.soup(iso_latin_html)
# Encode it to UTF-8. # Encode it to UTF-8.
result = soup.encode("utf-8") result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would # What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say # look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1. # UTF-8 instead of ISO-8859-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8") expected = unicode_html.replace("ISO-8859-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode. # And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8") expected = expected.encode("utf-8")
@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
assert isinstance(soup.contents[0], Comment) assert isinstance(soup.contents[0], Comment)
assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?' assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?'
assert "html" == soup.contents[0].next_element.name assert "html" == soup.contents[0].next_element.name
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator

View file

@ -10,22 +10,23 @@ from bs4.builder import (
TreeBuilderRegistry, TreeBuilderRegistry,
) )
try: from . import (
from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT,
HTML5LIB_PRESENT = True LXML_PRESENT,
except ImportError: )
HTML5LIB_PRESENT = False
try: if HTML5LIB_PRESENT:
from bs4.builder import HTML5TreeBuilder
if LXML_PRESENT:
from bs4.builder import ( from bs4.builder import (
LXMLTreeBuilderForXML, LXMLTreeBuilderForXML,
LXMLTreeBuilder, LXMLTreeBuilder,
) )
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
# TODO: Split out the lxml and html5lib tests into their own classes
# and gate with pytest.mark.skipif.
class TestBuiltInRegistry(object): class TestBuiltInRegistry(object):
"""Test the built-in registry with the default builders registered.""" """Test the built-in registry with the default builders registered."""

View file

@ -17,25 +17,23 @@ class TestUnicodeDammit(object):
dammit = UnicodeDammit(markup) dammit = UnicodeDammit(markup)
assert dammit.unicode_markup == markup assert dammit.unicode_markup == markup
def test_smart_quotes_to_unicode(self): @pytest.mark.parametrize(
"smart_quotes_to,expect_converted",
[(None, "\u2018\u2019\u201c\u201d"),
("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
("ascii", "''" + '""'),
]
)
def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
"""Verify the functionality of the smart_quotes_to argument
to the UnicodeDammit constructor."""
markup = b"<foo>\x91\x92\x93\x94</foo>" markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup) converted = UnicodeDammit(
assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>" markup, known_definite_encodings=["windows-1252"],
smart_quotes_to=smart_quotes_to
def test_smart_quotes_to_xml_entities(self): ).unicode_markup
markup = b"<foo>\x91\x92\x93\x94</foo>" assert converted == "<foo>{}</foo>".format(expect_converted)
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
assert dammit.unicode_markup == "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
assert dammit.unicode_markup == "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
assert dammit.unicode_markup == """<foo>''""</foo>"""
def test_detect_utf8(self): def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
@ -275,23 +273,24 @@ class TestEntitySubstitution(object):
def setup_method(self): def setup_method(self):
self.sub = EntitySubstitution self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"
def test_smart_quote_substitution(self): @pytest.mark.parametrize(
"original,substituted",
[
# Basic case. Unicode characters corresponding to named
# HTML entities are substituted; others are not.
("foo\u2200\N{SNOWMAN}\u00f5bar",
"foo&forall;\N{SNOWMAN}&otilde;bar"),
# MS smart quotes are a common source of frustration, so we # MS smart quotes are a common source of frustration, so we
# give them a special test. # give them a special test.
quotes = b"\x91\x92foo\x93\x94" ('foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
dammit = UnicodeDammit(quotes) ]
assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;" )
def test_substitute_html(self, original, substituted):
assert self.sub.substitute_html(original) == substituted
def test_html5_entity(self): def test_html5_entity(self):
# Some HTML5 entities correspond to single- or multi-character
# Unicode sequences.
for entity, u in ( for entity, u in (
# A few spot checks of our ability to recognize # A few spot checks of our ability to recognize
# special character sequences and convert them # special character sequences and convert them

View file

@ -1,27 +1,26 @@
"""Tests to ensure that the html5lib tree builder generates good trees.""" """Tests to ensure that the html5lib tree builder generates good trees."""
import pytest
import warnings import warnings
try: from bs4 import BeautifulSoup
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer from bs4.element import SoupStrainer
from . import ( from . import (
HTML5LIB_PRESENT,
HTML5TreeBuilderSmokeTest, HTML5TreeBuilderSmokeTest,
SoupTest, SoupTest,
skipIf,
) )
@skipIf( @pytest.mark.skipif(
not HTML5LIB_PRESENT, not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.") reason="html5lib seems not to be present, not testing its tree builder."
)
class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest): class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``.""" """See ``HTML5TreeBuilderSmokeTest``."""
@property @property
def default_builder(self): def default_builder(self):
from bs4.builder import HTML5TreeBuilder
return HTML5TreeBuilder return HTML5TreeBuilder
def test_soupstrainer(self): def test_soupstrainer(self):
@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
strainer = SoupStrainer("b") strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>" markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer) soup = BeautifulSoup(markup, "html5lib", parse_only=strainer)
assert soup.decode() == self.document_for(markup) assert soup.decode() == self.document_for(markup)
assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message) [warning] = w
assert warning.filename == __file__
assert "the html5lib tree builder doesn't support parse_only" in str(warning.message)
def test_correctly_nested_tables(self): def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't.""" """html5lib inserts <tbody> tags where other parsers don't."""

View file

@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with_element = div.encode(formatter="html") with_element = div.encode(formatter="html")
expect = b"<div>%s</div>" % output_element expect = b"<div>%s</div>" % output_element
assert with_element == expect assert with_element == expect
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way
that doesn't cause a crash.
"""
parser = BeautifulSoupHTMLParser()
with warnings.catch_warnings(record=True) as warns:
parser.error("don't crash")
[warning] = warns
assert "don't crash" == str(warning.message)

View file

@ -1,16 +1,10 @@
"""Tests to ensure that the lxml tree builder generates good trees.""" """Tests to ensure that the lxml tree builder generates good trees."""
import pickle import pickle
import pytest
import re import re
import warnings import warnings
from . import LXML_PRESENT, LXML_VERSION
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
if LXML_PRESENT: if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer
from . import ( from . import (
HTMLTreeBuilderSmokeTest, HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest,
SOUP_SIEVE_PRESENT,
SoupTest, SoupTest,
skipIf,
) )
@skipIf( @pytest.mark.skipif(
not LXML_PRESENT, not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.") reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``.""" """See ``HTMLTreeBuilderSmokeTest``."""
@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed. # test if an old version of lxml is installed.
@skipIf( @pytest.mark.skipif(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0), not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.") reason="Skipping doctype test for old version of lxml to avoid segfault."
)
def test_empty_doctype(self): def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>") soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0] doctype = soup.contents[0]
@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />") soup = BeautifulStoneSoup("<b />")
assert "<b/>" == str(soup.b) assert "<b/>" == str(soup.b)
assert "BeautifulStoneSoup class is deprecated" in str(w[0].message) [warning] = w
assert warning.filename == __file__
assert "BeautifulStoneSoup class is deprecated" in str(warning.message)
def test_tracking_line_numbers(self): def test_tracking_line_numbers(self):
# The lxml TreeBuilder cannot keep track of line numbers from # The lxml TreeBuilder cannot keep track of line numbers from
@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
assert "sourceline" == soup.p.sourceline.name assert "sourceline" == soup.p.sourceline.name
assert "sourcepos" == soup.p.sourcepos.name assert "sourcepos" == soup.p.sourcepos.name
@skipIf( @pytest.mark.skipif(
not LXML_PRESENT, not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.") reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``.""" """See ``HTMLTreeBuilderSmokeTest``."""
@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
} }
@pytest.mark.skipif(
not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
)
def test_namespace_interaction_with_select_and_find(self): def test_namespace_interaction_with_select_and_find(self):
# Demonstrate how namespaces interact with select* and # Demonstrate how namespaces interact with select* and
# find* methods. # find* methods.

View file

@ -3,15 +3,18 @@ import copy
import pickle import pickle
import pytest import pytest
from soupsieve import SelectorSyntaxError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import ( from bs4.element import (
Comment, Comment,
SoupStrainer, SoupStrainer,
) )
from . import SoupTest from . import (
SoupTest,
SOUP_SIEVE_PRESENT,
)
if SOUP_SIEVE_PRESENT:
from soupsieve import SelectorSyntaxError
class TestEncoding(SoupTest): class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings.""" """Test the ability to encode objects into strings."""
@ -213,6 +216,7 @@ class TestFormatters(SoupTest):
assert soup.contents[0].name == 'pre' assert soup.contents[0].name == 'pre'
@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
class TestCSSSelectors(SoupTest): class TestCSSSelectors(SoupTest):
"""Test basic CSS selector functionality. """Test basic CSS selector functionality.
@ -694,6 +698,7 @@ class TestPersistence(SoupTest):
assert tag.can_be_empty_element == copied.can_be_empty_element assert tag.can_be_empty_element == copied.can_be_empty_element
assert tag.cdata_list_attributes == copied.cdata_list_attributes assert tag.cdata_list_attributes == copied.cdata_list_attributes
assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
assert tag.interesting_string_types == copied.interesting_string_types
def test_unicode_pickle(self): def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled. # A tree containing Unicode characters can be pickled.

View file

@ -30,19 +30,11 @@ from bs4.element import (
from . import ( from . import (
default_builder, default_builder,
LXML_PRESENT,
SoupTest, SoupTest,
skipIf,
) )
import warnings import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest): class TestConstructor(SoupTest):
def test_short_unicode_input(self): def test_short_unicode_input(self):
@ -139,7 +131,7 @@ class TestConstructor(SoupTest):
assert " an id " == a['id'] assert " an id " == a['id']
assert ["a", "class"] == a['class'] assert ["a", "class"] == a['class']
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
# you customize or disable this. As always, you can customize the TreeBuilder # you customize or disable this. As always, you can customize the TreeBuilder
# by passing in a keyword argument to the BeautifulSoup constructor. # by passing in a keyword argument to the BeautifulSoup constructor.
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
@ -219,10 +211,17 @@ class TestConstructor(SoupTest):
class TestWarnings(SoupTest): class TestWarnings(SoupTest):
# Note that some of the tests in this class create BeautifulSoup
# objects directly rather than using self.soup(). That's
# because SoupTest.soup is defined in a different file,
# which will throw off the assertion in _assert_warning
# that the code that triggered the warning is in the same
# file as the test.
def _assert_warning(self, warnings, cls): def _assert_warning(self, warnings, cls):
for w in warnings: for w in warnings:
if isinstance(w.message, cls): if isinstance(w.message, cls):
assert w.filename == __file__
return w return w
raise Exception("%s warning not found in %r" % (cls, warnings)) raise Exception("%s warning not found in %r" % (cls, warnings))
@ -243,13 +242,17 @@ class TestWarnings(SoupTest):
def test_no_warning_if_explicit_parser_specified(self): def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = BeautifulSoup("<a><b></b></a>", "html.parser") soup = self.soup("<a><b></b></a>")
assert [] == w assert [] == w
def test_parseOnlyThese_renamed_to_parse_only(self): def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b")) soup = BeautifulSoup(
msg = str(w[0].message) "<a><b></b></a>", "html.parser",
parseOnlyThese=SoupStrainer("b"),
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "parseOnlyThese" in msg assert "parseOnlyThese" in msg
assert "parse_only" in msg assert "parse_only" in msg
assert b"<b></b>" == soup.encode() assert b"<b></b>" == soup.encode()
@ -257,8 +260,11 @@ class TestWarnings(SoupTest):
def test_fromEncoding_renamed_to_from_encoding(self): def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9" utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8") soup = BeautifulSoup(
msg = str(w[0].message) utf8, "html.parser", fromEncoding="utf8"
)
warning = self._assert_warning(w, DeprecationWarning)
msg = str(warning.message)
assert "fromEncoding" in msg assert "fromEncoding" in msg
assert "from_encoding" in msg assert "from_encoding" in msg
assert "utf8" == soup.original_encoding assert "utf8" == soup.original_encoding
@ -276,7 +282,7 @@ class TestWarnings(SoupTest):
# A warning is issued if the "markup" looks like the name of # A warning is issued if the "markup" looks like the name of
# an HTML or text file, or a full path to a file on disk. # an HTML or text file, or a full path to a file on disk.
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = self.soup("markup" + extension) soup = BeautifulSoup("markup" + extension, "html.parser")
warning = self._assert_warning(w, MarkupResemblesLocatorWarning) warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
assert "looks more like a filename" in str(warning.message) assert "looks more like a filename" in str(warning.message)
@ -295,7 +301,7 @@ class TestWarnings(SoupTest):
def test_url_warning_with_bytes_url(self): def test_url_warning_with_bytes_url(self):
url = b"http://www.crummybytes.com/" url = b"http://www.crummybytes.com/"
with warnings.catch_warnings(record=True) as warning_list: with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(url) soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning( warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning warning_list, MarkupResemblesLocatorWarning
) )
@ -307,7 +313,7 @@ class TestWarnings(SoupTest):
with warnings.catch_warnings(record=True) as warning_list: with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise # note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning # python's warnings system swallows the second warning
soup = self.soup(url) soup = BeautifulSoup(url, "html.parser")
warning = self._assert_warning( warning = self._assert_warning(
warning_list, MarkupResemblesLocatorWarning warning_list, MarkupResemblesLocatorWarning
) )
@ -348,9 +354,12 @@ class TestNewTag(SoupTest):
assert dict(bar="baz", name="a name") == new_tag.attrs assert dict(bar="baz", name="a name") == new_tag.attrs
assert None == new_tag.parent assert None == new_tag.parent
def test_tag_inherits_self_closing_rules_from_builder(self): @pytest.mark.skipif(
if LXML_PRESENT: not LXML_PRESENT,
xml_soup = BeautifulSoup("", "lxml-xml") reason="lxml not installed, cannot parse XML document"
)
def test_xml_tag_inherits_self_closing_rules_from_builder(self):
xml_soup = BeautifulSoup("", "xml")
xml_br = xml_soup.new_tag("br") xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p") xml_p = xml_soup.new_tag("p")
@ -359,6 +368,7 @@ class TestNewTag(SoupTest):
assert b"<br/>" == xml_br.encode() assert b"<br/>" == xml_br.encode()
assert b"<p/>" == xml_p.encode() assert b"<p/>" == xml_p.encode()
def test_tag_inherits_self_closing_rules_from_builder(self):
html_soup = BeautifulSoup("", "html.parser") html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br") html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p") html_p = html_soup.new_tag("p")
@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest):
# The internal data structures can be encoded as UTF-8. # The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data) soup_from_unicode = self.soup(self.unicode_data)
assert soup_from_unicode.encode('utf-8') == self.utf8_data assert soup_from_unicode.encode('utf-8') == self.utf8_data
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")

View file

@ -33,7 +33,6 @@ from bs4.element import (
) )
from . import ( from . import (
SoupTest, SoupTest,
skipIf,
) )
class TestFind(SoupTest): class TestFind(SoupTest):
@ -910,12 +909,16 @@ class TestTreeModification(SoupTest):
soup.a.extend(l) soup.a.extend(l)
assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode() assert "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>" == soup.decode()
def test_extend_with_another_tags_contents(self): @pytest.mark.parametrize(
"get_tags", [lambda tag: tag, lambda tag: tag.contents]
)
def test_extend_with_another_tags_contents(self, get_tags):
data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>' data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
soup = self.soup(data) soup = self.soup(data)
d1 = soup.find('div', id='d1') d1 = soup.find('div', id='d1')
d2 = soup.find('div', id='d2') d2 = soup.find('div', id='d2')
d2.extend(d1) tags = get_tags(d1)
d2.extend(tags)
assert '<div id="d1"></div>' == d1.decode() assert '<div id="d1"></div>' == d1.decode()
assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode() assert '<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>' == d2.decode()
@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest):
class TestDeprecatedArguments(SoupTest): class TestDeprecatedArguments(SoupTest):
def test_find_type_method_string(self): @pytest.mark.parametrize(
"method_name", [
"find", "find_all", "find_parent", "find_parents",
"find_next", "find_all_next", "find_previous",
"find_all_previous", "find_next_sibling", "find_next_siblings",
"find_previous_sibling", "find_previous_siblings",
]
)
def test_find_type_method_string(self, method_name):
soup = self.soup("<a>some</a><b>markup</b>") soup = self.soup("<a>some</a><b>markup</b>")
method = getattr(soup.b, method_name)
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
[result] = soup.find_all(text='markup') method(text='markup')
assert result == 'markup' [warning] = w
assert result.parent.name == 'b' assert warning.filename == __file__
msg = str(w[0].message) msg = str(warning.message)
assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead." assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead."
def test_soupstrainer_constructor_string(self): def test_soupstrainer_constructor_string(self):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
strainer = SoupStrainer(text="text") strainer = SoupStrainer(text="text")
assert strainer.text == 'text' assert strainer.text == 'text'
msg = str(w[0].message) [warning] = w
msg = str(warning.message)
assert warning.filename == __file__
assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead." assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead."

View file

@ -25,13 +25,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. SOFTWARE.
""" """
from __future__ import annotations
from .__meta__ import __version__, __version_info__ # noqa: F401 from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp from . import css_parser as cp
from . import css_match as cm from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401 from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import] import bs4 # type: ignore[import]
from typing import Dict, Optional, Any, List, Iterator, Iterable from typing import Optional, Any, Iterator, Iterable
__all__ = ( __all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve', 'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@ -44,17 +45,14 @@ SoupSieve = cm.SoupSieve
def compile( # noqa: A001 def compile( # noqa: A001
pattern: str, pattern: str,
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> cm.SoupSieve: ) -> cm.SoupSieve:
"""Compile CSS pattern.""" """Compile CSS pattern."""
ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
if isinstance(pattern, SoupSieve): if isinstance(pattern, SoupSieve):
if flags: if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list") raise ValueError("Cannot process 'flags' argument on a compiled selector list")
@ -64,7 +62,12 @@ def compile( # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list") raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern return pattern
return cp._cached_css_compile(pattern, ns, cs, flags) return cp._cached_css_compile(
pattern,
ct.Namespaces(namespaces) if namespaces is not None else namespaces,
ct.CustomSelectors(custom) if custom is not None else custom,
flags
)
def purge() -> None: def purge() -> None:
@ -76,10 +79,10 @@ def purge() -> None:
def closest( def closest(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> 'bs4.Tag': ) -> 'bs4.Tag':
"""Match closest ancestor.""" """Match closest ancestor."""
@ -90,10 +93,10 @@ def closest(
def match( def match(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> bool: ) -> bool:
"""Match node.""" """Match node."""
@ -104,12 +107,12 @@ def match(
def filter( # noqa: A001 def filter( # noqa: A001
select: str, select: str,
iterable: Iterable['bs4.Tag'], iterable: Iterable['bs4.Tag'],
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> List['bs4.Tag']: ) -> list['bs4.Tag']:
"""Filter list of nodes.""" """Filter list of nodes."""
return compile(select, namespaces, flags, **kwargs).filter(iterable) return compile(select, namespaces, flags, **kwargs).filter(iterable)
@ -118,10 +121,10 @@ def filter( # noqa: A001
def select_one( def select_one(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> 'bs4.Tag': ) -> 'bs4.Tag':
"""Select a single tag.""" """Select a single tag."""
@ -132,13 +135,13 @@ def select_one(
def select( def select(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
limit: int = 0, limit: int = 0,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> List['bs4.Tag']: ) -> list['bs4.Tag']:
"""Select the specified tags.""" """Select the specified tags."""
return compile(select, namespaces, flags, **kwargs).select(tag, limit) return compile(select, namespaces, flags, **kwargs).select(tag, limit)
@ -147,11 +150,11 @@ def select(
def iselect( def iselect(
select: str, select: str,
tag: 'bs4.Tag', tag: 'bs4.Tag',
namespaces: Optional[Dict[str, str]] = None, namespaces: Optional[dict[str, str]] = None,
limit: int = 0, limit: int = 0,
flags: int = 0, flags: int = 0,
*, *,
custom: Optional[Dict[str, str]] = None, custom: Optional[dict[str, str]] = None,
**kwargs: Any **kwargs: Any
) -> Iterator['bs4.Tag']: ) -> Iterator['bs4.Tag']:
"""Iterate the specified tags.""" """Iterate the specified tags."""

View file

@ -1,4 +1,5 @@
"""Meta related things.""" """Meta related things."""
from __future__ import annotations
from collections import namedtuple from collections import namedtuple
import re import re
@ -83,7 +84,7 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
cls, cls,
major: int, minor: int, micro: int, release: str = "final", major: int, minor: int, micro: int, release: str = "final",
pre: int = 0, post: int = 0, dev: int = 0 pre: int = 0, post: int = 0, dev: int = 0
) -> "Version": ) -> Version:
"""Validate version info.""" """Validate version info."""
# Ensure all parts are positive integers. # Ensure all parts are positive integers.
@ -192,5 +193,5 @@ def parse_version(ver: str) -> Version:
return Version(major, minor, micro, release, pre, post, dev) return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 3, 2, "final", post=1) __version_info__ = Version(2, 4, 0, "final")
__version__ = __version_info__._get_canonical() __version__ = __version_info__._get_canonical()

View file

@ -1,11 +1,12 @@
"""CSS matcher.""" """CSS matcher."""
from __future__ import annotations
from datetime import datetime from datetime import datetime
from . import util from . import util
import re import re
from . import css_types as ct from . import css_types as ct
import unicodedata import unicodedata
import bs4 # type: ignore[import] import bs4 # type: ignore[import]
from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401
# Empty tag pattern (whitespace okay) # Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -64,12 +65,12 @@ class _FakeParent:
fake parent so we can traverse the root element as a child. fake parent so we can traverse the root element as a child.
""" """
def __init__(self, element: 'bs4.Tag') -> None: def __init__(self, element: bs4.Tag) -> None:
"""Initialize.""" """Initialize."""
self.contents = [element] self.contents = [element]
def __len__(self) -> 'bs4.PageElement': def __len__(self) -> bs4.PageElement:
"""Length.""" """Length."""
return len(self.contents) return len(self.contents)
@ -87,59 +88,59 @@ class _DocumentNav:
raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag))) raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
@staticmethod @staticmethod
def is_doc(obj: 'bs4.Tag') -> bool: def is_doc(obj: bs4.Tag) -> bool:
"""Is `BeautifulSoup` object.""" """Is `BeautifulSoup` object."""
return isinstance(obj, bs4.BeautifulSoup) return isinstance(obj, bs4.BeautifulSoup)
@staticmethod @staticmethod
def is_tag(obj: 'bs4.PageElement') -> bool: def is_tag(obj: bs4.PageElement) -> bool:
"""Is tag.""" """Is tag."""
return isinstance(obj, bs4.Tag) return isinstance(obj, bs4.Tag)
@staticmethod @staticmethod
def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is declaration.""" """Is declaration."""
return isinstance(obj, bs4.Declaration) return isinstance(obj, bs4.Declaration)
@staticmethod @staticmethod
def is_cdata(obj: 'bs4.PageElement') -> bool: def is_cdata(obj: bs4.PageElement) -> bool:
"""Is CDATA.""" """Is CDATA."""
return isinstance(obj, bs4.CData) return isinstance(obj, bs4.CData)
@staticmethod @staticmethod
def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
"""Is processing instruction.""" """Is processing instruction."""
return isinstance(obj, bs4.ProcessingInstruction) return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod @staticmethod
def is_navigable_string(obj: 'bs4.PageElement') -> bool: def is_navigable_string(obj: bs4.PageElement) -> bool:
"""Is navigable string.""" """Is navigable string."""
return isinstance(obj, bs4.NavigableString) return isinstance(obj, bs4.NavigableString)
@staticmethod @staticmethod
def is_special_string(obj: 'bs4.PageElement') -> bool: def is_special_string(obj: bs4.PageElement) -> bool:
"""Is special string.""" """Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
@classmethod @classmethod
def is_content_string(cls, obj: 'bs4.PageElement') -> bool: def is_content_string(cls, obj: bs4.PageElement) -> bool:
"""Check if node is content string.""" """Check if node is content string."""
return cls.is_navigable_string(obj) and not cls.is_special_string(obj) return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
@staticmethod @staticmethod
def create_fake_parent(el: 'bs4.Tag') -> _FakeParent: def create_fake_parent(el: bs4.Tag) -> _FakeParent:
"""Create fake parent for a given element.""" """Create fake parent for a given element."""
return _FakeParent(el) return _FakeParent(el)
@staticmethod @staticmethod
def is_xml_tree(el: 'bs4.Tag') -> bool: def is_xml_tree(el: bs4.Tag) -> bool:
"""Check if element (or document) is from a XML tree.""" """Check if element (or document) is from a XML tree."""
return bool(el._is_xml) return bool(el._is_xml)
def is_iframe(self, el: 'bs4.Tag') -> bool: def is_iframe(self, el: bs4.Tag) -> bool:
"""Check if element is an `iframe`.""" """Check if element is an `iframe`."""
return bool( return bool(
@ -147,7 +148,7 @@ class _DocumentNav:
self.is_html_tag(el) # type: ignore[attr-defined] self.is_html_tag(el) # type: ignore[attr-defined]
) )
def is_root(self, el: 'bs4.Tag') -> bool: def is_root(self, el: bs4.Tag) -> bool:
""" """
Return whether element is a root element. Return whether element is a root element.
@ -161,7 +162,7 @@ class _DocumentNav:
root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
return root return root
def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']: def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
"""Get contents or contents in reverse.""" """Get contents or contents in reverse."""
if not no_iframe or not self.is_iframe(el): if not no_iframe or not self.is_iframe(el):
for content in el.contents: for content in el.contents:
@ -169,12 +170,12 @@ class _DocumentNav:
def get_children( def get_children(
self, self,
el: 'bs4.Tag', el: bs4.Tag,
start: Optional[int] = None, start: Optional[int] = None,
reverse: bool = False, reverse: bool = False,
tags: bool = True, tags: bool = True,
no_iframe: bool = False no_iframe: bool = False
) -> Iterator['bs4.PageElement']: ) -> Iterator[bs4.PageElement]:
"""Get children.""" """Get children."""
if not no_iframe or not self.is_iframe(el): if not no_iframe or not self.is_iframe(el):
@ -195,10 +196,10 @@ class _DocumentNav:
def get_descendants( def get_descendants(
self, self,
el: 'bs4.Tag', el: bs4.Tag,
tags: bool = True, tags: bool = True,
no_iframe: bool = False no_iframe: bool = False
) -> Iterator['bs4.PageElement']: ) -> Iterator[bs4.PageElement]:
"""Get descendants.""" """Get descendants."""
if not no_iframe or not self.is_iframe(el): if not no_iframe or not self.is_iframe(el):
@ -229,7 +230,7 @@ class _DocumentNav:
if not tags or is_tag: if not tags or is_tag:
yield child yield child
def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag': def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
"""Get parent.""" """Get parent."""
parent = el.parent parent = el.parent
@ -238,25 +239,25 @@ class _DocumentNav:
return parent return parent
@staticmethod @staticmethod
def get_tag_name(el: 'bs4.Tag') -> Optional[str]: def get_tag_name(el: bs4.Tag) -> Optional[str]:
"""Get tag.""" """Get tag."""
return cast(Optional[str], el.name) return cast(Optional[str], el.name)
@staticmethod @staticmethod
def get_prefix_name(el: 'bs4.Tag') -> Optional[str]: def get_prefix_name(el: bs4.Tag) -> Optional[str]:
"""Get prefix.""" """Get prefix."""
return cast(Optional[str], el.prefix) return cast(Optional[str], el.prefix)
@staticmethod @staticmethod
def get_uri(el: 'bs4.Tag') -> Optional[str]: def get_uri(el: bs4.Tag) -> Optional[str]:
"""Get namespace `URI`.""" """Get namespace `URI`."""
return cast(Optional[str], el.namespace) return cast(Optional[str], el.namespace)
@classmethod @classmethod
def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get next sibling tag.""" """Get next sibling tag."""
sibling = el.next_sibling sibling = el.next_sibling
@ -265,7 +266,7 @@ class _DocumentNav:
return sibling return sibling
@classmethod @classmethod
def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
"""Get previous sibling tag.""" """Get previous sibling tag."""
sibling = el.previous_sibling sibling = el.previous_sibling
@ -274,7 +275,7 @@ class _DocumentNav:
return sibling return sibling
@staticmethod @staticmethod
def has_html_ns(el: 'bs4.Tag') -> bool: def has_html_ns(el: bs4.Tag) -> bool:
""" """
Check if element has an HTML namespace. Check if element has an HTML namespace.
@ -286,13 +287,13 @@ class _DocumentNav:
return bool(ns and ns == NS_XHTML) return bool(ns and ns == NS_XHTML)
@staticmethod @staticmethod
def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]: def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
"""Return namespace and attribute name without the prefix.""" """Return namespace and attribute name without the prefix."""
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
@classmethod @classmethod
def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]: def normalize_value(cls, value: Any) -> str | Sequence[str]:
"""Normalize the value to be a string or list of strings.""" """Normalize the value to be a string or list of strings."""
# Treat `None` as empty string. # Treat `None` as empty string.
@ -327,10 +328,10 @@ class _DocumentNav:
@classmethod @classmethod
def get_attribute_by_name( def get_attribute_by_name(
cls, cls,
el: 'bs4.Tag', el: bs4.Tag,
name: str, name: str,
default: Optional[Union[str, Sequence[str]]] = None default: Optional[str | Sequence[str]] = None
) -> Optional[Union[str, Sequence[str]]]: ) -> Optional[str | Sequence[str]]:
"""Get attribute by name.""" """Get attribute by name."""
value = default value = default
@ -347,14 +348,14 @@ class _DocumentNav:
return value return value
@classmethod @classmethod
def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]: def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
"""Iterate attributes.""" """Iterate attributes."""
for k, v in el.attrs.items(): for k, v in el.attrs.items():
yield k, cls.normalize_value(v) yield k, cls.normalize_value(v)
@classmethod @classmethod
def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]: def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
"""Get classes.""" """Get classes."""
classes = cls.get_attribute_by_name(el, 'class', []) classes = cls.get_attribute_by_name(el, 'class', [])
@ -362,14 +363,14 @@ class _DocumentNav:
classes = RE_NOT_WS.findall(classes) classes = RE_NOT_WS.findall(classes)
return cast(Sequence[str], classes) return cast(Sequence[str], classes)
def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str: def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
"""Get text.""" """Get text."""
return ''.join( return ''.join(
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
) )
def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]: def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
"""Get Own Text.""" """Get Own Text."""
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
@ -423,10 +424,10 @@ class Inputs:
return 0 <= minutes <= 59 return 0 <= minutes <= 59
@classmethod @classmethod
def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]: def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
"""Parse the input value.""" """Parse the input value."""
parsed = None # type: Optional[Tuple[float, ...]] parsed = None # type: Optional[tuple[float, ...]]
if value is None: if value is None:
return value return value
if itype == "date": if itype == "date":
@ -484,7 +485,7 @@ class CSSMatch(_DocumentNav):
def __init__( def __init__(
self, self,
selectors: ct.SelectorList, selectors: ct.SelectorList,
scope: 'bs4.Tag', scope: bs4.Tag,
namespaces: Optional[ct.Namespaces], namespaces: Optional[ct.Namespaces],
flags: int flags: int
) -> None: ) -> None:
@ -492,11 +493,11 @@ class CSSMatch(_DocumentNav):
self.assert_valid_input(scope) self.assert_valid_input(scope)
self.tag = scope self.tag = scope
self.cached_meta_lang = [] # type: List[Tuple[str, str]] self.cached_meta_lang = [] # type: list[tuple[str, str]]
self.cached_default_forms = [] # type: List[Tuple['bs4.Tag', 'bs4.Tag']] self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
self.cached_indeterminate_forms = [] # type: List[Tuple['bs4.Tag', str, bool]] self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
self.selectors = selectors self.selectors = selectors
self.namespaces = {} if namespaces is None else namespaces # type: Union[ct.Namespaces, Dict[str, str]] self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
self.flags = flags self.flags = flags
self.iframe_restrict = False self.iframe_restrict = False
@ -527,7 +528,7 @@ class CSSMatch(_DocumentNav):
return self.is_xml or self.has_html_namespace return self.is_xml or self.has_html_namespace
def get_tag_ns(self, el: 'bs4.Tag') -> str: def get_tag_ns(self, el: bs4.Tag) -> str:
"""Get tag namespace.""" """Get tag namespace."""
if self.supports_namespaces(): if self.supports_namespaces():
@ -539,24 +540,24 @@ class CSSMatch(_DocumentNav):
namespace = NS_XHTML namespace = NS_XHTML
return namespace return namespace
def is_html_tag(self, el: 'bs4.Tag') -> bool: def is_html_tag(self, el: bs4.Tag) -> bool:
"""Check if tag is in HTML namespace.""" """Check if tag is in HTML namespace."""
return self.get_tag_ns(el) == NS_XHTML return self.get_tag_ns(el) == NS_XHTML
def get_tag(self, el: 'bs4.Tag') -> Optional[str]: def get_tag(self, el: bs4.Tag) -> Optional[str]:
"""Get tag.""" """Get tag."""
name = self.get_tag_name(el) name = self.get_tag_name(el)
return util.lower(name) if name is not None and not self.is_xml else name return util.lower(name) if name is not None and not self.is_xml else name
def get_prefix(self, el: 'bs4.Tag') -> Optional[str]: def get_prefix(self, el: bs4.Tag) -> Optional[str]:
"""Get prefix.""" """Get prefix."""
prefix = self.get_prefix_name(el) prefix = self.get_prefix_name(el)
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: 'bs4.Tag') -> Optional[int]: def find_bidi(self, el: bs4.Tag) -> Optional[int]:
"""Get directionality from element text.""" """Get directionality from element text."""
for node in self.get_children(el, tags=False): for node in self.get_children(el, tags=False):
@ -600,13 +601,18 @@ class CSSMatch(_DocumentNav):
ranges = lang_range.split('-') ranges = lang_range.split('-')
subtags = lang_tag.lower().split('-') subtags = lang_tag.lower().split('-')
length = len(ranges) length = len(ranges)
slength = len(subtags)
rindex = 0 rindex = 0
sindex = 0 sindex = 0
r = ranges[rindex] r = ranges[rindex]
s = subtags[sindex] s = subtags[sindex]
# Empty specified language should match unspecified language attributes
if length == 1 and slength == 1 and not r and r == s:
return True
# Primary tag needs to match # Primary tag needs to match
if r != '*' and r != s: if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
match = False match = False
rindex += 1 rindex += 1
@ -645,10 +651,10 @@ class CSSMatch(_DocumentNav):
def match_attribute_name( def match_attribute_name(
self, self,
el: 'bs4.Tag', el: bs4.Tag,
attr: str, attr: str,
prefix: Optional[str] prefix: Optional[str]
) -> Optional[Union[str, Sequence[str]]]: ) -> Optional[str | Sequence[str]]:
"""Match attribute name and return value if it exists.""" """Match attribute name and return value if it exists."""
value = None value = None
@ -696,7 +702,7 @@ class CSSMatch(_DocumentNav):
break break
return value return value
def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool: def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match the namespace of the element.""" """Match the namespace of the element."""
match = True match = True
@ -717,7 +723,7 @@ class CSSMatch(_DocumentNav):
match = False match = False
return match return match
def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool: def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
"""Match attributes.""" """Match attributes."""
match = True match = True
@ -736,7 +742,7 @@ class CSSMatch(_DocumentNav):
break break
return match return match
def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool: def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
"""Match tag name.""" """Match tag name."""
name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
@ -745,7 +751,7 @@ class CSSMatch(_DocumentNav):
name not in (self.get_tag(el), '*') name not in (self.get_tag(el), '*')
) )
def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool: def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
"""Match the tag.""" """Match the tag."""
match = True match = True
@ -757,7 +763,7 @@ class CSSMatch(_DocumentNav):
match = False match = False
return match return match
def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool: def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match past relationship.""" """Match past relationship."""
found = False found = False
@ -785,12 +791,12 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
return found return found
def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool: def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
"""Match future child.""" """Match future child."""
match = False match = False
if recursive: if recursive:
children = self.get_descendants # type: Callable[..., Iterator['bs4.Tag']] children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
else: else:
children = self.get_children children = self.get_children
for child in children(parent, no_iframe=self.iframe_restrict): for child in children(parent, no_iframe=self.iframe_restrict):
@ -799,7 +805,7 @@ class CSSMatch(_DocumentNav):
break break
return match return match
def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool: def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match future relationship.""" """Match future relationship."""
found = False found = False
@ -822,7 +828,7 @@ class CSSMatch(_DocumentNav):
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
return found return found
def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool: def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
"""Match relationship to other elements.""" """Match relationship to other elements."""
found = False found = False
@ -837,7 +843,7 @@ class CSSMatch(_DocumentNav):
return found return found
def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool: def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
"""Match element's ID.""" """Match element's ID."""
found = True found = True
@ -847,7 +853,7 @@ class CSSMatch(_DocumentNav):
break break
return found return found
def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool: def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
"""Match element's classes.""" """Match element's classes."""
current_classes = self.get_classes(el) current_classes = self.get_classes(el)
@ -858,7 +864,7 @@ class CSSMatch(_DocumentNav):
break break
return found return found
def match_root(self, el: 'bs4.Tag') -> bool: def match_root(self, el: bs4.Tag) -> bool:
"""Match element as root.""" """Match element as root."""
is_root = self.is_root(el) is_root = self.is_root(el)
@ -884,12 +890,12 @@ class CSSMatch(_DocumentNav):
sibling = self.get_next(sibling, tags=False) sibling = self.get_next(sibling, tags=False)
return is_root return is_root
def match_scope(self, el: 'bs4.Tag') -> bool: def match_scope(self, el: bs4.Tag) -> bool:
"""Match element as scope.""" """Match element as scope."""
return self.scope is el return self.scope is el
def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool: def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
"""Match tag type for `nth` matches.""" """Match tag type for `nth` matches."""
return ( return (
@ -897,7 +903,7 @@ class CSSMatch(_DocumentNav):
(self.get_tag_ns(child) == self.get_tag_ns(el)) (self.get_tag_ns(child) == self.get_tag_ns(el))
) )
def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool: def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
"""Match `nth` elements.""" """Match `nth` elements."""
matched = True matched = True
@ -998,7 +1004,7 @@ class CSSMatch(_DocumentNav):
break break
return matched return matched
def match_empty(self, el: 'bs4.Tag') -> bool: def match_empty(self, el: bs4.Tag) -> bool:
"""Check if element is empty (if requested).""" """Check if element is empty (if requested)."""
is_empty = True is_empty = True
@ -1011,7 +1017,7 @@ class CSSMatch(_DocumentNav):
break break
return is_empty return is_empty
def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool: def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
"""Match selectors.""" """Match selectors."""
match = True match = True
@ -1020,11 +1026,11 @@ class CSSMatch(_DocumentNav):
match = False match = False
return match return match
def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool: def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
"""Match element if it contains text.""" """Match element if it contains text."""
match = True match = True
content = None # type: Optional[Union[str, Sequence[str]]] content = None # type: Optional[str | Sequence[str]]
for contain_list in contains: for contain_list in contains:
if content is None: if content is None:
if contain_list.own: if contain_list.own:
@ -1048,7 +1054,7 @@ class CSSMatch(_DocumentNav):
match = False match = False
return match return match
def match_default(self, el: 'bs4.Tag') -> bool: def match_default(self, el: bs4.Tag) -> bool:
"""Match default.""" """Match default."""
match = False match = False
@ -1087,13 +1093,13 @@ class CSSMatch(_DocumentNav):
break break
return match return match
def match_indeterminate(self, el: 'bs4.Tag') -> bool: def match_indeterminate(self, el: bs4.Tag) -> bool:
"""Match default.""" """Match default."""
match = False match = False
name = cast(str, self.get_attribute_by_name(el, 'name')) name = cast(str, self.get_attribute_by_name(el, 'name'))
def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']: def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
"""Find this input's form.""" """Find this input's form."""
form = None form = None
parent = self.get_parent(el, no_iframe=True) parent = self.get_parent(el, no_iframe=True)
@ -1148,7 +1154,7 @@ class CSSMatch(_DocumentNav):
return match return match
def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool: def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
"""Match languages.""" """Match languages."""
match = False match = False
@ -1183,7 +1189,7 @@ class CSSMatch(_DocumentNav):
break break
# Use cached meta language. # Use cached meta language.
if not found_lang and self.cached_meta_lang: if found_lang is None and self.cached_meta_lang:
for cache in self.cached_meta_lang: for cache in self.cached_meta_lang:
if root is cache[0]: if root is cache[0]:
found_lang = cache[1] found_lang = cache[1]
@ -1217,13 +1223,13 @@ class CSSMatch(_DocumentNav):
found_lang = content found_lang = content
self.cached_meta_lang.append((cast(str, root), cast(str, found_lang))) self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
break break
if found_lang: if found_lang is not None:
break break
if not found_lang: if found_lang is None:
self.cached_meta_lang.append((cast(str, root), '')) self.cached_meta_lang.append((cast(str, root), ''))
# If we determined a language, compare. # If we determined a language, compare.
if found_lang: if found_lang is not None:
for patterns in langs: for patterns in langs:
match = False match = False
for pattern in patterns: for pattern in patterns:
@ -1234,7 +1240,7 @@ class CSSMatch(_DocumentNav):
return match return match
def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool: def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
"""Check directionality.""" """Check directionality."""
# If we have to match both left and right, we can't match either. # If we have to match both left and right, we can't match either.
@ -1297,7 +1303,7 @@ class CSSMatch(_DocumentNav):
# Match parents direction # Match parents direction
return self.match_dir(self.get_parent(el, no_iframe=True), directionality) return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
def match_range(self, el: 'bs4.Tag', condition: int) -> bool: def match_range(self, el: bs4.Tag, condition: int) -> bool:
""" """
Match range. Match range.
@ -1337,7 +1343,7 @@ class CSSMatch(_DocumentNav):
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
def match_defined(self, el: 'bs4.Tag') -> bool: def match_defined(self, el: bs4.Tag) -> bool:
""" """
Match defined. Match defined.
@ -1360,7 +1366,7 @@ class CSSMatch(_DocumentNav):
) )
) )
def match_placeholder_shown(self, el: 'bs4.Tag') -> bool: def match_placeholder_shown(self, el: bs4.Tag) -> bool:
""" """
Match placeholder shown according to HTML spec. Match placeholder shown according to HTML spec.
@ -1375,7 +1381,7 @@ class CSSMatch(_DocumentNav):
return match return match
def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool: def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
"""Check if element matches one of the selectors.""" """Check if element matches one of the selectors."""
match = False match = False
@ -1459,7 +1465,7 @@ class CSSMatch(_DocumentNav):
return match return match
def select(self, limit: int = 0) -> Iterator['bs4.Tag']: def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
"""Match all tags under the targeted tag.""" """Match all tags under the targeted tag."""
lim = None if limit < 1 else limit lim = None if limit < 1 else limit
@ -1472,7 +1478,7 @@ class CSSMatch(_DocumentNav):
if lim < 1: if lim < 1:
break break
def closest(self) -> Optional['bs4.Tag']: def closest(self) -> Optional[bs4.Tag]:
"""Match closest ancestor.""" """Match closest ancestor."""
current = self.tag current = self.tag
@ -1484,12 +1490,12 @@ class CSSMatch(_DocumentNav):
current = self.get_parent(current) current = self.get_parent(current)
return closest return closest
def filter(self) -> List['bs4.Tag']: # noqa A001 def filter(self) -> list[bs4.Tag]: # noqa A001
"""Filter tag's children.""" """Filter tag's children."""
return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
def match(self, el: 'bs4.Tag') -> bool: def match(self, el: bs4.Tag) -> bool:
"""Match.""" """Match."""
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
@ -1501,7 +1507,7 @@ class SoupSieve(ct.Immutable):
pattern: str pattern: str
selectors: ct.SelectorList selectors: ct.SelectorList
namespaces: Optional[ct.Namespaces] namespaces: Optional[ct.Namespaces]
custom: Dict[str, str] custom: dict[str, str]
flags: int flags: int
__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
@ -1524,17 +1530,17 @@ class SoupSieve(ct.Immutable):
flags=flags flags=flags
) )
def match(self, tag: 'bs4.Tag') -> bool: def match(self, tag: bs4.Tag) -> bool:
"""Match.""" """Match."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag': def closest(self, tag: bs4.Tag) -> bs4.Tag:
"""Match closest ancestor.""" """Match closest ancestor."""
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001 def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
""" """
Filter. Filter.
@ -1551,18 +1557,18 @@ class SoupSieve(ct.Immutable):
else: else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag': def select_one(self, tag: bs4.Tag) -> bs4.Tag:
"""Select a single tag.""" """Select a single tag."""
tags = self.select(tag, limit=1) tags = self.select(tag, limit=1)
return tags[0] if tags else None return tags[0] if tags else None
def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']: def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
"""Select the specified tags.""" """Select the specified tags."""
return list(self.iselect(tag, limit)) return list(self.iselect(tag, limit))
def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']: def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
"""Iterate the specified tags.""" """Iterate the specified tags."""
for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):

View file

@ -1,4 +1,5 @@
"""CSS selector parser.""" """CSS selector parser."""
from __future__ import annotations
import re import re
from functools import lru_cache from functools import lru_cache
from . import util from . import util
@ -6,7 +7,7 @@ from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import SelectorSyntaxError from .util import SelectorSyntaxError
import warnings import warnings
from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast from typing import Optional, Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -232,7 +233,7 @@ def _purge_cache() -> None:
_cached_css_compile.cache_clear() _cached_css_compile.cache_clear()
def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]: def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
"""Process custom.""" """Process custom."""
custom_selectors = {} custom_selectors = {}
@ -325,7 +326,7 @@ class SelectorPattern:
class SpecialPseudoPattern(SelectorPattern): class SpecialPseudoPattern(SelectorPattern):
"""Selector pattern.""" """Selector pattern."""
def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None: def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
"""Initialize.""" """Initialize."""
self.patterns = {} self.patterns = {}
@ -372,19 +373,19 @@ class _Selector:
"""Initialize.""" """Initialize."""
self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag] self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
self.ids = kwargs.get('ids', []) # type: List[str] self.ids = kwargs.get('ids', []) # type: list[str]
self.classes = kwargs.get('classes', []) # type: List[str] self.classes = kwargs.get('classes', []) # type: list[str]
self.attributes = kwargs.get('attributes', []) # type: List[ct.SelectorAttribute] self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) # type: List[ct.SelectorNth] self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) # type: List[ct.SelectorList] self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
self.relations = kwargs.get('relations', []) # type: List[_Selector] self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) # type: Optional[str] self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
self.contains = kwargs.get('contains', []) # type: List[ct.SelectorContains] self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
self.lang = kwargs.get('lang', []) # type: List[ct.SelectorLang] self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) # type: int self.flags = kwargs.get('flags', 0) # type: int
self.no_match = kwargs.get('no_match', False) # type: bool self.no_match = kwargs.get('no_match', False) # type: bool
def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList: def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
"""Freeze relation.""" """Freeze relation."""
if relations: if relations:
@ -394,7 +395,7 @@ class _Selector:
else: else:
return ct.SelectorList() return ct.SelectorList()
def freeze(self) -> Union[ct.Selector, ct.SelectorNull]: def freeze(self) -> ct.Selector | ct.SelectorNull:
"""Freeze self.""" """Freeze self."""
if self.no_match: if self.no_match:
@ -461,7 +462,7 @@ class CSSParser:
def __init__( def __init__(
self, self,
selector: str, selector: str,
custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None, custom: Optional[dict[str, str | ct.SelectorList]] = None,
flags: int = 0 flags: int = 0
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
@ -583,9 +584,9 @@ class CSSParser:
sel: _Selector, sel: _Selector,
m: Match[str], m: Match[str],
has_selector: bool, has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]], iselector: Iterator[tuple[str, Match[str]]],
is_html: bool is_html: bool
) -> Tuple[bool, bool]: ) -> tuple[bool, bool]:
"""Parse pseudo class.""" """Parse pseudo class."""
complex_pseudo = False complex_pseudo = False
@ -678,7 +679,7 @@ class CSSParser:
sel: _Selector, sel: _Selector,
m: Match[str], m: Match[str],
has_selector: bool, has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]] iselector: Iterator[tuple[str, Match[str]]]
) -> bool: ) -> bool:
"""Parse `nth` pseudo.""" """Parse `nth` pseudo."""
@ -743,7 +744,7 @@ class CSSParser:
sel: _Selector, sel: _Selector,
name: str, name: str,
has_selector: bool, has_selector: bool,
iselector: Iterator[Tuple[str, Match[str]]], iselector: Iterator[tuple[str, Match[str]]],
index: int index: int
) -> bool: ) -> bool:
"""Parse pseudo with opening bracket.""" """Parse pseudo with opening bracket."""
@ -752,7 +753,7 @@ class CSSParser:
if name == ':not': if name == ':not':
flags |= FLG_NOT flags |= FLG_NOT
elif name == ':has': elif name == ':has':
flags |= FLG_RELATIVE | FLG_FORGIVE flags |= FLG_RELATIVE
elif name in (':where', ':is'): elif name in (':where', ':is'):
flags |= FLG_FORGIVE flags |= FLG_FORGIVE
@ -766,21 +767,16 @@ class CSSParser:
sel: _Selector, sel: _Selector,
m: Match[str], m: Match[str],
has_selector: bool, has_selector: bool,
selectors: List[_Selector], selectors: list[_Selector],
rel_type: str, rel_type: str,
index: int index: int
) -> Tuple[bool, _Selector, str]: ) -> tuple[bool, _Selector, str]:
"""Parse combinator tokens.""" """Parse combinator tokens."""
combinator = m.group('relation').strip() combinator = m.group('relation').strip()
if not combinator: if not combinator:
combinator = WS_COMBINATOR combinator = WS_COMBINATOR
if combinator == COMMA_COMBINATOR: if combinator == COMMA_COMBINATOR:
if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
sel.no_match = True
sel.rel_type = rel_type sel.rel_type = rel_type
selectors[-1].relations.append(sel) selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR rel_type = ":" + WS_COMBINATOR
@ -814,12 +810,12 @@ class CSSParser:
sel: _Selector, sel: _Selector,
m: Match[str], m: Match[str],
has_selector: bool, has_selector: bool,
selectors: List[_Selector], selectors: list[_Selector],
relations: List[_Selector], relations: list[_Selector],
is_pseudo: bool, is_pseudo: bool,
is_forgive: bool, is_forgive: bool,
index: int index: int
) -> Tuple[bool, _Selector]: ) -> tuple[bool, _Selector]:
"""Parse combinator tokens.""" """Parse combinator tokens."""
combinator = m.group('relation').strip() combinator = m.group('relation').strip()
@ -924,7 +920,7 @@ class CSSParser:
def parse_selectors( def parse_selectors(
self, self,
iselector: Iterator[Tuple[str, Match[str]]], iselector: Iterator[tuple[str, Match[str]]],
index: int = 0, index: int = 0,
flags: int = 0 flags: int = 0
) -> ct.SelectorList: ) -> ct.SelectorList:
@ -935,7 +931,7 @@ class CSSParser:
selectors = [] selectors = []
has_selector = False has_selector = False
closed = False closed = False
relations = [] # type: List[_Selector] relations = [] # type: list[_Selector]
rel_type = ":" + WS_COMBINATOR rel_type = ":" + WS_COMBINATOR
# Setup various flags # Setup various flags
@ -1069,18 +1065,8 @@ class CSSParser:
selectors.append(sel) selectors.append(sel)
# Forgive empty slots in pseudo-classes that have lists (and are forgiving) # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive: elif is_forgive and (not selectors or not relations):
if is_relative: # Handle normal pseudo-classes with empty slots like `:is()` etc.
# Handle relative selectors pseudo-classes with empty slots like `:has()`
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
sel.rel_type = rel_type
sel.no_match = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
sel.no_match = True sel.no_match = True
del relations[:] del relations[:]
selectors.append(sel) selectors.append(sel)
@ -1112,7 +1098,7 @@ class CSSParser:
# Return selector list # Return selector list
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]: def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
"""Iterate selector tokens.""" """Iterate selector tokens."""
# Ignore whitespace and comments at start and end of pattern # Ignore whitespace and comments at start and end of pattern

View file

@ -1,7 +1,8 @@
"""CSS selector structure items.""" """CSS selector structure items."""
from __future__ import annotations
import copyreg import copyreg
from .pretty import pretty from .pretty import pretty
from typing import Any, Type, Tuple, Union, Dict, Iterator, Hashable, Optional, Pattern, Iterable, Mapping from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
__all__ = ( __all__ = (
'Selector', 'Selector',
@ -33,7 +34,7 @@ SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable: class Immutable:
"""Immutable.""" """Immutable."""
__slots__: Tuple[str, ...] = ('_hash',) __slots__: tuple[str, ...] = ('_hash',)
_hash: int _hash: int
@ -48,7 +49,7 @@ class Immutable:
super(Immutable, self).__setattr__('_hash', hash(tuple(temp))) super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
@classmethod @classmethod
def __base__(cls) -> "Type[Immutable]": def __base__(cls) -> "type[Immutable]":
"""Get base class.""" """Get base class."""
return cls return cls
@ -99,7 +100,7 @@ class ImmutableDict(Mapping[Any, Any]):
def __init__( def __init__(
self, self,
arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]] arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
@ -107,7 +108,7 @@ class ImmutableDict(Mapping[Any, Any]):
self._d = dict(arg) self._d = dict(arg)
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())])) self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
def _validate(self, arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]) -> None: def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
"""Validate arguments.""" """Validate arguments."""
if isinstance(arg, dict): if isinstance(arg, dict):
@ -147,12 +148,12 @@ class ImmutableDict(Mapping[Any, Any]):
class Namespaces(ImmutableDict): class Namespaces(ImmutableDict):
"""Namespaces.""" """Namespaces."""
def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None: def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize.""" """Initialize."""
super().__init__(arg) super().__init__(arg)
def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None: def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments.""" """Validate arguments."""
if isinstance(arg, dict): if isinstance(arg, dict):
@ -165,12 +166,12 @@ class Namespaces(ImmutableDict):
class CustomSelectors(ImmutableDict): class CustomSelectors(ImmutableDict):
"""Custom selectors.""" """Custom selectors."""
def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None: def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize.""" """Initialize."""
super().__init__(arg) super().__init__(arg)
def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None: def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments.""" """Validate arguments."""
if isinstance(arg, dict): if isinstance(arg, dict):
@ -188,30 +189,30 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash' 'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
) )
tag: Optional['SelectorTag'] tag: Optional[SelectorTag]
ids: Tuple[str, ...] ids: tuple[str, ...]
classes: Tuple[str, ...] classes: tuple[str, ...]
attributes: Tuple['SelectorAttribute', ...] attributes: tuple[SelectorAttribute, ...]
nth: Tuple['SelectorNth', ...] nth: tuple[SelectorNth, ...]
selectors: Tuple['SelectorList', ...] selectors: tuple[SelectorList, ...]
relation: 'SelectorList' relation: SelectorList
rel_type: Optional[str] rel_type: Optional[str]
contains: Tuple['SelectorContains', ...] contains: tuple[SelectorContains, ...]
lang: Tuple['SelectorLang', ...] lang: tuple[SelectorLang, ...]
flags: int flags: int
def __init__( def __init__(
self, self,
tag: Optional['SelectorTag'], tag: Optional[SelectorTag],
ids: Tuple[str, ...], ids: tuple[str, ...],
classes: Tuple[str, ...], classes: tuple[str, ...],
attributes: Tuple['SelectorAttribute', ...], attributes: tuple[SelectorAttribute, ...],
nth: Tuple['SelectorNth', ...], nth: tuple[SelectorNth, ...],
selectors: Tuple['SelectorList', ...], selectors: tuple[SelectorList, ...],
relation: 'SelectorList', relation: SelectorList,
rel_type: Optional[str], rel_type: Optional[str],
contains: Tuple['SelectorContains', ...], contains: tuple[SelectorContains, ...],
lang: Tuple['SelectorLang', ...], lang: tuple[SelectorLang, ...],
flags: int flags: int
): ):
"""Initialize.""" """Initialize."""
@ -286,7 +287,7 @@ class SelectorContains(Immutable):
__slots__ = ("text", "own", "_hash") __slots__ = ("text", "own", "_hash")
text: Tuple[str, ...] text: tuple[str, ...]
own: bool own: bool
def __init__(self, text: Iterable[str], own: bool) -> None: def __init__(self, text: Iterable[str], own: bool) -> None:
@ -305,9 +306,9 @@ class SelectorNth(Immutable):
b: int b: int
of_type: bool of_type: bool
last: bool last: bool
selectors: 'SelectorList' selectors: SelectorList
def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: 'SelectorList') -> None: def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None:
"""Initialize.""" """Initialize."""
super().__init__( super().__init__(
@ -325,7 +326,7 @@ class SelectorLang(Immutable):
__slots__ = ("languages", "_hash",) __slots__ = ("languages", "_hash",)
languages: Tuple[str, ...] languages: tuple[str, ...]
def __init__(self, languages: Iterable[str]): def __init__(self, languages: Iterable[str]):
"""Initialize.""" """Initialize."""
@ -353,13 +354,13 @@ class SelectorList(Immutable):
__slots__ = ("selectors", "is_not", "is_html", "_hash") __slots__ = ("selectors", "is_not", "is_html", "_hash")
selectors: Tuple[Union['Selector', 'SelectorNull'], ...] selectors: tuple[Selector | SelectorNull, ...]
is_not: bool is_not: bool
is_html: bool is_html: bool
def __init__( def __init__(
self, self,
selectors: Optional[Iterable[Union['Selector', 'SelectorNull']]] = None, selectors: Optional[Iterable[Selector | SelectorNull]] = None,
is_not: bool = False, is_not: bool = False,
is_html: bool = False is_html: bool = False
) -> None: ) -> None:
@ -371,7 +372,7 @@ class SelectorList(Immutable):
is_html=is_html is_html=is_html
) )
def __iter__(self) -> Iterator[Union['Selector', 'SelectorNull']]: def __iter__(self) -> Iterator[Selector | SelectorNull]:
"""Iterator.""" """Iterator."""
return iter(self.selectors) return iter(self.selectors)
@ -381,7 +382,7 @@ class SelectorList(Immutable):
return len(self.selectors) return len(self.selectors)
def __getitem__(self, index: int) -> Union['Selector', 'SelectorNull']: def __getitem__(self, index: int) -> Selector | SelectorNull:
"""Get item.""" """Get item."""
return self.selectors[index] return self.selectors[index]

View file

@ -65,6 +65,7 @@ SelectorList(
is_html=False) is_html=False)
``` ```
""" """
from __future__ import annotations
import re import re
from typing import Any from typing import Any

View file

@ -1,8 +1,9 @@
"""Utility.""" """Utility."""
from __future__ import annotations
from functools import wraps, lru_cache from functools import wraps, lru_cache
import warnings import warnings
import re import re
from typing import Callable, Any, Optional, Tuple, List from typing import Callable, Any, Optional
DEBUG = 0x00001 DEBUG = 0x00001
@ -75,13 +76,13 @@ def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no co
) )
def get_pattern_context(pattern: str, index: int) -> Tuple[str, int, int]: def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
"""Get the pattern context.""" """Get the pattern context."""
last = 0 last = 0
current_line = 1 current_line = 1
col = 1 col = 1
text = [] # type: List[str] text = [] # type: list[str]
line = 1 line = 1
offset = None # type: Optional[int] offset = None # type: Optional[int]

View file

@ -4,7 +4,7 @@ arrow==1.2.3
backports.csv==1.0.7 backports.csv==1.0.7
backports.functools-lru-cache==1.6.4 backports.functools-lru-cache==1.6.4
backports.zoneinfo==0.2.1;python_version<"3.9" backports.zoneinfo==0.2.1;python_version<"3.9"
beautifulsoup4==4.11.1 beautifulsoup4==4.11.2
bleach==6.0.0 bleach==6.0.0
certifi==2022.12.7 certifi==2022.12.7
cheroot==9.0.0 cheroot==9.0.0