diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index b3c9feb8..db71cc7c 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -7,7 +7,7 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. -Beautiful Soup works with Python 3.5 and up. It works better if lxml +Beautiful Soup works with Python 3.6 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the @@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.11.1" -__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" +__version__ = "4.11.2" +__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -211,7 +211,7 @@ class BeautifulSoup(Tag): warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name), - DeprecationWarning + DeprecationWarning, stacklevel=3 ) return kwargs.pop(old_name) return None @@ -405,7 +405,8 @@ class BeautifulSoup(Tag): 'The input looks more like a URL than markup. You may want to use' ' an HTTP client like requests to get the document behind' ' the URL, and feed that document to Beautiful Soup.', - MarkupResemblesLocatorWarning + MarkupResemblesLocatorWarning, + stacklevel=3 ) return True return False @@ -436,7 +437,7 @@ class BeautifulSoup(Tag): 'The input looks more like a filename than markup. You may' ' want to open this file and pass the filehandle into' ' Beautiful Soup.', - MarkupResemblesLocatorWarning + MarkupResemblesLocatorWarning, stacklevel=3 ) return True return False @@ -789,7 +790,7 @@ class BeautifulStoneSoup(BeautifulSoup): warnings.warn( 'The BeautifulStoneSoup class is deprecated. Instead of using ' 'it, pass features="xml" into the BeautifulSoup constructor.', - DeprecationWarning + DeprecationWarning, stacklevel=2 ) super(BeautifulStoneSoup, self).__init__(*args, **kwargs) diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 9f789f3e..2e397458 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -122,7 +122,7 @@ class TreeBuilder(object): # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - DEFAULT_CDATA_LIST_ATTRIBUTES = {} + DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list) # Whitespace should be preserved inside these tags. DEFAULT_PRESERVE_WHITESPACE_TAGS = set() diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 58bc176e..dac21732 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -70,7 +70,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # ATM because the html5lib TreeBuilder doesn't use # UnicodeDammit. if exclude_encodings: - warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + warnings.warn( + "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.", + stacklevel=3 + ) # html5lib only parses HTML, so if it's given XML that's worth # noting. @@ -81,7 +84,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # These methods are defined by Beautiful Soup. 
def feed(self, markup): if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + warnings.warn( + "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.", + stacklevel=4 + ) parser = html5lib.HTMLParser(tree=self.create_treebuilder) self.underlying_builder.parser = parser extra_kwargs = dict() @@ -249,9 +255,9 @@ class AttrList(object): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. list_attr = self.element.cdata_list_attributes or {} - if (name in list_attr.get('*') + if (name in list_attr.get('*', []) or (self.element.name in list_attr - and name in list_attr[self.element.name])): + and name in list_attr.get(self.element.name, []))): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index fae4d0f2..e48b6a0e 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -10,30 +10,9 @@ __all__ = [ from html.parser import HTMLParser -try: - from html.parser import HTMLParseError -except ImportError as e: - # HTMLParseError is removed in Python 3.5. Since it can never be - # thrown in 3.5, we can just define our own class as a placeholder. - class HTMLParseError(Exception): - pass - import sys import warnings -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 -CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 -CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 - - from bs4.element import ( CData, Comment, @@ -90,20 +69,7 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): self.already_closed_empty_element = [] self._initialize_xml_detector() - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although - this requirement doesn't appear to be documented. - In Python 2, HTMLParser implements error() by raising an exception, - which we don't want to do. - - In any event, this method is called only on very strange - markup and our best strategy is to pretend it didn't happen - and keep going. - """ - warnings.warn(msg) - def handle_startendtag(self, name, attrs): """Handle an incoming empty-element tag. @@ -203,9 +169,10 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): :param name: Character number, possibly in hexadecimal. """ - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed in all supported versions. - # http://bugs.python.org/issue13633 + # TODO: This was originally a workaround for a bug in + # HTMLParser. (http://bugs.python.org/issue13633) The bug has + # been fixed, but removing this code still makes some + # Beautiful Soup tests fail. This needs investigation. 
if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -333,10 +300,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser_args = parser_args or [] parser_kwargs = parser_kwargs or {} parser_kwargs.update(extra_parser_kwargs) - if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - parser_kwargs['strict'] = False - if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - parser_kwargs['convert_charrefs'] = False + parser_kwargs['convert_charrefs'] = False self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, @@ -395,105 +359,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup - try: - parser.feed(markup) - parser.close() - except HTMLParseError as e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e + parser.feed(markup) + parser.close() parser.already_closed_empty_element = [] - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like

as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff --git a/lib/bs4/element.py b/lib/bs4/element.py index 74b1dc0f..583d0e8a 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -496,13 +496,16 @@ class PageElement(object): def extend(self, tags): """Appends the given PageElements to this one's contents. - :param tags: A list of PageElements. + :param tags: A list of PageElements. If a single Tag is + provided instead, this PageElement's contents will be extended + with that Tag's contents. """ if isinstance(tags, Tag): - # Calling self.append() on another tag's contents will change - # the list we're iterating over. Make a list that won't - # change. - tags = list(tags.contents) + tags = tags.contents + if isinstance(tags, list): + # Moving items around the tree may change their position in + # the original list. Make a list that won't change. 
+ tags = list(tags) for tag in tags: self.append(tag) @@ -586,8 +589,9 @@ class PageElement(object): :kwargs: A dictionary of filters on attribute values. :return: A ResultSet containing PageElements. """ + _stacklevel = kwargs.pop('_stacklevel', 2) return self._find_all(name, attrs, string, limit, self.next_elements, - **kwargs) + _stacklevel=_stacklevel+1, **kwargs) findAllNext = find_all_next # BS3 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): @@ -624,8 +628,11 @@ class PageElement(object): :return: A ResultSet of PageElements. :rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, - self.next_siblings, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.next_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 @@ -663,8 +670,11 @@ class PageElement(object): :return: A ResultSet of PageElements. :rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, self.previous_elements, - **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, self.previous_elements, + _stacklevel=_stacklevel+1, **kwargs + ) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 @@ -702,8 +712,11 @@ class PageElement(object): :return: A ResultSet of PageElements. :rtype: bs4.element.ResultSet """ - return self._find_all(name, attrs, string, limit, - self.previous_siblings, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 @@ -724,7 +737,7 @@ class PageElement(object): # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None - l = self.find_parents(name, attrs, 1, **kwargs) + l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) if l: r = l[0] return r @@ -744,8 +757,9 @@ class PageElement(object): :return: A PageElement. :rtype: bs4.element.Tag | bs4.element.NavigableString """ + _stacklevel = kwargs.pop('_stacklevel', 2) return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) + _stacklevel=_stacklevel+1, **kwargs) findParents = find_parents # BS3 fetchParents = find_parents # BS2 @@ -771,19 +785,20 @@ class PageElement(object): def _find_one(self, method, name, attrs, string, **kwargs): r = None - l = method(name, attrs, string, 1, **kwargs) + l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) if l: r = l[0] return r def _find_all(self, name, attrs, string, limit, generator, **kwargs): "Iterates over a generator looking for things that match." + _stacklevel = kwargs.pop('_stacklevel', 3) if string is None and 'text' in kwargs: string = kwargs.pop('text') warnings.warn( "The 'text' argument to find()-type methods is deprecated. 
Use 'string' instead.", - DeprecationWarning + DeprecationWarning, stacklevel=_stacklevel ) if isinstance(name, SoupStrainer): @@ -1306,7 +1321,8 @@ class Tag(PageElement): sourceline=self.sourceline, sourcepos=self.sourcepos, can_be_empty_element=self.can_be_empty_element, cdata_list_attributes=self.cdata_list_attributes, - preserve_whitespace_tags=self.preserve_whitespace_tags + preserve_whitespace_tags=self.preserve_whitespace_tags, + interesting_string_types=self.interesting_string_types ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) @@ -1558,7 +1574,7 @@ class Tag(PageElement): '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( name=tag_name ), - DeprecationWarning + DeprecationWarning, stacklevel=2 ) return self.find(tag_name) # We special case contents to avoid recursion. @@ -1862,7 +1878,8 @@ class Tag(PageElement): :rtype: bs4.element.Tag | bs4.element.NavigableString """ r = None - l = self.find_all(name, attrs, recursive, string, 1, **kwargs) + l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, + **kwargs) if l: r = l[0] return r @@ -1889,7 +1906,9 @@ class Tag(PageElement): generator = self.descendants if not recursive: generator = self.children - return self._find_all(name, attrs, string, limit, generator, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all(name, attrs, string, limit, generator, + _stacklevel=_stacklevel+1, **kwargs) findAll = find_all # BS3 findChildren = find_all # BS2 @@ -1993,7 +2012,7 @@ class Tag(PageElement): """ warnings.warn( 'has_key is deprecated. Use has_attr(key) instead.', - DeprecationWarning + DeprecationWarning, stacklevel=2 ) return self.has_attr(key) @@ -2024,7 +2043,7 @@ class SoupStrainer(object): string = kwargs.pop('text') warnings.warn( "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", - DeprecationWarning + DeprecationWarning, stacklevel=2 ) self.name = self._normalize_search_value(name) diff --git a/lib/bs4/formatter.py b/lib/bs4/formatter.py index 65e57b57..83cc1c5c 100644 --- a/lib/bs4/formatter.py +++ b/lib/bs4/formatter.py @@ -149,14 +149,14 @@ class HTMLFormatter(Formatter): """A generic Formatter for HTML.""" REGISTRY = {} def __init__(self, *args, **kwargs): - return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) + super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) class XMLFormatter(Formatter): """A generic Formatter for XML.""" REGISTRY = {} def __init__(self, *args, **kwargs): - return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) + super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) # Set up aliases for the default formatters. diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py index 4af4b0ce..f4d62db9 100644 --- a/lib/bs4/tests/__init__.py +++ b/lib/bs4/tests/__init__.py @@ -29,6 +29,29 @@ from bs4.builder import ( ) default_builder = HTMLParserTreeBuilder +# Some tests depend on specific third-party libraries. We use +# @pytest.mark.skipIf on the following conditionals to skip them +# if the libraries are not installed. 
+try: + from soupsieve import SelectorSyntaxError + SOUP_SIEVE_PRESENT = True +except ImportError: + SOUP_SIEVE_PRESENT = False + +try: + import html5lib + HTML5LIB_PRESENT = True +except ImportError: + HTML5LIB_PRESENT = False + +try: + import lxml.etree + LXML_PRESENT = True + LXML_VERSION = lxml.etree.LXML_VERSION +except ImportError: + LXML_PRESENT = False + LXML_VERSION = (0,) + BAD_DOCUMENT = """A bare string @@ -258,10 +281,10 @@ class TreeBuilderSmokeTest(object): @pytest.mark.parametrize( "multi_valued_attributes", - [None, dict(b=['class']), {'*': ['notclass']}] + [None, {}, dict(b=['class']), {'*': ['notclass']}] ) def test_attribute_not_multi_valued(self, multi_valued_attributes): - markup = '' + markup = '' soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes) assert soup.a['class'] == 'a b c' @@ -820,26 +843,27 @@ Hello, world! soup = self.soup(text) assert soup.p.encode("utf-8") == expected - def test_real_iso_latin_document(self): + def test_real_iso_8859_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. - # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + # Here it is in Unicode. Note that it claims to be in ISO-8859-1. + unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' - # That's because we're going to encode it into ISO-Latin-1, and use - # that to test. + # That's because we're going to encode it into ISO-8859-1, + # and use that to test. iso_latin_html = unicode_html.encode("iso-8859-1") - # Parse the ISO-Latin-1 HTML. + # Parse the ISO-8859-1 HTML. soup = self.soup(iso_latin_html) + # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say - # UTF-8 instead of ISO-Latin-1. - expected = unicode_html.replace("ISO-Latin-1", "utf-8") + # UTF-8 instead of ISO-8859-1. + expected = unicode_html.replace("ISO-8859-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") @@ -1177,15 +1201,3 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): assert isinstance(soup.contents[0], Comment) assert soup.contents[0] == '?xml version="1.0" encoding="utf-8"?' assert "html" == soup.contents[0].next_element.name - -def skipIf(condition, reason): - def nothing(test, *args, **kwargs): - return None - - def decorator(test_item): - if condition: - return nothing - else: - return test_item - - return decorator diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py index 5fa874c8..9327174f 100644 --- a/lib/bs4/tests/test_builder_registry.py +++ b/lib/bs4/tests/test_builder_registry.py @@ -10,22 +10,23 @@ from bs4.builder import ( TreeBuilderRegistry, ) -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError: - HTML5LIB_PRESENT = False +from . import ( + HTML5LIB_PRESENT, + LXML_PRESENT, +) -try: +if HTML5LIB_PRESENT: + from bs4.builder import HTML5TreeBuilder + +if LXML_PRESENT: from bs4.builder import ( LXMLTreeBuilderForXML, LXMLTreeBuilder, ) - LXML_PRESENT = True -except ImportError: - LXML_PRESENT = False +# TODO: Split out the lxml and html5lib tests into their own classes +# and gate with pytest.mark.skipIf. 
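
The conditional imports added to bs4/tests/__init__.py above give every test module one shared source of truth for which optional backends (soupsieve, html5lib, lxml) are installed, and the homegrown skipIf decorator removed at the bottom of that file is replaced by pytest.mark.skipif; the old helper simply swapped the decorated test for a no-op function, so a missing dependency produced a silent pass rather than a reported skip. A minimal sketch of the same pattern, using a placeholder dependency name ("somedep") that is not part of this patch:

import pytest

try:
    import somedep               # placeholder optional dependency, not a real requirement
    SOMEDEP_PRESENT = True
except ImportError:
    SOMEDEP_PRESENT = False

@pytest.mark.skipif(
    not SOMEDEP_PRESENT,
    reason="somedep seems not to be present, not testing it."
)
class TestSomeDep:
    def test_importable(self):
        # Only runs when the optional dependency is actually installed.
        import somedep
        assert somedep is not None

Run under pytest, the whole class is reported as skipped when somedep is missing, instead of being collected as an empty pass.
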
class TestBuiltInRegistry(object): """Test the built-in registry with the default builders registered.""" diff --git a/lib/bs4/tests/test_dammit.py b/lib/bs4/tests/test_dammit.py index 9971234e..9aad0ac6 100644 --- a/lib/bs4/tests/test_dammit.py +++ b/lib/bs4/tests/test_dammit.py @@ -17,26 +17,24 @@ class TestUnicodeDammit(object): dammit = UnicodeDammit(markup) assert dammit.unicode_markup == markup - def test_smart_quotes_to_unicode(self): + @pytest.mark.parametrize( + "smart_quotes_to,expect_converted", + [(None, "\u2018\u2019\u201c\u201d"), + ("xml", "‘’“”"), + ("html", "‘’“”"), + ("ascii", "''" + '""'), + ] + ) + def test_smart_quotes_to(self, smart_quotes_to, expect_converted): + """Verify the functionality of the smart_quotes_to argument + to the UnicodeDammit constructor.""" markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup) - assert dammit.unicode_markup == "\u2018\u2019\u201c\u201d" - - def test_smart_quotes_to_xml_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="xml") - assert dammit.unicode_markup == "‘’“”" - - def test_smart_quotes_to_html_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="html") - assert dammit.unicode_markup == "‘’“”" - - def test_smart_quotes_to_ascii(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="ascii") - assert dammit.unicode_markup == """''""""" - + converted = UnicodeDammit( + markup, known_definite_encodings=["windows-1252"], + smart_quotes_to=smart_quotes_to + ).unicode_markup + assert converted == "{}".format(expect_converted) + def test_detect_utf8(self): utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" dammit = UnicodeDammit(utf8) @@ -275,23 +273,24 @@ class TestEntitySubstitution(object): def setup_method(self): self.sub = EntitySubstitution - def test_simple_html_substitution(self): - # Unicode characters corresponding to named HTML entites - # are substituted, and no others. - s = "foo\u2200\N{SNOWMAN}\u00f5bar" - assert self.sub.substitute_html(s) == "foo∀\N{SNOWMAN}õbar" - def test_smart_quote_substitution(self): - # MS smart quotes are a common source of frustration, so we - # give them a special test. - quotes = b"\x91\x92foo\x93\x94" - dammit = UnicodeDammit(quotes) - assert self.sub.substitute_html(dammit.markup) == "‘’foo“”" + @pytest.mark.parametrize( + "original,substituted", + [ + # Basic case. Unicode characters corresponding to named + # HTML entites are substituted; others are not. + ("foo\u2200\N{SNOWMAN}\u00f5bar", + "foo∀\N{SNOWMAN}õbar"), + # MS smart quotes are a common source of frustration, so we + # give them a special test. + ('‘’foo“”', "‘’foo“”"), + ] + ) + def test_substitute_html(self, original, substituted): + assert self.sub.substitute_html(original) == substituted + def test_html5_entity(self): - # Some HTML5 entities correspond to single- or multi-character - # Unicode sequences. 
- for entity, u in ( # A few spot checks of our ability to recognize # special character sequences and convert them diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py index b32ab304..4197720f 100644 --- a/lib/bs4/tests/test_html5lib.py +++ b/lib/bs4/tests/test_html5lib.py @@ -1,27 +1,26 @@ """Tests to ensure that the html5lib tree builder generates good trees.""" +import pytest import warnings -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError as e: - HTML5LIB_PRESENT = False +from bs4 import BeautifulSoup from bs4.element import SoupStrainer from . import ( + HTML5LIB_PRESENT, HTML5TreeBuilderSmokeTest, SoupTest, - skipIf, ) -@skipIf( +@pytest.mark.skipif( not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing its tree builder.") + reason="html5lib seems not to be present, not testing its tree builder." +) class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest): """See ``HTML5TreeBuilderSmokeTest``.""" @property def default_builder(self): + from bs4.builder import HTML5TreeBuilder return HTML5TreeBuilder def test_soupstrainer(self): @@ -29,10 +28,12 @@ class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest): strainer = SoupStrainer("b") markup = "

<p>A <b>bold</b> statement.</p>
" with warnings.catch_warnings(record=True) as w: - soup = self.soup(markup, parse_only=strainer) + soup = BeautifulSoup(markup, "html5lib", parse_only=strainer) assert soup.decode() == self.document_for(markup) - assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message) + [warning] = w + assert warning.filename == __file__ + assert "the html5lib tree builder doesn't support parse_only" in str(warning.message) def test_correctly_nested_tables(self): """html5lib inserts tags where other parsers don't.""" diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py index bfcfa1f3..470d3936 100644 --- a/lib/bs4/tests/test_htmlparser.py +++ b/lib/bs4/tests/test_htmlparser.py @@ -122,15 +122,3 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): with_element = div.encode(formatter="html") expect = b"
<div>%s</div>
" % output_element assert with_element == expect - -class TestHTMLParserSubclass(SoupTest): - def test_error(self): - """Verify that our HTMLParser subclass implements error() in a way - that doesn't cause a crash. - """ - parser = BeautifulSoupHTMLParser() - with warnings.catch_warnings(record=True) as warns: - parser.error("don't crash") - [warning] = warns - assert "don't crash" == str(warning.message) - diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py index 396ca0ef..c7bf45d3 100644 --- a/lib/bs4/tests/test_lxml.py +++ b/lib/bs4/tests/test_lxml.py @@ -1,16 +1,10 @@ """Tests to ensure that the lxml tree builder generates good trees.""" import pickle +import pytest import re import warnings - -try: - import lxml.etree - LXML_PRESENT = True - LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError as e: - LXML_PRESENT = False - LXML_VERSION = (0,) +from . import LXML_PRESENT, LXML_VERSION if LXML_PRESENT: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML @@ -23,13 +17,14 @@ from bs4.element import Comment, Doctype, SoupStrainer from . import ( HTMLTreeBuilderSmokeTest, XMLTreeBuilderSmokeTest, + SOUP_SIEVE_PRESENT, SoupTest, - skipIf, ) -@skipIf( +@pytest.mark.skipif( not LXML_PRESENT, - "lxml seems not to be present, not testing its tree builder.") + reason="lxml seems not to be present, not testing its tree builder." +) class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @@ -54,9 +49,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. - @skipIf( + @pytest.mark.skipif( not LXML_PRESENT or LXML_VERSION < (2,3,5,0), - "Skipping doctype test for old version of lxml to avoid segfault.") + reason="Skipping doctype test for old version of lxml to avoid segfault." + ) def test_empty_doctype(self): soup = self.soup("") doctype = soup.contents[0] @@ -68,7 +64,9 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("") assert "" == str(soup.b) - assert "BeautifulStoneSoup class is deprecated" in str(w[0].message) + [warning] = w + assert warning.filename == __file__ + assert "BeautifulStoneSoup class is deprecated" in str(warning.message) def test_tracking_line_numbers(self): # The lxml TreeBuilder cannot keep track of line numbers from @@ -85,9 +83,10 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest): assert "sourceline" == soup.p.sourceline.name assert "sourcepos" == soup.p.sourcepos.name -@skipIf( +@pytest.mark.skipif( not LXML_PRESENT, - "lxml seems not to be present, not testing its XML tree builder.") + reason="lxml seems not to be present, not testing its XML tree builder." +) class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): """See ``HTMLTreeBuilderSmokeTest``.""" @@ -148,6 +147,9 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): } + @pytest.mark.skipif( + not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed" + ) def test_namespace_interaction_with_select_and_find(self): # Demonstrate how namespaces interact with select* and # find* methods. 
diff --git a/lib/bs4/tests/test_pageelement.py b/lib/bs4/tests/test_pageelement.py index 26783f2c..6674dadf 100644 --- a/lib/bs4/tests/test_pageelement.py +++ b/lib/bs4/tests/test_pageelement.py @@ -3,15 +3,18 @@ import copy import pickle import pytest -from soupsieve import SelectorSyntaxError - from bs4 import BeautifulSoup from bs4.element import ( Comment, SoupStrainer, ) -from . import SoupTest +from . import ( + SoupTest, + SOUP_SIEVE_PRESENT, +) +if SOUP_SIEVE_PRESENT: + from soupsieve import SelectorSyntaxError class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" @@ -213,6 +216,7 @@ class TestFormatters(SoupTest): assert soup.contents[0].name == 'pre' +@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed") class TestCSSSelectors(SoupTest): """Test basic CSS selector functionality. @@ -694,6 +698,7 @@ class TestPersistence(SoupTest): assert tag.can_be_empty_element == copied.can_be_empty_element assert tag.cdata_list_attributes == copied.cdata_list_attributes assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags + assert tag.interesting_string_types == copied.interesting_string_types def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py index 445f74da..64b8cf12 100644 --- a/lib/bs4/tests/test_soup.py +++ b/lib/bs4/tests/test_soup.py @@ -30,19 +30,11 @@ from bs4.element import ( from . import ( default_builder, + LXML_PRESENT, SoupTest, - skipIf, ) import warnings - -try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True -except ImportError as e: - LXML_PRESENT = False -PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) - class TestConstructor(SoupTest): def test_short_unicode_input(self): @@ -139,7 +131,7 @@ class TestConstructor(SoupTest): assert " an id " == a['id'] assert ["a", "class"] == a['class'] - # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets + # TreeBuilder takes an argument called 'multi_valued_attributes' which lets # you customize or disable this. As always, you can customize the TreeBuilder # by passing in a keyword argument to the BeautifulSoup constructor. soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None) @@ -219,10 +211,17 @@ class TestConstructor(SoupTest): class TestWarnings(SoupTest): + # Note that some of the tests in this class create BeautifulSoup + # objects directly rather than using self.soup(). That's + # because SoupTest.soup is defined in a different file, + # which will throw off the assertion in _assert_warning + # that the code that triggered the warning is in the same + # file as the test. def _assert_warning(self, warnings, cls): for w in warnings: if isinstance(w.message, cls): + assert w.filename == __file__ return w raise Exception("%s warning not found in %r" % (cls, warnings)) @@ -243,13 +242,17 @@ class TestWarnings(SoupTest): def test_no_warning_if_explicit_parser_specified(self): with warnings.catch_warnings(record=True) as w: - soup = BeautifulSoup("
", "html.parser") + soup = self.soup("") assert [] == w def test_parseOnlyThese_renamed_to_parse_only(self): with warnings.catch_warnings(record=True) as w: - soup = self.soup("", parseOnlyThese=SoupStrainer("b")) - msg = str(w[0].message) + soup = BeautifulSoup( + "", "html.parser", + parseOnlyThese=SoupStrainer("b"), + ) + warning = self._assert_warning(w, DeprecationWarning) + msg = str(warning.message) assert "parseOnlyThese" in msg assert "parse_only" in msg assert b"" == soup.encode() @@ -257,8 +260,11 @@ class TestWarnings(SoupTest): def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" - soup = self.soup(utf8, fromEncoding="utf8") - msg = str(w[0].message) + soup = BeautifulSoup( + utf8, "html.parser", fromEncoding="utf8" + ) + warning = self._assert_warning(w, DeprecationWarning) + msg = str(warning.message) assert "fromEncoding" in msg assert "from_encoding" in msg assert "utf8" == soup.original_encoding @@ -276,7 +282,7 @@ class TestWarnings(SoupTest): # A warning is issued if the "markup" looks like the name of # an HTML or text file, or a full path to a file on disk. with warnings.catch_warnings(record=True) as w: - soup = self.soup("markup" + extension) + soup = BeautifulSoup("markup" + extension, "html.parser") warning = self._assert_warning(w, MarkupResemblesLocatorWarning) assert "looks more like a filename" in str(warning.message) @@ -291,11 +297,11 @@ class TestWarnings(SoupTest): with warnings.catch_warnings(record=True) as w: soup = self.soup("markup" + extension) assert [] == w - + def test_url_warning_with_bytes_url(self): url = b"http://www.crummybytes.com/" with warnings.catch_warnings(record=True) as warning_list: - soup = self.soup(url) + soup = BeautifulSoup(url, "html.parser") warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) @@ -307,7 +313,7 @@ class TestWarnings(SoupTest): with warnings.catch_warnings(record=True) as warning_list: # note - this url must differ from the bytes one otherwise # python's warnings system swallows the second warning - soup = self.soup(url) + soup = BeautifulSoup(url, "html.parser") warning = self._assert_warning( warning_list, MarkupResemblesLocatorWarning ) @@ -347,18 +353,22 @@ class TestNewTag(SoupTest): assert "foo" == new_tag.name assert dict(bar="baz", name="a name") == new_tag.attrs assert None == new_tag.parent - + + @pytest.mark.skipif( + not LXML_PRESENT, + reason="lxml not installed, cannot parse XML document" + ) + def test_xml_tag_inherits_self_closing_rules_from_builder(self): + xml_soup = BeautifulSoup("", "xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the
<br> and <p>
tag are empty-element, just because + # they have no contents. + assert b"<br/>" == xml_br.encode() + assert b"<p/>
" == xml_p.encode() + def test_tag_inherits_self_closing_rules_from_builder(self): - if LXML_PRESENT: - xml_soup = BeautifulSoup("", "lxml-xml") - xml_br = xml_soup.new_tag("br") - xml_p = xml_soup.new_tag("p") - - # Both the
<br> and <p>
tag are empty-element, just because - # they have no contents. - assert b"<br/>" == xml_br.encode() - assert b"<p/>
" == xml_p.encode() - html_soup = BeautifulSoup("", "html.parser") html_br = html_soup.new_tag("br") html_p = html_soup.new_tag("p") @@ -450,13 +460,3 @@ class TestEncodingConversion(SoupTest): # The internal data structures can be encoded as UTF-8. soup_from_unicode = self.soup(self.unicode_data) assert soup_from_unicode.encode('utf-8') == self.utf8_data - - @skipIf( - PYTHON_3_PRE_3_2, - "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") - def test_attribute_name_containing_unicode_characters(self): - markup = '

' - assert self.soup(markup).div.encode("utf8") == markup.encode("utf8") - - - diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py index bfd6826e..26995f95 100644 --- a/lib/bs4/tests/test_tree.py +++ b/lib/bs4/tests/test_tree.py @@ -33,7 +33,6 @@ from bs4.element import ( ) from . import ( SoupTest, - skipIf, ) class TestFind(SoupTest): @@ -910,12 +909,16 @@ class TestTreeModification(SoupTest): soup.a.extend(l) assert "" == soup.decode() - def test_extend_with_another_tags_contents(self): + @pytest.mark.parametrize( + "get_tags", [lambda tag: tag, lambda tag: tag.contents] + ) + def test_extend_with_another_tags_contents(self, get_tags): data = '
' soup = self.soup(data) d1 = soup.find('div', id='d1') d2 = soup.find('div', id='d2') - d2.extend(d1) + tags = get_tags(d1) + d2.extend(tags) assert '
' == d1.decode() assert '' == d2.decode() @@ -1272,19 +1275,30 @@ class TestTreeModification(SoupTest): class TestDeprecatedArguments(SoupTest): - def test_find_type_method_string(self): + @pytest.mark.parametrize( + "method_name", [ + "find", "find_all", "find_parent", "find_parents", + "find_next", "find_all_next", "find_previous", + "find_all_previous", "find_next_sibling", "find_next_siblings", + "find_previous_sibling", "find_previous_siblings", + ] + ) + def test_find_type_method_string(self, method_name): soup = self.soup("somemarkup") + method = getattr(soup.b, method_name) with warnings.catch_warnings(record=True) as w: - [result] = soup.find_all(text='markup') - assert result == 'markup' - assert result.parent.name == 'b' - msg = str(w[0].message) + method(text='markup') + [warning] = w + assert warning.filename == __file__ + msg = str(warning.message) assert msg == "The 'text' argument to find()-type methods is deprecated. Use 'string' instead." def test_soupstrainer_constructor_string(self): with warnings.catch_warnings(record=True) as w: strainer = SoupStrainer(text="text") assert strainer.text == 'text' - msg = str(w[0].message) + [warning] = w + msg = str(warning.message) + assert warning.filename == __file__ assert msg == "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead." diff --git a/lib/soupsieve/__init__.py b/lib/soupsieve/__init__.py index c89b7002..591a4f4f 100644 --- a/lib/soupsieve/__init__.py +++ b/lib/soupsieve/__init__.py @@ -25,13 +25,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from __future__ import annotations from .__meta__ import __version__, __version_info__ # noqa: F401 from . import css_parser as cp from . import css_match as cm from . 
import css_types as ct from .util import DEBUG, SelectorSyntaxError # noqa: F401 import bs4 # type: ignore[import] -from typing import Dict, Optional, Any, List, Iterator, Iterable +from typing import Optional, Any, Iterator, Iterable __all__ = ( 'DEBUG', 'SelectorSyntaxError', 'SoupSieve', @@ -44,17 +45,14 @@ SoupSieve = cm.SoupSieve def compile( # noqa: A001 pattern: str, - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any ) -> cm.SoupSieve: """Compile CSS pattern.""" - ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces] - cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors] - if isinstance(pattern, SoupSieve): if flags: raise ValueError("Cannot process 'flags' argument on a compiled selector list") @@ -64,7 +62,12 @@ def compile( # noqa: A001 raise ValueError("Cannot process 'custom' argument on a compiled selector list") return pattern - return cp._cached_css_compile(pattern, ns, cs, flags) + return cp._cached_css_compile( + pattern, + ct.Namespaces(namespaces) if namespaces is not None else namespaces, + ct.CustomSelectors(custom) if custom is not None else custom, + flags + ) def purge() -> None: @@ -76,10 +79,10 @@ def purge() -> None: def closest( select: str, tag: 'bs4.Tag', - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any ) -> 'bs4.Tag': """Match closest ancestor.""" @@ -90,10 +93,10 @@ def closest( def match( select: str, tag: 'bs4.Tag', - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any ) -> bool: """Match node.""" @@ -104,12 +107,12 @@ def match( def filter( # noqa: A001 select: str, iterable: Iterable['bs4.Tag'], - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any -) -> List['bs4.Tag']: +) -> list['bs4.Tag']: """Filter list of nodes.""" return compile(select, namespaces, flags, **kwargs).filter(iterable) @@ -118,10 +121,10 @@ def filter( # noqa: A001 def select_one( select: str, tag: 'bs4.Tag', - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any ) -> 'bs4.Tag': """Select a single tag.""" @@ -132,13 +135,13 @@ def select_one( def select( select: str, tag: 'bs4.Tag', - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, limit: int = 0, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + custom: Optional[dict[str, str]] = None, **kwargs: Any -) -> List['bs4.Tag']: +) -> list['bs4.Tag']: """Select the specified tags.""" return compile(select, namespaces, flags, **kwargs).select(tag, limit) @@ -147,11 +150,11 @@ def select( def iselect( select: str, tag: 'bs4.Tag', - namespaces: Optional[Dict[str, str]] = None, + namespaces: Optional[dict[str, str]] = None, limit: int = 0, flags: int = 0, *, - custom: Optional[Dict[str, str]] = None, + 
custom: Optional[dict[str, str]] = None, **kwargs: Any ) -> Iterator['bs4.Tag']: """Iterate the specified tags.""" diff --git a/lib/soupsieve/__meta__.py b/lib/soupsieve/__meta__.py index 3bd6607f..5369314e 100644 --- a/lib/soupsieve/__meta__.py +++ b/lib/soupsieve/__meta__.py @@ -1,4 +1,5 @@ """Meta related things.""" +from __future__ import annotations from collections import namedtuple import re @@ -83,7 +84,7 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre" cls, major: int, minor: int, micro: int, release: str = "final", pre: int = 0, post: int = 0, dev: int = 0 - ) -> "Version": + ) -> Version: """Validate version info.""" # Ensure all parts are positive integers. @@ -192,5 +193,5 @@ def parse_version(ver: str) -> Version: return Version(major, minor, micro, release, pre, post, dev) -__version_info__ = Version(2, 3, 2, "final", post=1) +__version_info__ = Version(2, 4, 0, "final") __version__ = __version_info__._get_canonical() diff --git a/lib/soupsieve/css_match.py b/lib/soupsieve/css_match.py index 49e5f070..65752829 100644 --- a/lib/soupsieve/css_match.py +++ b/lib/soupsieve/css_match.py @@ -1,11 +1,12 @@ """CSS matcher.""" +from __future__ import annotations from datetime import datetime from . import util import re from . import css_types as ct import unicodedata import bs4 # type: ignore[import] -from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast +from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401 # Empty tag pattern (whitespace okay) RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') @@ -64,12 +65,12 @@ class _FakeParent: fake parent so we can traverse the root element as a child. """ - def __init__(self, element: 'bs4.Tag') -> None: + def __init__(self, element: bs4.Tag) -> None: """Initialize.""" self.contents = [element] - def __len__(self) -> 'bs4.PageElement': + def __len__(self) -> bs4.PageElement: """Length.""" return len(self.contents) @@ -87,59 +88,59 @@ class _DocumentNav: raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag))) @staticmethod - def is_doc(obj: 'bs4.Tag') -> bool: + def is_doc(obj: bs4.Tag) -> bool: """Is `BeautifulSoup` object.""" return isinstance(obj, bs4.BeautifulSoup) @staticmethod - def is_tag(obj: 'bs4.PageElement') -> bool: + def is_tag(obj: bs4.PageElement) -> bool: """Is tag.""" return isinstance(obj, bs4.Tag) @staticmethod - def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover + def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover """Is declaration.""" return isinstance(obj, bs4.Declaration) @staticmethod - def is_cdata(obj: 'bs4.PageElement') -> bool: + def is_cdata(obj: bs4.PageElement) -> bool: """Is CDATA.""" return isinstance(obj, bs4.CData) @staticmethod - def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover + def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover """Is processing instruction.""" return isinstance(obj, bs4.ProcessingInstruction) @staticmethod - def is_navigable_string(obj: 'bs4.PageElement') -> bool: + def is_navigable_string(obj: bs4.PageElement) -> bool: """Is navigable string.""" return isinstance(obj, bs4.NavigableString) @staticmethod - def is_special_string(obj: 'bs4.PageElement') -> bool: + def is_special_string(obj: bs4.PageElement) -> bool: """Is special string.""" return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, 
bs4.Doctype)) @classmethod - def is_content_string(cls, obj: 'bs4.PageElement') -> bool: + def is_content_string(cls, obj: bs4.PageElement) -> bool: """Check if node is content string.""" return cls.is_navigable_string(obj) and not cls.is_special_string(obj) @staticmethod - def create_fake_parent(el: 'bs4.Tag') -> _FakeParent: + def create_fake_parent(el: bs4.Tag) -> _FakeParent: """Create fake parent for a given element.""" return _FakeParent(el) @staticmethod - def is_xml_tree(el: 'bs4.Tag') -> bool: + def is_xml_tree(el: bs4.Tag) -> bool: """Check if element (or document) is from a XML tree.""" return bool(el._is_xml) - def is_iframe(self, el: 'bs4.Tag') -> bool: + def is_iframe(self, el: bs4.Tag) -> bool: """Check if element is an `iframe`.""" return bool( @@ -147,7 +148,7 @@ class _DocumentNav: self.is_html_tag(el) # type: ignore[attr-defined] ) - def is_root(self, el: 'bs4.Tag') -> bool: + def is_root(self, el: bs4.Tag) -> bool: """ Return whether element is a root element. @@ -161,7 +162,7 @@ class _DocumentNav: root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] return root - def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']: + def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]: """Get contents or contents in reverse.""" if not no_iframe or not self.is_iframe(el): for content in el.contents: @@ -169,12 +170,12 @@ class _DocumentNav: def get_children( self, - el: 'bs4.Tag', + el: bs4.Tag, start: Optional[int] = None, reverse: bool = False, tags: bool = True, no_iframe: bool = False - ) -> Iterator['bs4.PageElement']: + ) -> Iterator[bs4.PageElement]: """Get children.""" if not no_iframe or not self.is_iframe(el): @@ -195,10 +196,10 @@ class _DocumentNav: def get_descendants( self, - el: 'bs4.Tag', + el: bs4.Tag, tags: bool = True, no_iframe: bool = False - ) -> Iterator['bs4.PageElement']: + ) -> Iterator[bs4.PageElement]: """Get descendants.""" if not no_iframe or not self.is_iframe(el): @@ -229,7 +230,7 @@ class _DocumentNav: if not tags or is_tag: yield child - def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag': + def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag: """Get parent.""" parent = el.parent @@ -238,25 +239,25 @@ class _DocumentNav: return parent @staticmethod - def get_tag_name(el: 'bs4.Tag') -> Optional[str]: + def get_tag_name(el: bs4.Tag) -> Optional[str]: """Get tag.""" return cast(Optional[str], el.name) @staticmethod - def get_prefix_name(el: 'bs4.Tag') -> Optional[str]: + def get_prefix_name(el: bs4.Tag) -> Optional[str]: """Get prefix.""" return cast(Optional[str], el.prefix) @staticmethod - def get_uri(el: 'bs4.Tag') -> Optional[str]: + def get_uri(el: bs4.Tag) -> Optional[str]: """Get namespace `URI`.""" return cast(Optional[str], el.namespace) @classmethod - def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': + def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get next sibling tag.""" sibling = el.next_sibling @@ -265,7 +266,7 @@ class _DocumentNav: return sibling @classmethod - def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement': + def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get previous sibling tag.""" sibling = el.previous_sibling @@ -274,7 +275,7 @@ class _DocumentNav: return sibling @staticmethod - def has_html_ns(el: 'bs4.Tag') -> bool: + def has_html_ns(el: bs4.Tag) -> bool: """ 
Check if element has an HTML namespace.
@@ -286,13 +287,13 @@ class _DocumentNav:
         return bool(ns and ns == NS_XHTML)

     @staticmethod
-    def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]:
+    def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]:
         """Return namespace and attribute name without the prefix."""

         return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

     @classmethod
-    def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]:
+    def normalize_value(cls, value: Any) -> str | Sequence[str]:
         """Normalize the value to be a string or list of strings."""

         # Treat `None` as empty string.
@@ -327,10 +328,10 @@ class _DocumentNav:
     @classmethod
     def get_attribute_by_name(
         cls,
-        el: 'bs4.Tag',
+        el: bs4.Tag,
         name: str,
-        default: Optional[Union[str, Sequence[str]]] = None
-    ) -> Optional[Union[str, Sequence[str]]]:
+        default: Optional[str | Sequence[str]] = None
+    ) -> Optional[str | Sequence[str]]:
         """Get attribute by name."""

         value = default
@@ -347,14 +348,14 @@ class _DocumentNav:
         return value

     @classmethod
-    def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]:
+    def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]:
         """Iterate attributes."""

         for k, v in el.attrs.items():
             yield k, cls.normalize_value(v)

     @classmethod
-    def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]:
+    def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
         """Get classes."""

         classes = cls.get_attribute_by_name(el, 'class', [])
@@ -362,14 +363,14 @@ class _DocumentNav:
             classes = RE_NOT_WS.findall(classes)
         return cast(Sequence[str], classes)

-    def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str:
+    def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
         """Get text."""

         return ''.join(
             [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
         )

-    def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]:
+    def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
         """Get Own Text."""

         return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
@@ -423,10 +424,10 @@ class Inputs:
         return 0 <= minutes <= 59

     @classmethod
-    def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
+    def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]:
         """Parse the input value."""

-        parsed = None  # type: Optional[Tuple[float, ...]]
+        parsed = None  # type: Optional[tuple[float, ...]]
         if value is None:
             return value
         if itype == "date":
@@ -484,7 +485,7 @@ class CSSMatch(_DocumentNav):
     def __init__(
         self,
         selectors: ct.SelectorList,
-        scope: 'bs4.Tag',
+        scope: bs4.Tag,
         namespaces: Optional[ct.Namespaces],
         flags: int
     ) -> None:
@@ -492,11 +493,11 @@ class CSSMatch(_DocumentNav):

         self.assert_valid_input(scope)
         self.tag = scope
-        self.cached_meta_lang = []  # type: List[Tuple[str, str]]
-        self.cached_default_forms = []  # type: List[Tuple['bs4.Tag', 'bs4.Tag']]
-        self.cached_indeterminate_forms = []  # type: List[Tuple['bs4.Tag', str, bool]]
+        self.cached_meta_lang = []  # type: list[tuple[str, str]]
+        self.cached_default_forms = []  # type: list[tuple[bs4.Tag, bs4.Tag]]
+        self.cached_indeterminate_forms = []  # type: list[tuple[bs4.Tag, str, bool]]
         self.selectors = selectors
-        self.namespaces = {} if namespaces is None else namespaces  # type: Union[ct.Namespaces, Dict[str, str]]
+        self.namespaces = {} if namespaces is None else namespaces  # type: ct.Namespaces | dict[str, str]
         self.flags = flags
         self.iframe_restrict = False

@@ -527,7 +528,7 @@ class CSSMatch(_DocumentNav):

         return self.is_xml or self.has_html_namespace

-    def get_tag_ns(self, el: 'bs4.Tag') -> str:
+    def get_tag_ns(self, el: bs4.Tag) -> str:
         """Get tag namespace."""

         if self.supports_namespaces():
@@ -539,24 +540,24 @@ class CSSMatch(_DocumentNav):
                 namespace = NS_XHTML
         return namespace

-    def is_html_tag(self, el: 'bs4.Tag') -> bool:
+    def is_html_tag(self, el: bs4.Tag) -> bool:
         """Check if tag is in HTML namespace."""

         return self.get_tag_ns(el) == NS_XHTML

-    def get_tag(self, el: 'bs4.Tag') -> Optional[str]:
+    def get_tag(self, el: bs4.Tag) -> Optional[str]:
         """Get tag."""

         name = self.get_tag_name(el)
         return util.lower(name) if name is not None and not self.is_xml else name

-    def get_prefix(self, el: 'bs4.Tag') -> Optional[str]:
+    def get_prefix(self, el: bs4.Tag) -> Optional[str]:
         """Get prefix."""

         prefix = self.get_prefix_name(el)
         return util.lower(prefix) if prefix is not None and not self.is_xml else prefix

-    def find_bidi(self, el: 'bs4.Tag') -> Optional[int]:
+    def find_bidi(self, el: bs4.Tag) -> Optional[int]:
         """Get directionality from element text."""

         for node in self.get_children(el, tags=False):
@@ -600,13 +601,18 @@ class CSSMatch(_DocumentNav):
         ranges = lang_range.split('-')
         subtags = lang_tag.lower().split('-')
         length = len(ranges)
+        slength = len(subtags)
         rindex = 0
         sindex = 0
         r = ranges[rindex]
         s = subtags[sindex]

+        # Empty specified language should match unspecified language attributes
+        if length == 1 and slength == 1 and not r and r == s:
+            return True
+
         # Primary tag needs to match
-        if r != '*' and r != s:
+        if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
             match = False

         rindex += 1
@@ -645,10 +651,10 @@ class CSSMatch(_DocumentNav):

     def match_attribute_name(
         self,
-        el: 'bs4.Tag',
+        el: bs4.Tag,
         attr: str,
         prefix: Optional[str]
-    ) -> Optional[Union[str, Sequence[str]]]:
+    ) -> Optional[str | Sequence[str]]:
         """Match attribute name and return value if it exists."""

         value = None
@@ -696,7 +702,7 @@ class CSSMatch(_DocumentNav):
                     break
         return value

-    def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
+    def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
         """Match the namespace of the element."""

         match = True
@@ -717,7 +723,7 @@ class CSSMatch(_DocumentNav):
                 match = False
         return match

-    def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
+    def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
         """Match attributes."""

         match = True
@@ -736,7 +742,7 @@ class CSSMatch(_DocumentNav):
                     break
         return match

-    def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
+    def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
         """Match tag name."""

         name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
@@ -745,7 +751,7 @@ class CSSMatch(_DocumentNav):
             name not in (self.get_tag(el), '*')
         )

-    def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool:
+    def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool:
         """Match the tag."""

         match = True
@@ -757,7 +763,7 @@ class CSSMatch(_DocumentNav):
                 match = False
         return match

-    def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+    def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
         """Match past relationship."""

         found = False
@@ -785,12 +791,12 @@ class CSSMatch(_DocumentNav):
                 found = self.match_selectors(sibling, relation)
         return found

-    def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
+    def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
         """Match future child."""

         match = False
         if recursive:
-            children = self.get_descendants  # type: Callable[..., Iterator['bs4.Tag']]
+            children = self.get_descendants  # type: Callable[..., Iterator[bs4.Tag]]
         else:
             children = self.get_children
         for child in children(parent, no_iframe=self.iframe_restrict):
@@ -799,7 +805,7 @@ class CSSMatch(_DocumentNav):
                 break
         return match

-    def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+    def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
         """Match future relationship."""

         found = False
@@ -822,7 +828,7 @@ class CSSMatch(_DocumentNav):
             found = self.match_selectors(sibling, relation)
         return found

-    def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
+    def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
         """Match relationship to other elements."""

         found = False
@@ -837,7 +843,7 @@ class CSSMatch(_DocumentNav):

         return found

-    def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
+    def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
         """Match element's ID."""

         found = True
@@ -847,7 +853,7 @@ class CSSMatch(_DocumentNav):
                 break
         return found

-    def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool:
+    def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
         """Match element's classes."""

         current_classes = self.get_classes(el)
@@ -858,7 +864,7 @@ class CSSMatch(_DocumentNav):
                 break
         return found

-    def match_root(self, el: 'bs4.Tag') -> bool:
+    def match_root(self, el: bs4.Tag) -> bool:
         """Match element as root."""

         is_root = self.is_root(el)
@@ -884,20 +890,20 @@ class CSSMatch(_DocumentNav):
                 sibling = self.get_next(sibling, tags=False)
         return is_root

-    def match_scope(self, el: 'bs4.Tag') -> bool:
+    def match_scope(self, el: bs4.Tag) -> bool:
         """Match element as scope."""

         return self.scope is el

-    def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool:
+    def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
         """Match tag type for `nth` matches."""

-        return(
+        return (
             (self.get_tag(child) == self.get_tag(el)) and
             (self.get_tag_ns(child) == self.get_tag_ns(el))
         )

-    def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
+    def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
         """Match `nth` elements."""

         matched = True
@@ -998,7 +1004,7 @@ class CSSMatch(_DocumentNav):
                     break
         return matched

-    def match_empty(self, el: 'bs4.Tag') -> bool:
+    def match_empty(self, el: bs4.Tag) -> bool:
         """Check if element is empty (if requested)."""

         is_empty = True
@@ -1011,7 +1017,7 @@ class CSSMatch(_DocumentNav):
                 break
         return is_empty

-    def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool:
+    def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
         """Match selectors."""

         match = True
@@ -1020,11 +1026,11 @@ class CSSMatch(_DocumentNav):
                 match = False
         return match

-    def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool:
+    def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
         """Match element if it contains text."""

         match = True
-        content = None  # type: Optional[Union[str, Sequence[str]]]
+        content = None  # type: Optional[str | Sequence[str]]
         for contain_list in contains:
             if content is None:
                 if contain_list.own:
@@ -1048,7 +1054,7 @@ class CSSMatch(_DocumentNav):
                 match = False
         return match

-    def match_default(self, el: 'bs4.Tag') -> bool:
+    def match_default(self, el: bs4.Tag) -> bool:
         """Match default."""

         match = False
@@ -1087,13 +1093,13 @@ class CSSMatch(_DocumentNav):
                     break
         return match

-    def match_indeterminate(self, el: 'bs4.Tag') -> bool:
+    def match_indeterminate(self, el: bs4.Tag) -> bool:
         """Match default."""

         match = False
         name = cast(str, self.get_attribute_by_name(el, 'name'))

-        def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']:
+        def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]:
             """Find this input's form."""
             form = None
             parent = self.get_parent(el, no_iframe=True)
@@ -1148,7 +1154,7 @@ class CSSMatch(_DocumentNav):

         return match

-    def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool:
+    def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
         """Match languages."""

         match = False
@@ -1183,7 +1189,7 @@ class CSSMatch(_DocumentNav):
                 break

         # Use cached meta language.
-        if not found_lang and self.cached_meta_lang:
+        if found_lang is None and self.cached_meta_lang:
             for cache in self.cached_meta_lang:
                 if root is cache[0]:
                     found_lang = cache[1]
@@ -1217,13 +1223,13 @@ class CSSMatch(_DocumentNav):
                             found_lang = content
                             self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                             break
-                if found_lang:
+                if found_lang is not None:
                     break
-            if not found_lang:
+            if found_lang is None:
                 self.cached_meta_lang.append((cast(str, root), ''))

         # If we determined a language, compare.
-        if found_lang:
+        if found_lang is not None:
             for patterns in langs:
                 match = False
                 for pattern in patterns:
@@ -1234,7 +1240,7 @@ class CSSMatch(_DocumentNav):

         return match

-    def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool:
+    def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
         """Check directionality."""

         # If we have to match both left and right, we can't match either.
@@ -1297,7 +1303,7 @@ class CSSMatch(_DocumentNav):
         # Match parents direction
         return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

-    def match_range(self, el: 'bs4.Tag', condition: int) -> bool:
+    def match_range(self, el: bs4.Tag, condition: int) -> bool:
         """
         Match range.
@@ -1337,7 +1343,7 @@ class CSSMatch(_DocumentNav):

         return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range

-    def match_defined(self, el: 'bs4.Tag') -> bool:
+    def match_defined(self, el: bs4.Tag) -> bool:
         """
         Match defined.
@@ -1360,7 +1366,7 @@ class CSSMatch(_DocumentNav):
             )
         )

-    def match_placeholder_shown(self, el: 'bs4.Tag') -> bool:
+    def match_placeholder_shown(self, el: bs4.Tag) -> bool:
         """
         Match placeholder shown according to HTML spec.
@@ -1375,7 +1381,7 @@ class CSSMatch(_DocumentNav):

         return match

-    def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool:
+    def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
         """Check if element matches one of the selectors."""

         match = False
@@ -1459,7 +1465,7 @@ class CSSMatch(_DocumentNav):

         return match

-    def select(self, limit: int = 0) -> Iterator['bs4.Tag']:
+    def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
         """Match all tags under the targeted tag."""

         lim = None if limit < 1 else limit
@@ -1472,7 +1478,7 @@ class CSSMatch(_DocumentNav):
                     if lim < 1:
                         break

-    def closest(self) -> Optional['bs4.Tag']:
+    def closest(self) -> Optional[bs4.Tag]:
         """Match closest ancestor."""

         current = self.tag
@@ -1484,12 +1490,12 @@ class CSSMatch(_DocumentNav):
             current = self.get_parent(current)
         return closest

-    def filter(self) -> List['bs4.Tag']:  # noqa A001
+    def filter(self) -> list[bs4.Tag]:  # noqa A001
         """Filter tag's children."""

         return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]

-    def match(self, el: 'bs4.Tag') -> bool:
+    def match(self, el: bs4.Tag) -> bool:
         """Match."""

         return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
@@ -1501,7 +1507,7 @@ class SoupSieve(ct.Immutable):
     pattern: str
     selectors: ct.SelectorList
     namespaces: Optional[ct.Namespaces]
-    custom: Dict[str, str]
+    custom: dict[str, str]
     flags: int

     __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
@@ -1524,17 +1530,17 @@ class SoupSieve(ct.Immutable):
             flags=flags
         )

-    def match(self, tag: 'bs4.Tag') -> bool:
+    def match(self, tag: bs4.Tag) -> bool:
         """Match."""

         return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

-    def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag':
+    def closest(self, tag: bs4.Tag) -> bs4.Tag:
         """Match closest ancestor."""

         return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

-    def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']:  # noqa A001
+    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
         """
         Filter.
@@ -1551,18 +1557,18 @@ class SoupSieve(ct.Immutable):
         else:
             return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

-    def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag':
+    def select_one(self, tag: bs4.Tag) -> bs4.Tag:
         """Select a single tag."""

         tags = self.select(tag, limit=1)
         return tags[0] if tags else None

-    def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']:
+    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
         """Select the specified tags."""

         return list(self.iselect(tag, limit))

-    def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']:
+    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
         """Iterate the specified tags."""

         for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
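
The two `:lang()` hunks above (`extended_language_filter` and `match_lang`) change how an empty or bare-wildcard language range is matched and keep the cached "no language found" result (an empty string) distinct from "not yet determined" (`None`). A rough usage sketch of the intended behaviour, assuming soupsieve with these changes and Beautiful Soup's `html.parser`; the markup and expected matches are illustrative, not taken from the patch:

    from bs4 import BeautifulSoup
    import soupsieve as sv

    soup = BeautifulSoup(
        '<div lang="en">declared</div><div lang="">empty</div><div>undeclared</div>',
        'html.parser'
    )

    # Per the in-diff comment, an empty range such as :lang("") should now be able to
    # match elements whose language is empty or undetermined, while a bare "*" should
    # no longer match an element with no determinable language.
    print(sv.select(':lang("")', soup))
    print(sv.select(':lang("*")', soup))
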
diff --git a/lib/soupsieve/css_parser.py b/lib/soupsieve/css_parser.py
index d77084d4..4b8db186 100644
--- a/lib/soupsieve/css_parser.py
+++ b/lib/soupsieve/css_parser.py
@@ -1,4 +1,5 @@
 """CSS selector parser."""
+from __future__ import annotations
 import re
 from functools import lru_cache
 from . import util
@@ -6,7 +7,7 @@
 from . import css_match as cm
 from . import css_types as ct
 from .util import SelectorSyntaxError
 import warnings
-from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast
+from typing import Optional, Match, Any, Iterator, cast

 UNICODE_REPLACEMENT_CHAR = 0xFFFD
@@ -232,7 +233,7 @@ def _purge_cache() -> None:
     _cached_css_compile.cache_clear()


-def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]:
+def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
     """Process custom."""

     custom_selectors = {}
@@ -325,7 +326,7 @@ class SelectorPattern:
 class SpecialPseudoPattern(SelectorPattern):
     """Selector pattern."""

-    def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None:
+    def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
         """Initialize."""

         self.patterns = {}
@@ -372,19 +373,19 @@ class _Selector:
         """Initialize."""

         self.tag = kwargs.get('tag', None)  # type: Optional[ct.SelectorTag]
-        self.ids = kwargs.get('ids', [])  # type: List[str]
-        self.classes = kwargs.get('classes', [])  # type: List[str]
-        self.attributes = kwargs.get('attributes', [])  # type: List[ct.SelectorAttribute]
-        self.nth = kwargs.get('nth', [])  # type: List[ct.SelectorNth]
-        self.selectors = kwargs.get('selectors', [])  # type: List[ct.SelectorList]
-        self.relations = kwargs.get('relations', [])  # type: List[_Selector]
+        self.ids = kwargs.get('ids', [])  # type: list[str]
+        self.classes = kwargs.get('classes', [])  # type: list[str]
+        self.attributes = kwargs.get('attributes', [])  # type: list[ct.SelectorAttribute]
+        self.nth = kwargs.get('nth', [])  # type: list[ct.SelectorNth]
+        self.selectors = kwargs.get('selectors', [])  # type: list[ct.SelectorList]
+        self.relations = kwargs.get('relations', [])  # type: list[_Selector]
         self.rel_type = kwargs.get('rel_type', None)  # type: Optional[str]
-        self.contains = kwargs.get('contains', [])  # type: List[ct.SelectorContains]
-        self.lang = kwargs.get('lang', [])  # type: List[ct.SelectorLang]
+        self.contains = kwargs.get('contains', [])  # type: list[ct.SelectorContains]
+        self.lang = kwargs.get('lang', [])  # type: list[ct.SelectorLang]
         self.flags = kwargs.get('flags', 0)  # type: int
         self.no_match = kwargs.get('no_match', False)  # type: bool

-    def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList:
+    def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
         """Freeze relation."""

         if relations:
@@ -394,7 +395,7 @@ class _Selector:
         else:
             return ct.SelectorList()

-    def freeze(self) -> Union[ct.Selector, ct.SelectorNull]:
+    def freeze(self) -> ct.Selector | ct.SelectorNull:
         """Freeze self."""

         if self.no_match:
@@ -461,7 +462,7 @@ class CSSParser:
     def __init__(
         self,
         selector: str,
-        custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None,
+        custom: Optional[dict[str, str | ct.SelectorList]] = None,
         flags: int = 0
     ) -> None:
         """Initialize."""
@@ -583,9 +584,9 @@ class CSSParser:
         sel: _Selector,
         m: Match[str],
         has_selector: bool,
-        iselector: Iterator[Tuple[str, Match[str]]],
+        iselector: Iterator[tuple[str, Match[str]]],
         is_html: bool
-    ) -> Tuple[bool, bool]:
+    ) -> tuple[bool, bool]:
         """Parse pseudo class."""

         complex_pseudo = False
@@ -678,7 +679,7 @@ class CSSParser:
         sel: _Selector,
         m: Match[str],
         has_selector: bool,
-        iselector: Iterator[Tuple[str, Match[str]]]
+        iselector: Iterator[tuple[str, Match[str]]]
     ) -> bool:
         """Parse `nth` pseudo."""

@@ -743,7 +744,7 @@ class CSSParser:
         sel: _Selector,
         name: str,
         has_selector: bool,
-        iselector: Iterator[Tuple[str, Match[str]]],
+        iselector: Iterator[tuple[str, Match[str]]],
         index: int
     ) -> bool:
         """Parse pseudo with opening bracket."""
@@ -752,7 +753,7 @@ class CSSParser:
         if name == ':not':
             flags |= FLG_NOT
         elif name == ':has':
-            flags |= FLG_RELATIVE | FLG_FORGIVE
+            flags |= FLG_RELATIVE
         elif name in (':where', ':is'):
             flags |= FLG_FORGIVE

@@ -766,21 +767,16 @@ class CSSParser:
         sel: _Selector,
         m: Match[str],
         has_selector: bool,
-        selectors: List[_Selector],
+        selectors: list[_Selector],
         rel_type: str,
         index: int
-    ) -> Tuple[bool, _Selector, str]:
+    ) -> tuple[bool, _Selector, str]:
         """Parse combinator tokens."""

         combinator = m.group('relation').strip()
         if not combinator:
             combinator = WS_COMBINATOR
         if combinator == COMMA_COMBINATOR:
-            if not has_selector:
-                # If we've not captured any selector parts, the comma is either at the beginning of the pattern
-                # or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
-                sel.no_match = True
-
             sel.rel_type = rel_type
             selectors[-1].relations.append(sel)
             rel_type = ":" + WS_COMBINATOR
@@ -814,12 +810,12 @@ class CSSParser:
         sel: _Selector,
         m: Match[str],
         has_selector: bool,
-        selectors: List[_Selector],
-        relations: List[_Selector],
+        selectors: list[_Selector],
+        relations: list[_Selector],
         is_pseudo: bool,
         is_forgive: bool,
         index: int
-    ) -> Tuple[bool, _Selector]:
+    ) -> tuple[bool, _Selector]:
         """Parse combinator tokens."""

         combinator = m.group('relation').strip()
@@ -924,7 +920,7 @@ class CSSParser:

     def parse_selectors(
         self,
-        iselector: Iterator[Tuple[str, Match[str]]],
+        iselector: Iterator[tuple[str, Match[str]]],
         index: int = 0,
         flags: int = 0
     ) -> ct.SelectorList:
@@ -935,7 +931,7 @@ class CSSParser:
         selectors = []
         has_selector = False
         closed = False
-        relations = []  # type: List[_Selector]
+        relations = []  # type: list[_Selector]
         rel_type = ":" + WS_COMBINATOR

         # Setup various flags
@@ -1069,22 +1065,12 @@ class CSSParser:
                 selectors.append(sel)

             # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
-            elif is_forgive:
-                if is_relative:
-                    # Handle relative selectors pseudo-classes with empty slots like `:has()`
-                    if selectors and selectors[-1].rel_type is None and rel_type == ': ':
-                        sel.rel_type = rel_type
-                        sel.no_match = True
-                        selectors[-1].relations.append(sel)
-                        has_selector = True
-                else:
-                    # Handle normal pseudo-classes with empty slots
-                    if not selectors or not relations:
-                        # Others like `:is()` etc.
-                        sel.no_match = True
-                        del relations[:]
-                        selectors.append(sel)
-                        has_selector = True
+            elif is_forgive and (not selectors or not relations):
+                # Handle normal pseudo-classes with empty slots like `:is()` etc.
+                sel.no_match = True
+                del relations[:]
+                selectors.append(sel)
+                has_selector = True

         if not has_selector:
             # We will always need to finish a selector when `:has()` is used as it leads with combining.
@@ -1112,7 +1098,7 @@ class CSSParser:
         # Return selector list
         return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)

-    def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]:
+    def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
         """Iterate selector tokens."""

         # Ignore whitespace and comments at start and end of pattern
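
With `FLG_FORGIVE` dropped from `:has` and the relative empty-slot handling removed in the hunks above, an empty or dangling `:has()` argument is reported as a syntax error instead of being silently forgiven, while `:is()` and `:where()` keep their forgiving behaviour. A small sketch of the expected effect (illustrative only, not part of the patch):

    from bs4 import BeautifulSoup
    import soupsieve as sv

    soup = BeautifulSoup('<div><p>text</p></div>', 'html.parser')

    # A populated :has() still matches as before.
    print(sv.select('div:has(> p)', soup))

    # An empty :has() should now be rejected at compile time.
    try:
        sv.compile('div:has()')
    except sv.SelectorSyntaxError as exc:
        print('rejected:', exc)

    # Forgiving pseudo-classes are unchanged: this still compiles and simply matches nothing.
    print(sv.select('div:is()', soup))
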
diff --git a/lib/soupsieve/css_types.py b/lib/soupsieve/css_types.py
index e5a6e49c..a97d5f4b 100644
--- a/lib/soupsieve/css_types.py
+++ b/lib/soupsieve/css_types.py
@@ -1,7 +1,8 @@
 """CSS selector structure items."""
+from __future__ import annotations
 import copyreg
 from .pretty import pretty
-from typing import Any, Type, Tuple, Union, Dict, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
+from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping

 __all__ = (
     'Selector',
@@ -33,7 +34,7 @@ SEL_PLACEHOLDER_SHOWN = 0x400
 class Immutable:
     """Immutable."""

-    __slots__: Tuple[str, ...] = ('_hash',)
+    __slots__: tuple[str, ...] = ('_hash',)

     _hash: int

@@ -48,7 +49,7 @@ class Immutable:
         super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))

     @classmethod
-    def __base__(cls) -> "Type[Immutable]":
+    def __base__(cls) -> "type[Immutable]":
         """Get base class."""

         return cls
@@ -99,7 +100,7 @@ class ImmutableDict(Mapping[Any, Any]):

     def __init__(
         self,
-        arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]
+        arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
     ) -> None:
         """Initialize."""

@@ -107,7 +108,7 @@ class ImmutableDict(Mapping[Any, Any]):
         self._d = dict(arg)
         self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))

-    def _validate(self, arg: Union[Dict[Any, Any], Iterable[Tuple[Any, Any]]]) -> None:
+    def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
         """Validate arguments."""

         if isinstance(arg, dict):
@@ -147,12 +148,12 @@ class ImmutableDict(Mapping[Any, Any]):
 class Namespaces(ImmutableDict):
     """Namespaces."""

-    def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+    def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
         """Initialize."""

         super().__init__(arg)

-    def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+    def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
         """Validate arguments."""

         if isinstance(arg, dict):
@@ -165,12 +166,12 @@ class Namespaces(ImmutableDict):
 class CustomSelectors(ImmutableDict):
     """Custom selectors."""

-    def __init__(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+    def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
         """Initialize."""

         super().__init__(arg)

-    def _validate(self, arg: Union[Dict[str, str], Iterable[Tuple[str, str]]]) -> None:
+    def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
         """Validate arguments."""

         if isinstance(arg, dict):
@@ -188,30 +189,30 @@ class Selector(Immutable):
         'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
     )

-    tag: Optional['SelectorTag']
-    ids: Tuple[str, ...]
-    classes: Tuple[str, ...]
-    attributes: Tuple['SelectorAttribute', ...]
-    nth: Tuple['SelectorNth', ...]
-    selectors: Tuple['SelectorList', ...]
-    relation: 'SelectorList'
+    tag: Optional[SelectorTag]
+    ids: tuple[str, ...]
+    classes: tuple[str, ...]
+    attributes: tuple[SelectorAttribute, ...]
+    nth: tuple[SelectorNth, ...]
+    selectors: tuple[SelectorList, ...]
+    relation: SelectorList
     rel_type: Optional[str]
-    contains: Tuple['SelectorContains', ...]
-    lang: Tuple['SelectorLang', ...]
+    contains: tuple[SelectorContains, ...]
+    lang: tuple[SelectorLang, ...]
     flags: int

     def __init__(
         self,
-        tag: Optional['SelectorTag'],
-        ids: Tuple[str, ...],
-        classes: Tuple[str, ...],
-        attributes: Tuple['SelectorAttribute', ...],
-        nth: Tuple['SelectorNth', ...],
-        selectors: Tuple['SelectorList', ...],
-        relation: 'SelectorList',
+        tag: Optional[SelectorTag],
+        ids: tuple[str, ...],
+        classes: tuple[str, ...],
+        attributes: tuple[SelectorAttribute, ...],
+        nth: tuple[SelectorNth, ...],
+        selectors: tuple[SelectorList, ...],
+        relation: SelectorList,
         rel_type: Optional[str],
-        contains: Tuple['SelectorContains', ...],
-        lang: Tuple['SelectorLang', ...],
+        contains: tuple[SelectorContains, ...],
+        lang: tuple[SelectorLang, ...],
         flags: int
     ):
         """Initialize."""
@@ -286,7 +287,7 @@ class SelectorContains(Immutable):

     __slots__ = ("text", "own", "_hash")

-    text: Tuple[str, ...]
+    text: tuple[str, ...]
     own: bool

     def __init__(self, text: Iterable[str], own: bool) -> None:
@@ -305,9 +306,9 @@ class SelectorNth(Immutable):
     b: int
     of_type: bool
     last: bool
-    selectors: 'SelectorList'
+    selectors: SelectorList

-    def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: 'SelectorList') -> None:
+    def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None:
         """Initialize."""

         super().__init__(
@@ -325,7 +326,7 @@ class SelectorLang(Immutable):

     __slots__ = ("languages", "_hash",)

-    languages: Tuple[str, ...]
+    languages: tuple[str, ...]

     def __init__(self, languages: Iterable[str]):
         """Initialize."""
@@ -353,13 +354,13 @@ class SelectorList(Immutable):

     __slots__ = ("selectors", "is_not", "is_html", "_hash")

-    selectors: Tuple[Union['Selector', 'SelectorNull'], ...]
+    selectors: tuple[Selector | SelectorNull, ...]
     is_not: bool
     is_html: bool

     def __init__(
         self,
-        selectors: Optional[Iterable[Union['Selector', 'SelectorNull']]] = None,
+        selectors: Optional[Iterable[Selector | SelectorNull]] = None,
         is_not: bool = False,
         is_html: bool = False
     ) -> None:
@@ -371,7 +372,7 @@ class SelectorList(Immutable):
             is_html=is_html
         )

-    def __iter__(self) -> Iterator[Union['Selector', 'SelectorNull']]:
+    def __iter__(self) -> Iterator[Selector | SelectorNull]:
         """Iterator."""

         return iter(self.selectors)
@@ -381,7 +382,7 @@ class SelectorList(Immutable):

         return len(self.selectors)

-    def __getitem__(self, index: int) -> Union['Selector', 'SelectorNull']:
+    def __getitem__(self, index: int) -> Selector | SelectorNull:
         """Get item."""

         return self.selectors[index]
diff --git a/lib/soupsieve/pretty.py b/lib/soupsieve/pretty.py
index 57d16c97..f848d5e2 100644
--- a/lib/soupsieve/pretty.py
+++ b/lib/soupsieve/pretty.py
@@ -65,6 +65,7 @@ SelectorList(
     is_html=False)
 ```
 """
+from __future__ import annotations
 import re
 from typing import Any
diff --git a/lib/soupsieve/util.py b/lib/soupsieve/util.py
index 2b1ed24b..cf4dc5cc 100644
--- a/lib/soupsieve/util.py
+++ b/lib/soupsieve/util.py
@@ -1,8 +1,9 @@
 """Utility."""
+from __future__ import annotations
 from functools import wraps, lru_cache
 import warnings
 import re
-from typing import Callable, Any, Optional, Tuple, List
+from typing import Callable, Any, Optional

 DEBUG = 0x00001

@@ -75,13 +76,13 @@ def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no co
     )


-def get_pattern_context(pattern: str, index: int) -> Tuple[str, int, int]:
+def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
     """Get the pattern context."""

     last = 0
     current_line = 1
     col = 1
-    text = []  # type: List[str]
+    text = []  # type: list[str]
     line = 1
     offset = None  # type: Optional[int]

diff --git a/requirements.txt b/requirements.txt
index 050b9add..b5e6dd57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ arrow==1.2.3
 backports.csv==1.0.7
 backports.functools-lru-cache==1.6.4
 backports.zoneinfo==0.2.1;python_version<"3.9"
-beautifulsoup4==4.11.1
+beautifulsoup4==4.11.2
 bleach==6.0.0
 certifi==2022.12.7
 cheroot==9.0.0
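
The `from __future__ import annotations` line added to each soupsieve module above is what lets the new `list[...]`/`tuple[...]` generics and `X | Y` unions appear in annotations while the library still runs on interpreters older than 3.9/3.10: under PEP 563 the annotations stay unevaluated strings, and the rewritten `# type:` comments are never executed at all. A minimal sketch of the pattern, with a hypothetical helper that is not from the patch:

    from __future__ import annotations


    def first_words(text: str | None, limit: int = 0) -> list[str]:
        """Return up to `limit` words from `text`; 0 means no limit (hypothetical helper)."""
        words = (text or '').split()
        return words[:limit] if limit else words


    print(first_words('a b c', 2))  # ['a', 'b']
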