Update soupsieve-2.2.1

This commit is contained in:
JonnyWong16 2021-10-14 20:45:43 -07:00
parent 9a54fb9a44
commit b581460b51
No known key found for this signature in database
GPG key ID: B1F1F9807184697A
6 changed files with 146 additions and 233 deletions

View file

@ -1,10 +1,11 @@
"""CSS selector parser."""
from __future__ import unicode_literals
import re
from functools import lru_cache
from . import util
from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -59,6 +60,8 @@ PSEUDO_SIMPLE_NO_MATCH = {
# Complex pseudo classes that take selector lists
PSEUDO_COMPLEX = {
':contains',
':-soup-contains',
':-soup-contains-own',
':has',
':is',
':matches',
@ -117,9 +120,11 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`)
PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER)
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
@ -196,7 +201,7 @@ FLG_PLACEHOLDER_SHOWN = 0x200
_MAXCACHE = 500
@util.lru_cache(maxsize=_MAXCACHE)
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags):
"""Cached CSS compile."""
@ -245,7 +250,7 @@ def css_unescape(content, string=False):
codepoint = int(m.group(1)[1:], 16)
if codepoint == 0:
codepoint = UNICODE_REPLACEMENT_CHAR
value = util.uchr(codepoint)
value = chr(codepoint)
elif m.group(2):
value = m.group(2)[1:]
elif m.group(3):
@ -269,7 +274,7 @@ def escape(ident):
string.append('\\{}'.format(ident))
else:
for index, c in enumerate(ident):
codepoint = util.uord(c)
codepoint = ord(c)
if codepoint == 0x00:
string.append('\ufffd')
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
@ -300,12 +305,7 @@ class SelectorPattern(object):
return self.name
def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
def match(self, selector, index, flags):
"""Match the selector."""
return self.re_pattern.match(selector, index)
@ -320,7 +320,7 @@ class SpecialPseudoPattern(SelectorPattern):
self.patterns = {}
for p in patterns:
name = p[0]
pattern = SelectorPattern(name, p[2])
pattern = p[3](name, p[2])
for pseudo in p[1]:
self.patterns[pseudo] = pattern
@ -332,12 +332,7 @@ class SpecialPseudoPattern(SelectorPattern):
return self.matched_name.get_name()
def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
def match(self, selector, index, flags):
"""Match the selector."""
pseudo = None
@ -346,7 +341,7 @@ class SpecialPseudoPattern(SelectorPattern):
name = util.lower(css_unescape(m.group('name')))
pattern = self.patterns.get(name)
if pattern:
pseudo = pattern.match(selector, index)
pseudo = pattern.match(selector, index, flags)
if pseudo:
self.matched_name = pattern
@ -429,11 +424,16 @@ class CSSParser(object):
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
SpecialPseudoPattern(
(
("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
(
"pseudo_contains",
(':contains', ':-soup-contains', ':-soup-contains-own'),
PAT_PSEUDO_CONTAINS,
SelectorPattern
),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
)
),
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
@ -461,15 +461,11 @@ class CSSParser(object):
inverse = False
op = m.group('cmp')
case = util.lower(m.group('case')) if m.group('case') else None
parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
ns = ''
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
attr = css_unescape(m.group('attr_name'))
is_type = False
pattern2 = None
if len(parts) > 1:
ns = parts[0]
attr = parts[1]
else:
attr = parts[0]
if case:
flags = re.I if case == 'i' else 0
elif util.lower(attr) == 'type':
@ -532,13 +528,8 @@ class CSSParser(object):
def parse_tag_pattern(self, sel, m, has_selector):
"""Parse tag pattern from regex match."""
parts = [css_unescape(x) for x in m.group(0).split('|')]
if len(parts) > 1:
prefix = parts[0]
tag = parts[1]
else:
tag = parts[0]
prefix = None
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
tag = css_unescape(m.group('tag_name'))
sel.tag = ct.SelectorTag(tag, prefix)
has_selector = True
return has_selector
@ -817,7 +808,14 @@ class CSSParser(object):
def parse_pseudo_contains(self, sel, m, has_selector):
"""Parse contains."""
values = m.group('values')
pseudo = util.lower(css_unescape(m.group('name')))
if pseudo == ":contains":
warnings.warn(
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
FutureWarning
)
contains_own = pseudo == ":-soup-contains-own"
values = css_unescape(m.group('values'))
patterns = []
for token in RE_VALUES.finditer(values):
if token.group('split'):
@ -828,7 +826,7 @@ class CSSParser(object):
else:
value = css_unescape(value)
patterns.append(value)
sel.contains.append(ct.SelectorContains(tuple(patterns)))
sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own))
has_selector = True
return has_selector
@ -918,7 +916,7 @@ class CSSParser(object):
elif key == 'pseudo_class':
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
elif key == 'pseudo_element':
raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
elif key == 'pseudo_contains':
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
@ -1027,9 +1025,7 @@ class CSSParser(object):
while index <= end:
m = None
for v in self.css_tokens:
if not v.enabled(self.flags): # pragma: no cover
continue
m = v.match(pattern, index)
m = v.match(pattern, index, self.flags)
if m:
name = v.get_name()
if self.debug: # pragma: no cover
@ -1067,7 +1063,7 @@ class CSSParser(object):
# CSS pattern for `:link` and `:any-link`
CSS_LINK = CSSParser(
'html|*:is(a, area, link)[href]'
'html|*:is(a, area)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`
CSS_CHECKED = CSSParser(
@ -1098,23 +1094,23 @@ CSS_INDETERMINATE = CSSParser(
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|input[type="radio"][name][name!='']:not([checked])
html|input[type="radio"][name]:not([name='']):not([checked])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
CSS_DISABLED = CSSParser(
'''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`
CSS_ENABLED = CSSParser(
'''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`
@ -1138,8 +1134,8 @@ CSS_PLACEHOLDER_SHOWN = CSSParser(
[type=email],
[type=password],
[type=number]
)[placeholder][placeholder!='']:is(:not([value]), [value=""]),
html|textarea[placeholder][placeholder!='']
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
html|textarea[placeholder]:not([placeholder=''])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature