mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2025-08-21 05:44:02 -07:00
[jsinterp] Fix regexp parsing and .replace[All] method
* For performance, make regexp object instantiation lazy
* Other small performance improvements
This commit is contained in:
parent
3e92c60fcd
commit
0402710227
2 changed files with 93 additions and 37 deletions
|
@ -12,9 +12,11 @@ from .utils import (
|
|||
js_to_json,
|
||||
remove_quotes,
|
||||
unified_timestamp,
|
||||
variadic,
|
||||
)
|
||||
from .compat import (
|
||||
compat_basestring,
|
||||
compat_chr,
|
||||
compat_collections_chain_map as ChainMap,
|
||||
compat_itertools_zip_longest as zip_longest,
|
||||
compat_str,
|
||||
|
@ -205,10 +207,10 @@ class JSInterpreter(object):
|
|||
super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
|
||||
|
||||
class JS_RegExp(object):
|
||||
_RE_FLAGS = {
|
||||
RE_FLAGS = {
|
||||
# special knowledge: Python's re flags are bitmask values, current max 128
|
||||
# invent new bitmask values well above that for literal parsing
|
||||
# TODO: new pattern class to execute matches with these flags
|
||||
# TODO: execute matches with these flags (remaining: d, y)
|
||||
'd': 1024, # Generate indices for substring matches
|
||||
'g': 2048, # Global search
|
||||
'i': re.I, # Case-insensitive search
|
||||
|
@ -218,12 +220,19 @@ class JSInterpreter(object):
|
|||
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
|
||||
}
|
||||
|
||||
def __init__(self, pattern_txt, flags=''):
|
||||
def __init__(self, pattern_txt, flags=0):
|
||||
if isinstance(flags, compat_str):
|
||||
flags, _ = self.regex_flags(flags)
|
||||
# Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
|
||||
# First, avoid https://github.com/python/cpython/issues/74534
|
||||
self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags)
|
||||
self.__self = None
|
||||
self.__pattern_txt = pattern_txt.replace('[[', r'[\[')
|
||||
self.__flags = flags
|
||||
|
||||
def __instantiate(self):
|
||||
if self.__self:
|
||||
return
|
||||
self.__self = re.compile(self.__pattern_txt, self.__flags)
|
||||
# Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
|
||||
for name in dir(self.__self):
|
||||
# Only these? Obviously __class__, __init__.
|
||||
# PyPy creates a __weakref__ attribute with value None
|
||||
|
@ -232,15 +241,21 @@ class JSInterpreter(object):
|
|||
continue
|
||||
setattr(self, name, getattr(self.__self, name))
|
||||
|
||||
def __getattr__(self, name):
    """Resolve an attribute that normal lookup on this object did not find.

    The wrapped pattern is compiled on first use (see __instantiate()),
    and the request is then answered by the compiled pattern object.

    Raises AttributeError if the compiled pattern has no such attribute.
    """
    self.__instantiate()
    # This hook is only invoked after normal lookup on `self` has already
    # failed, so probing `self` again with hasattr()/getattr() re-enters
    # the hook without bound for an unknown name, and `object` provides no
    # __getattr__ to defer to. Ask the compiled pattern directly instead.
    try:
        return getattr(self.__self, name)
    except AttributeError:
        raise AttributeError(
            '{0!r} object has no attribute {1!r}'.format(
                type(self).__name__, name))
|
||||
|
||||
@classmethod
|
||||
def regex_flags(cls, expr):
|
||||
flags = 0
|
||||
if not expr:
|
||||
return flags, expr
|
||||
for idx, ch in enumerate(expr):
|
||||
if ch not in cls._RE_FLAGS:
|
||||
if ch not in cls.RE_FLAGS:
|
||||
break
|
||||
flags |= cls._RE_FLAGS[ch]
|
||||
flags |= cls.RE_FLAGS[ch]
|
||||
return flags, expr[idx + 1:]
|
||||
|
||||
@classmethod
|
||||
|
@ -265,17 +280,17 @@ class JSInterpreter(object):
|
|||
counters = dict((k, 0) for k in _MATCHING_PARENS.values())
|
||||
start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
|
||||
in_quote, escaping, skipping = None, False, 0
|
||||
after_op, in_regex_char_group, skip_re = True, False, 0
|
||||
after_op, in_regex_char_group = True, False
|
||||
|
||||
for idx, char in enumerate(expr):
|
||||
if skip_re > 0:
|
||||
skip_re -= 1
|
||||
continue
|
||||
paren_delta = 0
|
||||
if not in_quote:
|
||||
if char in _MATCHING_PARENS:
|
||||
counters[_MATCHING_PARENS[char]] += 1
|
||||
paren_delta = 1
|
||||
elif char in counters:
|
||||
counters[char] -= 1
|
||||
paren_delta = -1
|
||||
if not escaping:
|
||||
if char in _QUOTES and in_quote in (char, None):
|
||||
if in_quote or after_op or char != '/':
|
||||
|
@ -283,7 +298,7 @@ class JSInterpreter(object):
|
|||
elif in_quote == '/' and char in '[]':
|
||||
in_regex_char_group = char == '['
|
||||
escaping = not escaping and in_quote and char == '\\'
|
||||
after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op))
|
||||
after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and char.isspace()))
|
||||
|
||||
if char != delim[pos] or any(counters.values()) or in_quote:
|
||||
pos = skipping = 0
|
||||
|
@ -293,7 +308,7 @@ class JSInterpreter(object):
|
|||
continue
|
||||
elif pos == 0 and skip_delims:
|
||||
here = expr[idx:]
|
||||
for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]:
|
||||
for s in variadic(skip_delims):
|
||||
if here.startswith(s) and s:
|
||||
skipping = len(s) - 1
|
||||
break
|
||||
|
@ -316,7 +331,7 @@ class JSInterpreter(object):
|
|||
separated = list(cls._separate(expr, delim, 1))
|
||||
|
||||
if len(separated) < 2:
|
||||
raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals()))
|
||||
raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals()))
|
||||
return separated[0][1:].strip(), separated[1].strip()
|
||||
|
||||
@staticmethod
|
||||
|
@ -361,6 +376,20 @@ class JSInterpreter(object):
|
|||
except TypeError:
|
||||
return self._named_object(namespace, obj)
|
||||
|
||||
# Patterns used below, compiled once here rather than passed as literals
# to re.match() at each use.

# Matched against the start of a statement: a var/const/let declaration
# (group `var`), a `return`, or a `throw` (group `throw`).
_VAR_RET_THROW_RE = re.compile(r'''(?x)
(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+)
''')
# Matched against the start of an expression: the opening of a compound
# statement, with one named group per keyword; each alternative ends on
# the opening `{` or `(`.
_COMPOUND_RE = re.compile(r'''(?x)
(?P<try>try)\s*\{|
(?P<if>if)\s*\(|
(?P<switch>switch)\s*\(|
(?P<for>for)\s*\(|
(?P<while>while)\s*\(
''')
# Opening of a `finally {` block.
_FINALLY_RE = re.compile(r'finally\s*\{')
# Opening of a `switch (` header.
_SWITCH_RE = re.compile(r'switch\s*\(')
|
||||
|
||||
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
|
||||
if allow_recursion < 0:
|
||||
raise self.Exception('Recursion limit reached')
|
||||
|
@ -375,7 +404,7 @@ class JSInterpreter(object):
|
|||
if should_return:
|
||||
return ret, should_return
|
||||
|
||||
m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt)
|
||||
m = self._VAR_RET_THROW_RE.match(stmt)
|
||||
if m:
|
||||
expr = stmt[len(m.group(0)):].strip()
|
||||
if m.group('throw'):
|
||||
|
@ -447,13 +476,7 @@ class JSInterpreter(object):
|
|||
for item in self._separate(inner)])
|
||||
expr = name + outer
|
||||
|
||||
m = re.match(r'''(?x)
|
||||
(?P<try>try)\s*\{|
|
||||
(?P<if>if)\s*\(|
|
||||
(?P<switch>switch)\s*\(|
|
||||
(?P<for>for)\s*\(|
|
||||
(?P<while>while)\s*\(
|
||||
''', expr)
|
||||
m = self._COMPOUND_RE.match(expr)
|
||||
md = m.groupdict() if m else {}
|
||||
if md.get('if'):
|
||||
cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
|
||||
|
@ -512,7 +535,7 @@ class JSInterpreter(object):
|
|||
err = None
|
||||
pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion)
|
||||
|
||||
m = re.match(r'finally\s*\{', expr)
|
||||
m = self._FINALLY_RE.match(expr)
|
||||
if m:
|
||||
sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
|
||||
ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
|
||||
|
@ -531,7 +554,7 @@ class JSInterpreter(object):
|
|||
if remaining.startswith('{'):
|
||||
body, expr = self._separate_at_paren(remaining)
|
||||
else:
|
||||
switch_m = re.match(r'switch\s*\(', remaining) # FIXME
|
||||
switch_m = self._SWITCH_RE.match(remaining) # FIXME
|
||||
if switch_m:
|
||||
switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:])
|
||||
body, expr = self._separate_at_paren(remaining, '}')
|
||||
|
@ -735,7 +758,7 @@ class JSInterpreter(object):
|
|||
if obj == compat_str:
|
||||
if member == 'fromCharCode':
|
||||
assertion(argvals, 'takes one or more arguments')
|
||||
return ''.join(map(chr, argvals))
|
||||
return ''.join(map(compat_chr, argvals))
|
||||
raise self.Exception('Unsupported string method ' + member, expr=expr)
|
||||
elif obj == float:
|
||||
if member == 'pow':
|
||||
|
@ -808,10 +831,17 @@ class JSInterpreter(object):
|
|||
if idx >= len(obj):
|
||||
return None
|
||||
return ord(obj[idx])
|
||||
elif member == 'replace':
|
||||
elif member in ('replace', 'replaceAll'):
|
||||
assertion(isinstance(obj, compat_str), 'must be applied on a string')
|
||||
assertion(len(argvals) == 2, 'takes exactly two arguments')
|
||||
return re.sub(argvals[0], argvals[1], obj)
|
||||
# TODO: argvals[1] callable, other Py vs JS edge cases
|
||||
if isinstance(argvals[0], self.JS_RegExp):
|
||||
count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1
|
||||
assertion(member != 'replaceAll' or count == 0,
|
||||
'replaceAll must be called with a global RegExp')
|
||||
return argvals[0].sub(argvals[1], obj, count=count)
|
||||
count = ('replaceAll', 'replace').index(member)
|
||||
return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count)
|
||||
|
||||
idx = int(member) if isinstance(obj, list) else member
|
||||
return obj[idx](argvals, allow_recursion=allow_recursion)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue