mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2025-07-05 20:41:47 -07:00
[JSInterp] Add tests and relevant functionality from yt-dlp
* thx seproDev, bashonly: yt-dlp/yt-dlp#12760, yt-dlp/yt-dlp#12761:
  - Improve nested attribute support
  - Pass global stack when extracting objects
  - interpret_statement: Match attribute before indexing
  - Fix assignment to array elements with nested brackets
  - Add new signature tests
  - Invalidate JS function cache
  - Avoid testdata dupes now that we cache by URL
* rework nsig function name search
* fully fixes #33102
* update cache required versions
* update program version
This commit is contained in:
parent
bd2ded59f2
commit
c052a16f72
5 changed files with 129 additions and 22 deletions
|
@ -1652,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
assert os.path.basename(func_id) == func_id
|
||||
|
||||
self.write_debug('Extracting signature function {0}'.format(func_id))
|
||||
cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
|
||||
cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None
|
||||
|
||||
if not cache_spec:
|
||||
code = self._load_player(video_id, player_url, player_id)
|
||||
|
@ -1813,6 +1813,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
return ret
|
||||
|
||||
def _extract_n_function_name(self, jscode):
|
||||
func_name, idx = None, None
|
||||
# these special cases are redundant and probably obsolete (2025-04):
|
||||
# they make the tests run ~10% faster without fallback warnings
|
||||
r"""
|
||||
func_name, idx = self._search_regex(
|
||||
# (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
|
||||
# (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
|
||||
|
@ -1839,9 +1843,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
\(\s*[\w$]+\s*\)
|
||||
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
|
||||
default=(None, None))
|
||||
"""
|
||||
|
||||
if not func_name:
|
||||
# nfunc=function(x){...}|function nfunc(x); ...
|
||||
# ... var y=[nfunc]|y[idx]=nfunc);
|
||||
# obvious REs hang, so use a two-stage tactic
|
||||
for m in re.finditer(r'''(?x)
|
||||
[\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s*
|
||||
(?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
|
||||
\s*?[;\n]
|
||||
''', jscode):
|
||||
func_name = self._search_regex(
|
||||
r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
|
||||
re.escape(m.group('nfunc')), '{'),
|
||||
jscode, 'Initial JS player n function name (2)', group=2, default=None)
|
||||
if func_name:
|
||||
idx = m.group('idx')
|
||||
break
|
||||
|
||||
# thx bashonly: yt-dlp/yt-dlp/pull/10611
|
||||
if not func_name:
|
||||
self.report_warning('Falling back to generic n function search')
|
||||
self.report_warning('Falling back to generic n function search', only_once=True)
|
||||
return self._search_regex(
|
||||
r'''(?xs)
|
||||
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
|
||||
|
@ -1855,14 +1878,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
return func_name
|
||||
|
||||
return self._search_json(
|
||||
r'var\s+{0}\s*='.format(re.escape(func_name)), jscode,
|
||||
r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,
|
||||
'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
|
||||
func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]',
|
||||
func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',
|
||||
transform_source=js_to_json)[int(idx)]
|
||||
|
||||
def _extract_n_function_code(self, video_id, player_url):
|
||||
player_id = self._extract_player_info(player_url)
|
||||
func_code = self.cache.load('youtube-nsig', player_id)
|
||||
func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07')
|
||||
jscode = func_code or self._load_player(video_id, player_url)
|
||||
jsi = JSInterpreter(jscode)
|
||||
|
||||
|
|
|
@ -303,8 +303,6 @@ _UNARY_OPERATORS_X = (
|
|||
('!', _js_unary_op(lambda x: _js_ternary(x, if_true=False, if_false=True))),
|
||||
)
|
||||
|
||||
_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS))
|
||||
|
||||
_COMP_OPERATORS = (
|
||||
('===', _js_id_op(operator.is_)),
|
||||
('!==', _js_id_op(operator.is_not)),
|
||||
|
@ -316,9 +314,12 @@ _COMP_OPERATORS = (
|
|||
('>', _js_comp_op(operator.gt)),
|
||||
)
|
||||
|
||||
_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS + _SC_OPERATORS))
|
||||
|
||||
_NAME_RE = r'[a-zA-Z_$][\w$]*'
|
||||
_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))
|
||||
_QUOTES = '\'"/'
|
||||
_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?'
|
||||
|
||||
|
||||
class JS_Break(ExtractorError):
|
||||
|
@ -1088,15 +1089,18 @@ class JSInterpreter(object):
|
|||
|
||||
m = re.match(r'''(?x)
|
||||
(?P<assign>
|
||||
(?P<out>{_NAME_RE})(?:\[(?P<out_idx>(?:.+?\]\s*\[)*.+?)\])?\s*
|
||||
(?P<out>{_NAME_RE})(?P<out_idx>(?:\[{_NESTED_BRACKETS}\])+)?\s*
|
||||
(?P<op>{_OPERATOR_RE})?
|
||||
=(?!=)(?P<expr>.*)$
|
||||
)|(?P<return>
|
||||
(?!if|return|true|false|null|undefined|NaN|Infinity)(?P<name>{_NAME_RE})$
|
||||
)|(?P<indexing>
|
||||
(?P<in>{_NAME_RE})\[(?P<in_idx>(?:.+?\]\s*\[)*.+?)\]$
|
||||
)|(?P<attribute>
|
||||
(?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s*
|
||||
(?P<var>{_NAME_RE})(?:
|
||||
(?P<nullish>\?)?\.(?P<member>[^(]+)|
|
||||
\[(?P<member2>{_NESTED_BRACKETS})\]
|
||||
)\s*
|
||||
)|(?P<indexing>
|
||||
(?P<in>{_NAME_RE})(?P<in_idx>\[.+\])$
|
||||
)|(?P<function>
|
||||
(?P<fname>{_NAME_RE})\((?P<args>.*)\)$
|
||||
)'''.format(**globals()), expr)
|
||||
|
@ -1111,10 +1115,11 @@ class JSInterpreter(object):
|
|||
elif left_val in (None, JS_Undefined):
|
||||
raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr)
|
||||
|
||||
indexes = re.split(r'\]\s*\[', m.group('out_idx'))
|
||||
for i, idx in enumerate(indexes, 1):
|
||||
indexes = md['out_idx']
|
||||
while indexes:
|
||||
idx, indexes = self._separate_at_paren(indexes)
|
||||
idx = self.interpret_expression(idx, local_vars, allow_recursion)
|
||||
if i < len(indexes):
|
||||
if indexes:
|
||||
left_val = self._index(left_val, idx)
|
||||
if isinstance(idx, float):
|
||||
idx = int(idx)
|
||||
|
@ -1159,7 +1164,9 @@ class JSInterpreter(object):
|
|||
|
||||
if md.get('indexing'):
|
||||
val = local_vars[m.group('in')]
|
||||
for idx in re.split(r'\]\s*\[', m.group('in_idx')):
|
||||
indexes = m.group('in_idx')
|
||||
while indexes:
|
||||
idx, indexes = self._separate_at_paren(indexes)
|
||||
idx = self.interpret_expression(idx, local_vars, allow_recursion)
|
||||
val = self._index(val, idx)
|
||||
return val, should_return
|
||||
|
@ -1204,7 +1211,7 @@ class JSInterpreter(object):
|
|||
if obj is JS_Undefined:
|
||||
try:
|
||||
if variable not in self._objects:
|
||||
self._objects[variable] = self.extract_object(variable)
|
||||
self._objects[variable] = self.extract_object(variable, local_vars)
|
||||
obj = self._objects[variable]
|
||||
except self.Exception:
|
||||
if not nullish:
|
||||
|
@ -1215,7 +1222,7 @@ class JSInterpreter(object):
|
|||
|
||||
# Member access
|
||||
if arg_str is None:
|
||||
return self._index(obj, member)
|
||||
return self._index(obj, member, nullish)
|
||||
|
||||
# Function call
|
||||
argvals = [
|
||||
|
@ -1400,7 +1407,7 @@ class JSInterpreter(object):
|
|||
for v in self._separate(list_txt):
|
||||
yield self.interpret_expression(v, local_vars, allow_recursion)
|
||||
|
||||
def extract_object(self, objname):
|
||||
def extract_object(self, objname, *global_stack):
|
||||
_FUNC_NAME_RE = r'''(?:{n}|"{n}"|'{n}')'''.format(n=_NAME_RE)
|
||||
obj = {}
|
||||
fields = next(filter(None, (
|
||||
|
@ -1421,7 +1428,8 @@ class JSInterpreter(object):
|
|||
fields):
|
||||
argnames = self.build_arglist(f.group('args'))
|
||||
name = remove_quotes(f.group('key'))
|
||||
obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), 'F<{0}>'.format(name))
|
||||
obj[name] = function_with_repr(
|
||||
self.build_function(argnames, f.group('code'), *global_stack), 'F<{0}>'.format(name))
|
||||
|
||||
return obj
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
__version__ = '2021.12.17'
|
||||
__version__ = '2025.04.07'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue