From 35248fc0a954d13f1cccc46cc9ccf25da2c43bc4 Mon Sep 17 00:00:00 2001 From: unknown <7951720+u-n-k-n-o-w-n@users.noreply.github.com> Date: Sat, 20 Aug 2022 11:27:13 +0900 Subject: [PATCH 1/5] [options] Added workaround option to execute "n_function" --- test/test_youtube_signature.py | 14 +++++- youtube_dl/__init__.py | 1 + youtube_dl/compat.py | 6 +++ youtube_dl/extractor/youtube.py | 75 +++++++++++++++++++++++++++++++++ youtube_dl/options.py | 3 ++ 5 files changed, 98 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 98221b9c2..0690e0383 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -408,7 +408,10 @@ def t_factory(name, sig_func, url_pattern): jscode = testf.read() self.assertEqual(sig_func(jscode, sig_input), expected_sig) - test_func.__name__ = str('test_{0}_js_{1}'.format(name, test_id)) + tn = name + if name.endswith('_wd'): + tn = name[:-3] + test_func.__name__ = str('test_{0}_js_{1}'.format(tn, test_id)) setattr(TestSignature, test_func.__name__, test_func) return make_tfunc @@ -428,6 +431,10 @@ def n_sig(jscode, sig_input): return ie._extract_n_function_from_code(jsi, func_code)(sig_input) +def n_sig_wd(jscode, sig_input): + return YoutubeIE(FakeYDL())._call_n_function_with_webdriver('chrome', jscode, sig_input) + + make_sig_test = t_factory( 'signature', signature, re.compile(r'''(?x) @@ -442,6 +449,11 @@ make_nsig_test = t_factory( for test_spec in _NSIG_TESTS: make_nsig_test(*test_spec) +make_nsig_wd_test = t_factory( + 'nsig_wd', n_sig_wd, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_wd_test(*test_spec) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3c1272e7b..15dcc89ca 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -419,6 +419,7 @@ def _real_main(argv=None): 'call_home': opts.call_home, 'sleep_interval': 
opts.sleep_interval, 'max_sleep_interval': opts.max_sleep_interval, + 'webdriver': opts.webdriver, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index ebe22bdf9..71f6e4571 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2491,6 +2491,11 @@ except ImportError: # compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, # compat_urllib_parse_urlencode, # compat_urllib_parse_parse_qs +try: + from urllib.parse import quote as compat_urllib_quote +except ImportError: # Python 2 + from urllib import quote as compat_urllib_quote + try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -3719,6 +3724,7 @@ __all__ = [ 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', + 'compat_urllib_quote', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b31798729..15c89864b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -12,6 +12,7 @@ import re import string import time import traceback +import importlib from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -22,6 +23,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, + compat_urllib_quote, compat_urllib_parse_urlparse, compat_zip as zip, ) @@ -1604,6 +1606,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} + self._webdriver = None + + def __del__(self): + if self._webdriver is not None: + self._webdriver.quit() # *ytcfgs, webpage=None def _extract_player_url(self, *ytcfgs, **kw_webpage): @@ -1852,6 
+1859,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url is None: raise ExtractorError('Cannot decrypt nsig without player_url') + webdriver_type = self._downloader.params.get('webdriver', None) + if webdriver_type is not None: + try: + jscode = self._load_player(video_id, player_url) + ret = self._call_n_function_with_webdriver(webdriver_type, jscode, n) + except Exception as e: + self.report_warning( + '%s (%s %s)' % ( + 'Unable to decode n-parameter: download likely to be throttled', + error_to_compat_str(e), + traceback.format_exc()), + video_id=video_id) + return + self.write_debug('Decrypted nsig(with webdriver) {0} => {1}'.format(n, ret)) + return ret + try: jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) except ExtractorError as e: @@ -1875,6 +1898,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.write_debug('Decrypted nsig {0} => {1}'.format(n, ret)) return ret + def _call_n_function_with_webdriver(self, webdriver_type, jscode, n_param): + if self._webdriver is None: + wd = importlib.import_module('selenium.webdriver') + if webdriver_type == 'firefox': # geckodriver + o = wd.FirefoxOptions() + o.headless = True + s = wd.firefox.service.Service(log_path=os.path.devnull) + self._webdriver = wd.Firefox(options=o, service=s) + elif webdriver_type == 'chrome': # chromedriver + o = wd.ChromeOptions() + o.headless = True + """ + If you are using the snap version of the chromium, chromedriver is included in the snap package. + You should use that driver. 
+ $ cd /snap/bin && sudo ln -s -T chromium.chromedriver chromedriver + or + s = wd.chrome.service.Service(executable_path='chromium.chromedriver') + self._webdriver = wd.Chrome(options=o, service=s) + """ + self._webdriver = wd.Chrome(options=o) + elif webdriver_type == 'edge': # msedgedriver + o = wd.EdgeOptions() + o.headless = True + self._webdriver = wd.Edge(options=o) + elif webdriver_type == 'safari': # safaridriver + """ + safaridriver does not have headless-mode. :( + But macOS includes safaridriver by default. + To enable automation on safaridriver, run the following command once from the admin terminal. + # safaridriver --enable + """ + self._webdriver = wd.Safari() + else: + raise ExtractorError('unsupported webdriver type: %s' % (webdriver_type)) + self._webdriver.get('about:blank') + funcname = self._extract_n_function_name(jscode) + alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + dummyfunc = ''.join(random.choice(alphabet) for _ in range(8)) + f = ('return ((e) => {{' + 'const d = decodeURIComponent(e);' + 'const p = d.lastIndexOf("}}");' + 'const th = d.substring(0, p);' + 'const bh = d.substring(p);' + 'const m = "var {0};" + th + ";{0} = {1};" + bh;' + 'const s = document.createElement("script");' + 's.innerHTML = m;' + 'document.body.append(s);' + 'return {0}("{2}");' + '}})("{3}");').format(dummyfunc, funcname, n_param, compat_urllib_quote(jscode)) + n = self._webdriver.execute_script(f) + return n + def _extract_n_function_name(self, jscode): func_name, idx = None, None diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 61705d1f0..27dcc4807 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -576,6 +576,9 @@ def parseOpts(overrideArguments=None): 'Upper bound of a range for randomized sleep before each download ' '(maximum possible number of seconds to sleep). 
Must only be used ' 'along with --min-sleep-interval.')) + workarounds.add_option( + '--webdriver', metavar='TYPE', dest='webdriver', default=None, + help='Specify webdriver type when you want to use selenium webdriver to execute "n_function" : "firefox", "chrome", "edge", or "safari"') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( From ec5462fc8a49daa05be6b81d7426813809586acb Mon Sep 17 00:00:00 2001 From: unknown <7951720+u-n-k-n-o-w-n@users.noreply.github.com> Date: Sun, 21 Aug 2022 23:25:54 +0900 Subject: [PATCH 2/5] webdriver tests only when specified --- test/test_youtube_signature.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0690e0383..da4da947a 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -449,11 +449,17 @@ make_nsig_test = t_factory( for test_spec in _NSIG_TESTS: make_nsig_test(*test_spec) -make_nsig_wd_test = t_factory( - 'nsig_wd', n_sig_wd, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) -for test_spec in _NSIG_TESTS: - make_nsig_wd_test(*test_spec) - +test_wd = False +for arg in sys.argv: + if arg == '--test_wd': + test_wd = True + break +if test_wd: + sys.argv = [arg for arg in sys.argv if arg != '--test_wd'] + make_nsig_wd_test = t_factory( + 'nsig_wd', n_sig_wd, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) + for test_spec in _NSIG_TESTS: + make_nsig_wd_test(*test_spec) if __name__ == '__main__': unittest.main() From 2fbc8f99f5936838ad1b56a6000a521061b2bc72 Mon Sep 17 00:00:00 2001 From: unknown <7951720+u-n-k-n-o-w-n@users.noreply.github.com> Date: Mon, 22 Aug 2022 19:10:12 +0900 Subject: [PATCH 3/5] JS operation by webdriver was made into a component --- youtube_dl/extractor/youtube.py | 118 +++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 15c89864b..5d8250f3e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -30,6 +30,7 @@ from ..compat import ( from ..jsinterp import JSInterpreter from ..utils import ( bug_reports_message, + check_executable, clean_html, dict_get, error_to_compat_str, @@ -1606,11 +1607,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} - self._webdriver = None - - def __del__(self): - if self._webdriver is not None: - self._webdriver.quit() + self._webdriver_wrapper = None # *ytcfgs, webpage=None def _extract_player_url(self, *ytcfgs, **kw_webpage): @@ -1899,40 +1896,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret def _call_n_function_with_webdriver(self, webdriver_type, jscode, n_param): - if self._webdriver is None: - wd = importlib.import_module('selenium.webdriver') - if webdriver_type == 'firefox': # geckodriver - o = wd.FirefoxOptions() - o.headless = True - s = wd.firefox.service.Service(log_path=os.path.devnull) - self._webdriver = wd.Firefox(options=o, service=s) - elif webdriver_type == 'chrome': # chromedriver - o = wd.ChromeOptions() - o.headless = True - """ - If you are using the snap version of the chromium, chromedriver is included in the snap package. - You should use that driver. - $ cd /snap/bin && sudo ln -s -T chromium.chromedriver chromedriver - or - s = wd.chrome.service.Service(executable_path='chromium.chromedriver') - self._webdriver = wd.Chrome(options=o, service=s) - """ - self._webdriver = wd.Chrome(options=o) - elif webdriver_type == 'edge': # msedgedriver - o = wd.EdgeOptions() - o.headless = True - self._webdriver = wd.Edge(options=o) - elif webdriver_type == 'safari': # safaridriver - """ - safaridriver does not have headless-mode. :( - But macOS includes safaridriver by default. - To enable automation on safaridriver, run the following command once from the admin terminal. 
- # safaridriver --enable - """ - self._webdriver = wd.Safari() - else: - raise ExtractorError('unsupported webdriver type: %s' % (webdriver_type)) - self._webdriver.get('about:blank') + if self._webdriver_wrapper is None: + self._webdriver_wrapper = WebDriverJSWrapper(webdriver_type) + self._webdriver_wrapper.get('about:blank') funcname = self._extract_n_function_name(jscode) alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' dummyfunc = ''.join(random.choice(alphabet) for _ in range(8)) @@ -1947,7 +1913,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'document.body.append(s);' 'return {0}("{2}");' '}})("{3}");').format(dummyfunc, funcname, n_param, compat_urllib_quote(jscode)) - n = self._webdriver.execute_script(f) + n = self._webdriver_wrapper.executeJS(f) return n def _extract_n_function_name(self, jscode): @@ -4323,3 +4289,75 @@ class YoutubeTruncatedIDIE(InfoExtractor): raise ExtractorError( 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), expected=True) + + +class WebDriverJSWrapper(object): + """WebDriver Wrapper class""" + + def __init__(self, webdriver_type, pageload_timeout=10, script_timeout=5): + self._webdriver = None + try: + wd = importlib.import_module('selenium.webdriver') + except ImportError as e: + self._raise_exception('Failed to import module "selenium.webdriver"', cause=e) + + if webdriver_type == 'firefox': # geckodriver + if not check_executable('geckodriver', ['--version']): + self._raise_exception('geckodriver not found in PATH') + o = wd.FirefoxOptions() + o.headless = True + s = wd.firefox.service.Service(log_path=os.path.devnull) + self._webdriver = wd.Firefox(options=o, service=s) + elif webdriver_type == 'chrome': # chromedriver + if not check_executable('chromedriver', ['--version']): + self._raise_exception('chromedriver not found in PATH') + o = wd.ChromeOptions() + o.headless = True + """ + If you are using the snap version of the chromium, chromedriver is included in the snap package. 
+ You should use that driver. + $ cd /snap/bin && sudo ln -s -T chromium.chromedriver chromedriver + or + s = wd.chrome.service.Service(executable_path='chromium.chromedriver') + self._webdriver = wd.Chrome(options=o, service=s) + """ + self._webdriver = wd.Chrome(options=o) + elif webdriver_type == 'edge': # msedgedriver + if not check_executable('msedgedriver', ['--version']): + self._raise_exception('msedgedriver not found in PATH') + o = wd.EdgeOptions() + o.headless = True + self._webdriver = wd.Edge(options=o) + elif webdriver_type == 'safari': # safaridriver + if not check_executable('safaridriver', ['--version']): + self._raise_exception('safaridriver not found in PATH') + """ + safaridriver does not have headless-mode. :( + But macOS includes safaridriver by default. + To enable automation on safaridriver, run the following command once from the admin terminal. + # safaridriver --enable + """ + self._webdriver = wd.Safari() + else: + self._raise_exception('unsupported type: %s' % (webdriver_type)) + self._webdriver.set_page_load_timeout(pageload_timeout) + self._webdriver.set_script_timeout(script_timeout) + + def __del__(self): + if self._webdriver is not None: + self._webdriver.quit() + + def _raise_exception(self, msg, cause=None): + raise ExtractorError('[WebDriverJSWrapper] %s' % (msg), cause=cause) + + def get(self, url): + """Loads a web page in the current browser session""" + self._webdriver.get(url) + + def executeJS(self, jscode): + """Execute JS and return value""" + try: + ret = self._webdriver.execute_script(jscode) + except Exception as e: + self._raise_exception('Failed to execute JS', cause=e) + return ret From e7430aaeec37909bbba94801e0ec5acff4f4156b Mon Sep 17 00:00:00 2001 From: u-n-k-n-o-w-n <7951720+u-n-k-n-o-w-n@users.noreply.github.com> Date: Sat, 3 Sep 2022 12:21:32 +0900 Subject: [PATCH 4/5] help messages are now clearer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jouni 
Järvinen --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 27dcc4807..325e1fdbf 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -578,7 +578,7 @@ def parseOpts(overrideArguments=None): 'along with --min-sleep-interval.')) workarounds.add_option( '--webdriver', metavar='TYPE', dest='webdriver', default=None, - help='Specify webdriver type when you want to use selenium webdriver to execute "n_function" : "firefox", "chrome", "edge", or "safari"') + help='Specify webdriver type when you want to use Selenium to execute YouTube\'s "n_function" in order to avoid throttling: "firefox", "chrome", "edge", or "safari"') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( From 5ebdc22d19cd00629bf1d3afed26901049d2167d Mon Sep 17 00:00:00 2001 From: unknown <7951720+u-n-k-n-o-w-n@users.noreply.github.com> Date: Tue, 22 Apr 2025 19:34:41 +0900 Subject: [PATCH 5/5] recent selenium does not support headless property --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5d8250f3e..5eb3c9ab4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -4305,14 +4305,13 @@ class WebDriverJSWrapper(object): if not check_executable('geckodriver', ['--version']): self._raise_exception('geckodriver not found in PATH') o = wd.FirefoxOptions() - o.headless = True - s = wd.firefox.service.Service(log_path=os.path.devnull) - self._webdriver = wd.Firefox(options=o, service=s) + o.add_argument('-headless') + self._webdriver = wd.Firefox(options=o) elif webdriver_type == 'chrome': # chromedriver if not check_executable('chromedriver', ['--version']): self._raise_exception('chromedriver not found in PATH') o = wd.ChromeOptions() - o.headless = True + o.add_argument('--headless') """ If you 
are using the snap version of the chromium, chromedriver is included in the snap package. You should use that driver. @@ -4326,7 +4325,7 @@ class WebDriverJSWrapper(object): if not check_executable('msedgedriver', ['--version']): self._raise_exception('msedgedriver not found in PATH') o = wd.EdgeOptions() - o.headless = True + o.add_argument('--headless') self._webdriver = wd.Edge(options=o) elif webdriver_type == 'safari': # safaridriver if not check_executable('safaridriver', ['--version']):