From 0326bcb6c1a45067a804d2f32aa854a3fcd9b4ce Mon Sep 17 00:00:00 2001 From: Chuck Cho Date: Thu, 15 Aug 2019 15:14:47 -0700 Subject: [PATCH 0001/1705] [piksel] add subtitle capability (#20506) --- youtube_dl/extractor/piksel.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index c0c276a50..401298cb8 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -18,15 +18,14 @@ class PikselIE(InfoExtractor): _VALID_URL = r'https?://player\.piksel\.com/v/(?P[a-z0-9]+)' _TESTS = [ { - 'url': 'http://player.piksel.com/v/nv60p12f', - 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'url': 'http://player.piksel.com/v/ums2867l', + 'md5': '34e34c8d89dc2559976a6079db531e85', 'info_dict': { - 'id': 'nv60p12f', + 'id': 'ums2867l', 'ext': 'mp4', - 'title': 'فن الحياة - الحلقة 1', - 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', - 'timestamp': 1465231790, - 'upload_date': '20160606', + 'title': 'GX-005 with Caption', + 'timestamp': 1481335659, + 'upload_date': '20161210' } }, { @@ -39,7 +38,7 @@ class PikselIE(InfoExtractor): 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
Robart presiding.', 'timestamp': 1486171129, - 'upload_date': '20170204', + 'upload_date': '20170204' } } ] @@ -113,6 +112,13 @@ class PikselIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + if caption_url: + subtitles.setdefault(caption.get('locale', 'en'), []).append({ + 'url': caption_url}) + return { 'id': video_id, 'title': title, @@ -120,4 +126,5 @@ class PikselIE(InfoExtractor): 'thumbnail': video_data.get('thumbnailUrl'), 'timestamp': parse_iso8601(video_data.get('dateadd')), 'formats': formats, + 'subtitles': subtitles, } From 0add33abcb9eb3ac93f7af312940b033b4ae4168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 16 Aug 2019 23:36:23 +0700 Subject: [PATCH 0002/1705] [youtube] Improve unavailable message extraction (refs #22117) --- youtube_dl/extractor/youtube.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b63f19bb0..57d76a5a2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1809,10 +1809,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break def extract_unavailable_message(): - return self._html_search_regex( - (r'(?s)]+id=["\']unavailable-submessage["\'][^>]+>(.+?)]+id=["\']unavailable-message["\'][^>]*>(.+?)'), - video_webpage, 'unavailable message', default=None) + messages = [] + for tag, kind in (('h1', 'message'), ('div', 'submessage')): + msg = self._html_search_regex( + r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), + video_webpage, 'unavailable %s' % kind, default=None) + if msg: + messages.append(msg) + if messages: + return '\n'.join(messages) if not video_info: unavailable_message = extract_unavailable_message() From 393cc31d5eba52018b3de4dd76361d79cb1b5f49 Mon Sep 17 00:00:00 2001 From: supritkumar Date: Tue, 20 Aug 2019 22:52:59 
-0400 Subject: [PATCH 0003/1705] [einthusan] Add support for einthusan.ca (#22171) --- youtube_dl/extractor/einthusan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index 1fb00c9b0..4e0f8bc81 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -19,7 +19,7 @@ from ..utils import ( class EinthusanIE(InfoExtractor): - _VALID_URL = r'https?://(?Peinthusan\.(?:tv|com))/movie/watch/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?Peinthusan\.(?:tv|com|ca))/movie/watch/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://einthusan.tv/movie/watch/9097/', 'md5': 'ff0f7f2065031b8a2cf13a933731c035', @@ -36,6 +36,9 @@ class EinthusanIE(InfoExtractor): }, { 'url': 'https://einthusan.com/movie/watch/9097/', 'only_matching': True, + }, { + 'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi', + 'only_matching': True, }] # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js From 183a18c4e7dad802404e932f3a7c33fad8db7891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Aug 2019 03:35:09 +0700 Subject: [PATCH 0004/1705] [usanetwork] Fix extraction (closes #22105) --- youtube_dl/extractor/usanetwork.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py index 823340776..54c7495cc 100644 --- a/youtube_dl/extractor/usanetwork.py +++ b/youtube_dl/extractor/usanetwork.py @@ -1,11 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .adobepass import AdobePassIE from ..utils import ( - extract_attributes, + NO_DEFAULT, smuggle_url, update_url_query, ) @@ -31,22 +29,22 @@ class USANetworkIE(AdobePassIE): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_params = extract_attributes(self._search_regex( - 
r'(]+data-usa-tve-player-container[^>]*>)', webpage, 'player params')) - video_id = player_params['data-mpx-guid'] - title = player_params['data-episode-title'] + def _x(name, default=NO_DEFAULT): + return self._search_regex( + r'data-%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' % name, + webpage, name, default=default, group='value') - account_pid, path = re.search( - r'data-src="(?:https?)?//player\.theplatform\.com/p/([^/]+)/.*?/(media/guid/\d+/\d+)', - webpage).groups() + video_id = _x('mpx-guid') + title = _x('episode-title') + mpx_account_id = _x('mpx-account-id', '2304992029') query = { 'mbr': 'true', } - if player_params.get('data-is-full-episode') == '1': + if _x('is-full-episode', None) == '1': query['manifest'] = 'm3u' - if player_params.get('data-entitlement') == 'auth': + if _x('is-entitlement', None) == '1': adobe_pass = {} drupal_settings = self._search_regex( r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', @@ -57,7 +55,7 @@ class USANetworkIE(AdobePassIE): adobe_pass = drupal_settings.get('adobePass', {}) resource = self._get_mvpd_resource( adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, player_params.get('data-episode-rating', 'TV-14')) + title, video_id, _x('episode-rating', 'TV-14')) query['auth'] = self._extract_mvpd_auth( url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) @@ -65,11 +63,11 @@ class USANetworkIE(AdobePassIE): info.update({ '_type': 'url_transparent', 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/%s/%s' % (account_pid, path), + 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), query), {'force_smil_url': True}), 'id': video_id, 'title': title, - 'series': player_params.get('data-show-title'), + 'series': _x('show-title', None), 'episode': title, 'ie_key': 'ThePlatform', }) From d1fcf255c5402d75a3f7b450bd1e795196d5817a Mon Sep 17 00:00:00 2001 From: phan-ctrl <54398886+phan-ctrl@users.noreply.github.com> Date: Tue, 27 Aug 2019 10:16:04 
+0700 Subject: [PATCH 0005/1705] [safari] Fix authentication (closes #22161) (#22184) --- youtube_dl/extractor/safari.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 8d4806794..bd9ee1647 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -68,9 +68,10 @@ class SafariBaseIE(InfoExtractor): raise ExtractorError( 'Unable to login: %s' % credentials, expected=True) - # oreilly serves two same groot_sessionid cookies in Set-Cookie header - # and expects first one to be actually set - self._apply_first_set_cookie_header(urlh, 'groot_sessionid') + # oreilly serves two same instances of the following cookies + # in Set-Cookie header and expects first one to be actually set + for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): + self._apply_first_set_cookie_header(urlh, cookie) _, urlh = self._download_webpage_handle( auth.get('redirect_uri') or next_uri, None, 'Completing login',) From 494d664e679c5b0f85e3c899579e7eb8a1cc8246 Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Wed, 28 Aug 2019 01:39:59 +0700 Subject: [PATCH 0006/1705] [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223) --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 57d76a5a2..25d056b3c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -387,8 +387,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?invidious\.enkirton\.net/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.nixnet\.xyz/| (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yt\.elukerio\.org/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls (?: # the various things that can precede the ID: From b72305f07892daa287bd51b70dd8eeeed627e7d6 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 26 Aug 2019 23:16:18 +0800 Subject: [PATCH 0007/1705] [bbccouk] Extend _VALID_URL (closes #19200) --- youtube_dl/extractor/bbc.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e76507951..3f820eed2 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -40,6 +40,7 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| + sounds/play/| events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) @@ -220,6 +221,20 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + }, { + 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', + 'note': 'Audio', + 'info_dict': { + 'id': 'm0007jz9', + 'ext': 'mp4', + 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', + 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", + 'duration': 9840, + }, + 'params': { + # rtmp download + 'skip_download': True, + } }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, From acc86c9a978a916fed1d1ed9ecf201d2c1a3060c Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 26 Aug 2019 23:04:38 +0800 Subject: [PATCH 0008/1705] [bbc] Fix some tests --- youtube_dl/extractor/bbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 3f820eed2..901c5a54f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -71,7 +71,7 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', + 'title': 'Kaleidoscope, Leonard Cohen', 'description': 'The Canadian poet and songwriter reflects on his musical career.', }, 'params': { @@ -624,7 +624,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'title': 'Russia stages massive WW2 parade', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, From b500955a58efb019133b3214ea81635342728f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 28 Aug 2019 01:58:07 +0700 Subject: [PATCH 0009/1705] [openload] Add support for oload.vip (closes #22205) --- youtube_dl/extractor/openload.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 030355257..ab4980d4d 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,7 +243,12 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _DOMAINS = 
r'(?:openload\.(?:co|io|link|pw)|oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website)|oladblock\.(?:services|xyz|me)|openloed\.co)' + _DOMAINS = r'''(?x) + (?: + openload\.(?:co|io|link|pw)| + oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| + oladblock\.(?:services|xyz|me)|openloed\.co) + ''' _VALID_URL = r'''(?x) https?:// (?P @@ -383,6 +388,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://openloed.co/f/b8NWEgkqNLI/', 'only_matching': True, + }, { + 'url': 'https://oload.vip/f/kUEfGclsU9o', + 'only_matching': True, }] @classmethod From 9d058b3206e431f19c955f9df3eaad34e7446f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 29 Aug 2019 23:08:19 +0700 Subject: [PATCH 0010/1705] [dailymotion] Add support for lequipe.fr (closes #21328, closes #22152) --- youtube_dl/extractor/dailymotion.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3d3d78041..745971900 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -48,7 +48,14 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P[^/?_]+)' + _VALID_URL = r'''(?ix) + https?:// + (?: + (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:www\.)?lequipe\.fr/video + ) + /(?P[^/?_]+) + ''' IE_NAME = 'dailymotion' _FORMATS = [ @@ -133,6 +140,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', 'only_matching': True, + }, { + 'url': 'https://www.lequipe.fr/video/x791mem', + 'only_matching': True, + }, { + 'url': 
'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2', + 'only_matching': True, }] @staticmethod From 3f46a25a971b6c1e56747cc4813c1ccdfb18aaa2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 31 Aug 2019 10:02:09 +0100 Subject: [PATCH 0011/1705] [verystream] add support for woof.tube (closes #22217) --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index ab4980d4d..b638450af 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -462,7 +462,7 @@ class OpenloadIE(InfoExtractor): class VerystreamIE(OpenloadIE): IE_NAME = 'verystream' - _DOMAINS = r'(?:verystream\.com)' + _DOMAINS = r'(?:verystream\.com|woof\.tube)' _VALID_URL = r'''(?x) https?:// (?P From 71f47617c8845900c1a3c2da79f9f7654199fedb Mon Sep 17 00:00:00 2001 From: telephono Date: Sat, 31 Aug 2019 19:24:43 +0200 Subject: [PATCH 0012/1705] [downloader/external] Respect mtime option for aria2c (#22242) --- youtube_dl/downloader/external.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index acdb27712..c31f8910a 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -194,6 +194,7 @@ class Aria2cFD(ExternalFD): cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') + cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') cmd += ['--', info_dict['url']] return cmd From cc73d5ad15aed96f6462b8079ccb6716c2ef9f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 01:18:25 +0700 Subject: [PATCH 0013/1705] [openload] Fix domains regex --- youtube_dl/extractor/openload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b638450af..679eaf6c3 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,12 +243,13 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _DOMAINS = r'''(?x) + _DOMAINS = r''' (?: openload\.(?:co|io|link|pw)| oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| - oladblock\.(?:services|xyz|me)|openloed\.co) - ''' + oladblock\.(?:services|xyz|me)|openloed\.co + ) + ''' _VALID_URL = r'''(?x) https?:// (?P @@ -396,7 +397,7 @@ class OpenloadIE(InfoExtractor): @classmethod def _extract_urls(cls, webpage): return re.findall( - r']+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' + r'(?x)]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' % (cls._DOMAINS, cls._EMBED_WORD), webpage) def _extract_decrypted_page(self, page_url, webpage, video_id): From d78657fd18ae6413239137298eee4c54f3efee32 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 22 Jul 2019 14:09:21 -0700 Subject: [PATCH 0014/1705] [extractor/generic] Add support for squarespace embeds (closes #21294) --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d34fc4b15..7dd2e2d5f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2075,6 +2075,17 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + }, + 'params': { + 'skip_download': True, + }, + }, { # Zype embed 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2395,6 +2406,13 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = 
compat_urllib_parse_unquote(webpage) + # unescape re.sub replacement + def unescape_resub(m): + return unescapeHTML(m.group(0)) + + # unescape squarespace video embeds + webpage = re.sub(r']+class=[^>]*?sqs-video-wrapper[^>]*>', unescape_resub, webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name From 7cb51b5daf07a6a627a9084394636c570194cc4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 01:23:58 +0700 Subject: [PATCH 0015/1705] [extractor/generic] Improve squarespace detection and fix test (closes #21859, refs #21294, refs #21802) --- youtube_dl/extractor/generic.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7dd2e2d5f..d1725d98b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2081,6 +2081,11 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'Tc7b_JGdZfw', 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', }, 'params': { 'skip_download': True, @@ -2406,12 +2411,11 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) - # unescape re.sub replacement - def unescape_resub(m): - return unescapeHTML(m.group(0)) - - # unescape squarespace video embeds - webpage = re.sub(r']+class=[^>]*?sqs-video-wrapper[^>]*>', unescape_resub, webpage) + # Unescape squarespace embeds to be detected by generic extractor, + # see https://github.com/ytdl-org/youtube-dl/issues/21294 + webpage = re.sub( + r']+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', + lambda x: unescapeHTML(x.group(0)), webpage) # it's tempting to parse this further, 
but you would # have to take into account all the variations like From 8945b10f6e10337db0c9bf7a70758c8ecbb6c830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 02:59:40 +0700 Subject: [PATCH 0016/1705] [xhamster] Add support for more domains --- youtube_dl/extractor/xhamster.py | 35 ++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d268372e6..4297dffee 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -18,21 +18,21 @@ from ..utils import ( class XHamsterIE(InfoExtractor): + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster[27]\.com)' _VALID_URL = r'''(?x) https?:// - (?:.+?\.)?xhamster\.(?:com|one)/ + (?:.+?\.)?%s/ (?: movies/(?P\d+)/(?P[^/]*)\.html| videos/(?P[^/]*)-(?P\d+) ) - ''' - + ''' % _DOMAINS _TESTS = [{ - 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'md5': '8281348b8d3c53d39fffb377d24eac4e', + 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'md5': '98b4687efb1ffd331c4197854dc09e8f', 'info_dict': { 'id': '1509445', - 'display_id': 'femaleagent_shy_beauty_takes_the_bait', + 'display_id': 'femaleagent-shy-beauty-takes-the-bait', 'ext': 'mp4', 'title': 'FemaleAgent Shy beauty takes the bait', 'timestamp': 1350194821, @@ -40,13 +40,12 @@ class XHamsterIE(InfoExtractor): 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, - 'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Beauti', 'Beauties', 'Beautiful', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy', 'Taking'], }, }, { - 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'url': 'https://xhamster.com/videos/britney-spears-sexy-booty-2221348?hd=', 'info_dict': { 'id': '2221348', - 'display_id': 'britney_spears_sexy_booty', + 'display_id': 'britney-spears-sexy-booty', 'ext': 'mp4', 
'title': 'Britney Spears Sexy Booty', 'timestamp': 1379123460, @@ -54,13 +53,12 @@ class XHamsterIE(InfoExtractor): 'uploader': 'jojo747400', 'duration': 200, 'age_limit': 18, - 'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'], }, 'params': { 'skip_download': True, }, }, { - # empty seo + # empty seo, unavailable via new URL schema 'url': 'http://xhamster.com/movies/5667973/.html', 'info_dict': { 'id': '5667973', @@ -71,7 +69,6 @@ class XHamsterIE(InfoExtractor): 'uploader': 'parejafree', 'duration': 72, 'age_limit': 18, - 'categories': ['Amateur', 'Blowjobs'], }, 'params': { 'skip_download': True, @@ -94,6 +91,18 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445', 'only_matching': True, + }, { + 'url': 'https://xhamster.desi/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'only_matching': True, + }, { + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'only_matching': True, }] def _real_extract(self, url): @@ -285,7 +294,7 @@ class XHamsterIE(InfoExtractor): class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' + _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P\d+)' % XHamsterIE._DOMAINS _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { From df228355fd752400ada21d9e202c96932b3ac6e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 03:12:56 +0700 Subject: [PATCH 0017/1705] [xhamster:user] Add extractor (closes #16330, closes #18454) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/xhamster.py | 48 ++++++++++++++++++++++++++++++ 2 
files changed, 49 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06de556b7..4adcae1e5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1425,6 +1425,7 @@ from .xfileshare import XFileShareIE from .xhamster import ( XHamsterIE, XHamsterEmbedIE, + XHamsterUserIE, ) from .xiami import ( XiamiSongIE, diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 4297dffee..a5b94d279 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -8,6 +9,7 @@ from ..utils import ( clean_html, determine_ext, dict_get, + extract_attributes, ExtractorError, int_or_none, parse_duration, @@ -331,3 +333,49 @@ class XHamsterEmbedIE(InfoExtractor): video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') + + +class XHamsterUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)' % XHamsterIE._DOMAINS + _TESTS = [{ + # Paginated user profile + 'url': 'https://xhamster.com/users/netvideogirls/videos', + 'info_dict': { + 'id': 'netvideogirls', + }, + 'playlist_mincount': 267, + }, { + # Non-paginated user profile + 'url': 'https://xhamster.com/users/firatkaan/videos', + 'info_dict': { + 'id': 'firatkaan', + }, + 'playlist_mincount': 1, + }] + + def _entries(self, user_id): + next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id + for pagenum in itertools.count(1): + page = self._download_webpage( + next_page_url, user_id, 'Downloading page %s' % pagenum) + for video_tag in re.findall( + r'(]+class=["\'].*?\bvideo-thumb__image-container[^>]+>)', + page): + video = extract_attributes(video_tag) + video_url = url_or_none(video.get('href')) + if not video_url or not XHamsterIE.suitable(video_url): + continue 
+ video_id = XHamsterIE._match_id(video_url) + yield self.url_result( + video_url, ie=XHamsterIE.ie_key(), video_id=video_id) + mobj = re.search(r']+data-page=["\']next[^>]+>', page) + if not mobj: + break + next_page = extract_attributes(mobj.group(0)) + next_page_url = url_or_none(next_page.get('href')) + if not next_page_url: + break + + def _real_extract(self, url): + user_id = self._match_id(url) + return self.playlist_result(self._entries(user_id), user_id) From 79dd8884bb83ae7244f11c928fd9af6abf1f97f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 03:18:35 +0700 Subject: [PATCH 0018/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9b9e2e149..d4d8fc980 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version + +Core ++ [extractor/generic] Add support for squarespace embeds (#21294, #21802, + #21859) ++ [downloader/external] Respect mtime option for aria2c (#22242) + +Extractors ++ [xhamster:user] Add support for user pages (#16330, #18454) ++ [xhamster] Add support for more domains ++ [verystream] Add support for woof.tube (#22217) ++ [dailymotion] Add support for lequipe.fr (#21328, #22152) ++ [openload] Add support for oload.vip (#22205) ++ [bbccouk] Extend URL regular expression (#19200) ++ [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223) +* [safari] Fix authentication (#22161, #22184) +* [usanetwork] Fix extraction (#22105) ++ [einthusan] Add support for einthusan.ca (#22171) +* [youtube] Improve unavailable message extraction (#22117) ++ [piksel] Extract subtitles (#20506) + + version 2019.08.13 Core From f620d0d860c34cd34e421247ed637a9d03f730ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 03:33:02 +0700 Subject: [PATCH 0019/1705] release 2019.09.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- 
.github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 4c75c8d5d..52c5c1c32 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.08.13** +- [ ] I've verified that I'm running youtube-dl version **2019.09.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.08.13 + [debug] youtube-dl version 2019.09.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8e8c43c47..000d2f55a 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.08.13** +- [ ] I've 
verified that I'm running youtube-dl version **2019.09.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index df719a29c..77b710606 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.08.13** +- [ ] I've verified that I'm running youtube-dl version **2019.09.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 3616db1a7..ae112d965 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.08.13** +- [ ] I've verified that I'm running youtube-dl version **2019.09.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.08.13 + [debug] youtube-dl version 2019.09.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: 
ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 0fa37aef1..fccfdf71a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.08.13** +- [ ] I've verified that I'm running youtube-dl version **2019.09.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index d4d8fc980..e91e49854 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.09.01 Core + [extractor/generic] Add support for squarespace embeds (#21294, #21802, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7cf60eefe..18bddc138 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1100,6 +1100,7 @@ - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me - **XHamster** - **XHamsterEmbed** + - **XHamsterUser** - **xiami:album**: 虾米音乐 - 专辑 - **xiami:artist**: 虾米音乐 - 歌手 - **xiami:collection**: 虾米音乐 - 精选集 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b53a08cae..98fa32286 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.08.13' +__version__ = '2019.09.01' From d7da1e37c72d931f7bdce9121a38e85bdadb9dc1 Mon Sep 17 00:00:00 2001 From: Patrick Dessalle Date: Sun, 1 Sep 2019 19:59:57 +0200 Subject: [PATCH 0020/1705] [nickjr] Add support for nickelodeonjunior.fr (#22246) --- youtube_dl/extractor/nick.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py 
b/youtube_dl/extractor/nick.py index 5e34d776b..2e8b302ac 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,8 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.[a-z]{2} + (?:www\.)?nickjr\.[a-z]{2}| + (?:www\.)?nickelodeonjunior\.fr ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) ''' @@ -101,6 +102,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeonjunior.fr/paw-patrol-la-pat-patrouille/videos/episode-401-entier-paw-patrol/', + 'only_matching': True, }] def _real_extract(self, url): From 66d04c74e097c03e4d644d7292546884cbee3d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Sep 2019 01:23:22 +0700 Subject: [PATCH 0021/1705] [platzi:course] Add support for authentication --- youtube_dl/extractor/platzi.py | 73 ++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index 557b2b5ad..cd6b966c5 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -18,43 +18,10 @@ from ..utils import ( ) -class PlatziIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - platzi\.com/clases| # es version - courses\.platzi\.com/classes # en version - )/[^/]+/(?P\d+)-[^/?\#&]+ - ''' +class PlatziBaseIE(InfoExtractor): _LOGIN_URL = 'https://platzi.com/login/' _NETRC_MACHINE = 'platzi' - _TESTS = [{ - 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', - 'md5': '8f56448241005b561c10f11a595b37e3', - 'info_dict': { - 'id': '12074', - 'ext': 'mp4', - 'title': 'Creando nuestra primera página', - 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', - 'duration': 420, - }, - 'skip': 'Requires 
platzi account credentials', - }, { - 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', - 'info_dict': { - 'id': '13430', - 'ext': 'mp4', - 'title': 'Background', - 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', - 'duration': 360, - }, - 'skip': 'Requires platzi account credentials', - 'params': { - 'skip_download': True, - }, - }] - def _real_initialize(self): self._login() @@ -97,6 +64,42 @@ class PlatziIE(InfoExtractor): 'Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') + +class PlatziIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P\d+)-[^/?\#&]+ + ''' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + def _real_extract(self, url): lecture_id = self._match_id(url) @@ -146,7 +149,7 @@ class PlatziIE(InfoExtractor): } -class PlatziCourseIE(InfoExtractor): +class PlatziCourseIE(PlatziBaseIE): _VALID_URL = r'''(?x) https?:// (?: From 31dbd054c801ec14c1ea29a2167b70c980f1d782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 Sep 2019 01:24:20 +0700 Subject: [PATCH 0022/1705] [platzi] Improve client data extraction (closes #22290) --- youtube_dl/extractor/platzi.py | 6 +++++- 1 file 
changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index cd6b966c5..602207beb 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -107,7 +107,11 @@ class PlatziIE(PlatziBaseIE): data = self._parse_json( self._search_regex( - r'client_data\s*=\s*({.+?})\s*;', webpage, 'client data'), + # client_data may contain "};" so that we have to try more + # strict regex first + (r'client_data\s*=\s*({.+?})\s*;\s*\n', + r'client_data\s*=\s*({.+?})\s*;'), + webpage, 'client data'), lecture_id) material = data['initialState']['material'] From bff90fc518d6ccadaafc26407a688dc1bbd32dff Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Tue, 3 Sep 2019 01:35:32 +0700 Subject: [PATCH 0023/1705] [youtube] Add support for invidious tor instances (#22268) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 25d056b3c..abafd5157 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -391,6 +391,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?kgg2m7yk5aybusll\.onion/| + (?:www\.)?qklhadlycap4cnod\.onion/| + (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| + (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| + (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| + (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls (?: # the various things that can precede the ID: From bf1317d257d13188601c837c983830355c6203e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 11 Sep 2019 22:44:47 +0700 Subject: [PATCH 0024/1705] [youtube] Quick extraction tempfix (closes #22367, closes #22163) --- youtube_dl/extractor/youtube.py | 184 ++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index abafd5157..9d0058b2a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1915,6 +1915,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return int_or_none(self._search_regex( r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] + streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1923,10 +1926,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): + elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats = [] formats_spec = {} fmt_list = video_info.get('fmt_list', [''])[0] if fmt_list: @@ -1941,90 +1945,105 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): 'height': int_or_none(width_height[1]), } q = qualities(['small', 'medium', 'hd720']) - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) - if streaming_formats: - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'): + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } + + for fmt in streaming_formats: + if fmt.get('drm_families'): + continue + url = url_or_none(fmt.get('url')) + + if not url: + cipher = fmt.get('cipher') + if not cipher: + continue + url_data = compat_parse_qs(cipher) + url = url_or_none(try_get(url_data, lambda x: 
x['url'][0], compat_str)) + if not url: + continue + else: + cipher = None + url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) # Unsupported FORMAT_STREAM_TYPE_OTF if stream_type == 3: continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') + format_id = fmt.get('itag') or url_data['itag'][0] + if not format_id: + continue + format_id = compat_str(format_id) + + if cipher: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + jsplayer_url_json = self._search_regex( + ASSETS_RE, embed_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += 
'&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): + player_url = json.loads(jsplayer_url_json) if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, 'age gate player URL') + player_url = json.loads(player_url_json) + + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + + if self._downloader.params.get('verbose'): + if player_url is None: + player_version = 'unknown' + player_desc = 'unknown' else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version + if player_url.endswith('swf'): + player_version = self._search_regex( + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + player_url, + 'html5 player', fatal=False) + player_desc = 'html5 player %s' % player_version - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) + parts_sizes = self._signature_cache_id(encrypted_sig) + self.to_screen('{%s} signature length %s, %s' % + (format_id, parts_sizes, player_desc)) - signature = self._decrypt_signature( - encrypted_sig, 
video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -2044,24 +2063,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + if width is None: + width = int_or_none(fmt.get('width')) + if height is None: + height = int_or_none(fmt.get('height')) + filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] or fmt.get('quality') + quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') + + tbr = float_or_none(url_data.get('bitrate', [None])[0], 1000) or float_or_none(fmt.get('bitrate'), 1000) + fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) more_fields = { 'filesize': filesize, - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), + 'tbr': tbr, 'width': width, 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or quality, + 'fps': fps, + 'format_note': quality_label or quality, 'quality': q(quality), } for key, value in more_fields.items(): if value: dct[key] = value - type_ = url_data.get('type', [None])[0] + type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') From 035c7a59e8eeb8725740b91f7cdd561e2d288a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 01:18:25 +0700 Subject: 
[PATCH 0025/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index e91e49854..c9e74e86d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [youtube] Quick extraction tempfix (#22367, #22163) + + version 2019.09.01 Core From bd10b229c0311c668a65ce32ed99836237d9896b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 01:21:21 +0700 Subject: [PATCH 0026/1705] release 2019.09.12 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- youtube_dl/version.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 52c5c1c32..01f5095ed 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.01 + [debug] youtube-dl version 2019.09.12 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe 
versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 000d2f55a..88c8b5cc6 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 77b710606..d99c7e32d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ae112d965..346139f3e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly 
quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.01 + [debug] youtube-dl version 2019.09.12 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index fccfdf71a..cce5bb037 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.01** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 98fa32286..212b5a7bc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.01' +__version__ = '2019.09.12' From 303d3e142cfface18e02fda4087661e3bbab9343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 02:05:54 +0700 Subject: [PATCH 0027/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index c9e74e86d..d0283cacb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.09.12 Extractors * [youtube] Quick extraction tempfix (#22367, #22163) From 4878759f3bcdbf0ed80ff9fe9aa9f682772d1a0a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 02:45:30 +0700 Subject: [PATCH 0028/1705] [youtube] Remove quality and tbr for itag 43 (closes #22372) --- youtube_dl/extractor/youtube.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9d0058b2a..f002d870e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,7 +41,6 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, - qualities, remove_quotes, remove_start, smuggle_url, @@ -1944,7 +1943,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } - q = qualities(['small', 'medium', 'hd720']) for fmt in streaming_formats: itag = str_or_none(fmt.get('itag')) if not itag: @@ -1957,7 +1955,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format_note': quality_label, 'fps': int_or_none(fmt.get('fps')), 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), # bitrate for itag 43 is always 2147483647 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, 'width': int_or_none(fmt.get('width')), @@ -2074,7 +2071,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): quality = url_data.get('quality', [None])[0] or fmt.get('quality') quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - tbr = float_or_none(url_data.get('bitrate', [None])[0], 1000) or float_or_none(fmt.get('bitrate'), 1000) + tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) + or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) more_fields = { @@ -2084,7 +2082,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'height': height, 'fps': fps, 'format_note': quality_label or quality, - 'quality': q(quality), } for key, value in more_fields.items(): 
if value: From 2f851a7d7dfbf2aabd78bd4863e2c6a33e3429bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 02:48:07 +0700 Subject: [PATCH 0029/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index d0283cacb..ba5f37c73 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [youtube] Remove quality and tbr for itag 43 (#22372) + + version 2019.09.12 Extractors From e1f692f0b39ecec180bb291d0ae5ee4a9289402e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 12 Sep 2019 02:53:52 +0700 Subject: [PATCH 0030/1705] release 2019.09.12.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 01f5095ed..0c24155e6 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.12** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale 
cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.12 + [debug] youtube-dl version 2019.09.12.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 88c8b5cc6..9babe0360 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d99c7e32d..72322fe26 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 346139f3e..da7f2cf93 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.12** +- [ ] I've verified that I'm 
running youtube-dl version **2019.09.12.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.12 + [debug] youtube-dl version 2019.09.12.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cce5bb037..d41022b9f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12** +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index ba5f37c73..2aba02065 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.09.12.1 Extractors * [youtube] Remove quality and tbr for itag 43 (#22372) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 212b5a7bc..df82cdf0f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.12' +__version__ = '2019.09.12.1' From 20e11b70ac48fd5bffae194d829f7e31fcc65fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 18 Sep 2019 23:45:26 
+0700 Subject: [PATCH 0031/1705] [tv4] Fix extraction and extract series metadata (closes #22443) --- youtube_dl/extractor/tv4.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 51923e44a..a819d048c 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -72,8 +72,13 @@ class TV4IE(InfoExtractor): video_id = self._match_id(url) info = self._download_json( - 'http://www.tv4play.se/player/assets/%s.json' % video_id, - video_id, 'Downloading video info JSON') + 'https://playback-api.b17g.net/asset/%s' % video_id, + video_id, 'Downloading video info JSON', query={ + 'service': 'tv4', + 'device': 'browser', + 'protocol': 'hls,dash', + 'drm': 'widevine', + })['metadata'] title = info['title'] @@ -111,5 +116,9 @@ class TV4IE(InfoExtractor): 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': int_or_none(info.get('duration')), 'thumbnail': info.get('image'), - 'is_live': info.get('is_live') is True, + 'is_live': info.get('isLive') is True, + 'series': info.get('seriesTitle'), + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode': info.get('episodeTitle'), + 'episode_number': int_or_none(info.get('episodeNumber')), } From 9cf26b6e1d0658eb0b252872ef011d765b8341a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 19 Sep 2019 01:11:52 +0700 Subject: [PATCH 0032/1705] [zdf] Bypass geo restriction --- youtube_dl/extractor/zdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index afa3f6c47..145c123a4 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -41,6 +41,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _GEO_COUNTRIES = ['DE'] _TESTS = [{ 'url': 
'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', From 6fd26a7d4a851b260c56974d77911804c09ff816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 19 Sep 2019 02:31:39 +0700 Subject: [PATCH 0033/1705] [9now] Fix extraction (closes #22361) --- youtube_dl/extractor/ninenow.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index f32f530f7..6157dc7c1 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -45,7 +45,11 @@ class NineNowIE(InfoExtractor): webpage = self._download_webpage(url, display_id) page_data = self._parse_json(self._search_regex( r'window\.__data\s*=\s*({.*?});', webpage, - 'page data'), display_id) + 'page data', default='{}'), display_id, fatal=False) + if not page_data: + page_data = self._parse_json(self._parse_json(self._search_regex( + r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;', + webpage, 'page data'), display_id), display_id) for kind in ('episode', 'clip'): current_key = page_data.get(kind, {}).get( From 1cb812d3c257b3f6d3c07ccabcbdebd0190fbec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 19 Sep 2019 03:00:19 +0700 Subject: [PATCH 0034/1705] [hotstar] Extract more formats (closes #22323) --- youtube_dl/extractor/hotstar.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 79d5bbb2e..c7545a1e6 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import hashlib import hmac +import re import time import uuid @@ -126,6 +127,8 @@ class HotStarIE(HotStarBaseIE): format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue + format_url = re.sub( + r'(?<=//staragvod)(\d)', r'web\1', format_url) tags = 
str_or_none(playback_set.get('tagsCombination')) or '' if tags and 'encryption:plain' not in tags: continue From d9d3098675daf219e0e0f910fbd704beb1775ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 19 Sep 2019 03:02:15 +0700 Subject: [PATCH 0035/1705] [hotstar] Use native HLS downloader by default --- youtube_dl/extractor/hotstar.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index c7545a1e6..f9f7c5a64 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -136,7 +136,8 @@ class HotStarIE(HotStarBaseIE): try: if 'package:hls' in tags or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls')) + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) elif 'package:dash' in tags or ext == 'mpd': formats.extend(self._extract_mpd_formats( format_url, video_id, mpd_id='dash')) From f455a934e9a44aaebdf925b5d78ffafa39e14c97 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 19 Sep 2019 18:02:26 +0100 Subject: [PATCH 0036/1705] [brightcove] delegate all supported BrightcoveLegacyIE URLs to BrightcoveNewIE closes #11523 closes #12842 closes #13912 closes #15669 closes #16303 --- youtube_dl/extractor/brightcove.py | 196 ++++------------------------- 1 file changed, 25 insertions(+), 171 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 58ec5c979..8e2f7217a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import base64 -import json import re import struct @@ -11,14 +10,12 @@ from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, - compat_str, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, compat_HTTPError, ) from ..utils import ( - 
determine_ext, ExtractorError, extract_attributes, find_xpath_attr, @@ -27,18 +24,19 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, + smuggle_url, unescapeHTML, unsmuggle_url, update_url_query, clean_html, mimetype2ext, + UnsupportedError, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' - _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -55,7 +53,8 @@ class BrightcoveLegacyIE(InfoExtractor): 'timestamp': 1368213670, 'upload_date': '20130510', 'uploader_id': '1589608506001', - } + }, + 'skip': 'The player has been deactivated by the content owner', }, { # From http://medianetwork.oracle.com/video/player/1785452137001 @@ -70,6 +69,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'upload_date': '20120814', 'uploader_id': '1460825906', }, + 'skip': 'video not playable', }, { # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ @@ -79,7 +79,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'ext': 'mp4', 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', - 'uploader': 'Mashable', + # 'uploader': 'Mashable', 'timestamp': 1382041798, 'upload_date': '20131017', 'uploader_id': '1130468786001', @@ -124,6 +124,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'id': '3550319591001', }, 'playlist_mincount': 7, + 'skip': 'Unsupported URL', }, { # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) @@ -133,6 +134,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'Lesson 08', }, 'playlist_mincount': 10, + 'skip': 'Unsupported URL', }, { # playerID inferred from bcpid @@ -141,12 +143,6 @@ class BrightcoveLegacyIE(InfoExtractor): 'only_matching': True, # Tested in GenericIE } ] - FLV_VCODECS = { - 1: 'SORENSON', - 2: 'ON2', - 3: 'H264', - 4: 'VP8', - } @classmethod def 
_build_brighcove_url(cls, object_str): @@ -238,7 +234,8 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - return update_url_query(cls._FEDERATED_URL, params) + return update_url_query( + 'http://c.brightcove.com/services/viewer/htmlFederated', params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -297,38 +294,12 @@ class BrightcoveLegacyIE(InfoExtractor): videoPlayer = query.get('@videoPlayer') if videoPlayer: # We set the original url as the default 'Referer' header - referer = smuggled_data.get('Referer', url) + referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) + video_id = videoPlayer[0] if 'playerID' not in query: mobj = re.search(r'/bcpid(\d+)', url) if mobj is not None: query['playerID'] = [mobj.group(1)] - return self._get_video_info( - videoPlayer[0], query, referer=referer) - elif 'playerKey' in query: - player_key = query['playerKey'] - return self._get_playlist_info(player_key[0]) - else: - raise ExtractorError( - 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', - expected=True) - - def _brightcove_new_url_result(self, publisher_id, video_id): - brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) - return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) - - def _get_video_info(self, video_id, query, referer=None): - headers = {} - linkBase = query.get('linkBaseURL') - if linkBase is not None: - referer = linkBase[0] - if referer is not None: - headers['Referer'] = referer - webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) - - error_msg = self._html_search_regex( - r"

We're sorry.

([\s\n]*

.*?

)+", webpage, - 'error message', default=None) - if error_msg is not None: publisher_id = query.get('publisherId') if publisher_id and publisher_id[0].isdigit(): publisher_id = publisher_id[0] @@ -339,6 +310,9 @@ class BrightcoveLegacyIE(InfoExtractor): else: player_id = query.get('playerID') if player_id and player_id[0].isdigit(): + headers = {} + if referer: + headers['Referer'] = referer player_page = self._download_webpage( 'http://link.brightcove.com/services/player/bcpid' + player_id[0], video_id, headers=headers, fatal=False) @@ -349,136 +323,16 @@ class BrightcoveLegacyIE(InfoExtractor): if player_key: enc_pub_id = player_key.split(',')[1].replace('~', '=') publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] - if publisher_id: - return self._brightcove_new_url_result(publisher_id, video_id) - raise ExtractorError( - 'brightcove said: %s' % error_msg, expected=True) - - self.report_extraction(video_id) - info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json') - info = json.loads(info)['data'] - video_info = info['programmedContent']['videoPlayer']['mediaDTO'] - video_info['_youtubedl_adServerURL'] = info.get('adServerURL') - - return self._extract_video_info(video_info) - - def _get_playlist_info(self, player_key): - info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key - playlist_info = self._download_webpage( - info_url, player_key, 'Downloading playlist information') - - json_data = json.loads(playlist_info) - if 'videoList' in json_data: - playlist_info = json_data['videoList'] - playlist_dto = playlist_info['mediaCollectionDTO'] - elif 'playlistTabs' in json_data: - playlist_info = json_data['playlistTabs'] - playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] - else: - raise ExtractorError('Empty playlist') - - videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] - - return 
self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - playlist_title=playlist_dto['displayName']) - - def _extract_video_info(self, video_info): - video_id = compat_str(video_info['id']) - publisher_id = video_info.get('publisherId') - info = { - 'id': video_id, - 'title': video_info['displayName'].strip(), - 'description': video_info.get('shortDescription'), - 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), - 'uploader': video_info.get('publisherName'), - 'uploader_id': compat_str(publisher_id) if publisher_id else None, - 'duration': float_or_none(video_info.get('length'), 1000), - 'timestamp': int_or_none(video_info.get('creationDate'), 1000), - } - - renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) - if renditions: - formats = [] - for rend in renditions: - url = rend['defaultURL'] - if not url: - continue - ext = None - if rend['remote']: - url_comp = compat_urllib_parse_urlparse(url) - if url_comp.path.endswith('.m3u8'): - formats.extend( - self._extract_m3u8_formats( - url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - continue - elif 'akamaihd.net' in url_comp.netloc: - # This type of renditions are served through - # akamaihd.net, but they don't use f4m manifests - url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' - ext = 'flv' - if ext is None: - ext = determine_ext(url) - tbr = int_or_none(rend.get('encodingRate'), 1000) - a_format = { - 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), - 'url': url, - 'ext': ext, - 'filesize': int_or_none(rend.get('size')) or None, - 'tbr': tbr, - } - if rend.get('audioOnly'): - a_format.update({ - 'vcodec': 'none', - }) - else: - a_format.update({ - 'height': int_or_none(rend.get('frameHeight')), - 'width': int_or_none(rend.get('frameWidth')), - 'vcodec': rend.get('videoCodec'), - }) - - # m3u8 manifests with remote == false are media playlists - # Not calling _extract_m3u8_formats here to 
save network traffic - if ext == 'm3u8': - a_format.update({ - 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }) - - formats.append(a_format) - self._sort_formats(formats) - info['formats'] = formats - elif video_info.get('FLVFullLengthURL') is not None: - info.update({ - 'url': video_info['FLVFullLengthURL'], - 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), - 'filesize': int_or_none(video_info.get('FLVFullSize')), - }) - - if self._downloader.params.get('include_ads', False): - adServerURL = video_info.get('_youtubedl_adServerURL') - if adServerURL: - ad_info = { - '_type': 'url', - 'url': adServerURL, - } - if 'url' in info: - return { - '_type': 'playlist', - 'title': info['title'], - 'entries': [ad_info, info], - } - else: - return ad_info - - if not info.get('url') and not info.get('formats'): - uploader_id = info.get('uploader_id') - if uploader_id: - info.update(self._brightcove_new_url_result(uploader_id, video_id)) - else: - raise ExtractorError('Unable to extract video url for %s' % video_id) - return info + if publisher_id: + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + if referer: + brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + # TODO: figure out if it's possible to extract playlistId from playerKey + # elif 'playerKey' in query: + # player_key = query['playerKey'] + # return self._get_playlist_info(player_key[0]) + raise UnsupportedError(url) class BrightcoveNewIE(AdobePassIE): From 6cf6b357f581af841a7d109aad47e40acf7d7795 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 20 Sep 2019 11:14:24 +0100 Subject: [PATCH 0037/1705] [mixcloud] allow uppercase letters in format urls(closes #19280) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index bcac13ec5..bf5353ef9 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -164,7 +164,7 @@ class MixcloudIE(InfoExtractor): def decrypt_url(f_url): for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): decrypted_url = self._decrypt_xor_cipher(k, f_url) - if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): + if re.search(r'^https?://[0-9A-Za-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): return decrypted_url for url_key in ('url', 'hlsUrl', 'dashUrl'): From edb2820ca5febb6bc88201e348bcdc904e7cbacb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 21 Sep 2019 21:57:45 +0100 Subject: [PATCH 0038/1705] [instagram] add support for tv URLs --- youtube_dl/extractor/instagram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ffd87b55f..b061850a1 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -92,6 +92,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/tv/aye83DjauH/', + 'only_matching': True, }] @staticmethod From 4bc15a68d1f7e2f38267081d3d764470da3439fa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 22 Sep 2019 17:14:18 +0100 Subject: [PATCH 0039/1705] [bilibili] add support audio albums and songs(closes #21094) --- youtube_dl/extractor/bilibili.py | 113 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 2 + 2 files 
changed, 115 insertions(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 3746671d3..80bd696e2 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -15,6 +15,7 @@ from ..utils import ( float_or_none, parse_iso8601, smuggle_url, + str_or_none, strip_jsonp, unified_timestamp, unsmuggle_url, @@ -306,3 +307,115 @@ class BiliBiliBangumiIE(InfoExtractor): return self.playlist_result( entries, bangumi_id, season_info.get('bangumi_title'), season_info.get('evaluate')) + + +class BilibiliAudioBaseIE(InfoExtractor): + def _call_api(self, path, sid, query=None): + if not query: + query = {'sid': sid} + return self._download_json( + 'https://www.bilibili.com/audio/music-service-c/web/' + path, + sid, query=query)['data'] + + +class BilibiliAudioIE(BilibiliAudioBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/au1003142', + 'md5': 'fec4987014ec94ef9e666d4d158ad03b', + 'info_dict': { + 'id': '1003142', + 'ext': 'm4a', + 'title': '【tsukimi】YELLOW / 神山羊', + 'artist': 'tsukimi', + 'comment_count': int, + 'description': 'YELLOW的mp3版!', + 'duration': 183, + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1564836614, + 'upload_date': '20190803', + 'uploader': 'tsukimi-つきみぐー', + 'view_count': int, + }, + } + + def _real_extract(self, url): + au_id = self._match_id(url) + + play_data = self._call_api('url', au_id) + formats = [{ + 'url': play_data['cdns'][0], + 'filesize': int_or_none(play_data.get('size')), + }] + + song = self._call_api('song/info', au_id) + title = song['title'] + statistic = song.get('statistic') or {} + + subtitles = None + lyric = song.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }] + } + + return { + 'id': au_id, + 'title': title, + 'formats': formats, + 'artist': song.get('author'), + 'comment_count': 
int_or_none(statistic.get('comment')), + 'description': song.get('intro'), + 'duration': int_or_none(song.get('duration')), + 'subtitles': subtitles, + 'thumbnail': song.get('cover'), + 'timestamp': int_or_none(song.get('passtime')), + 'uploader': song.get('uname'), + 'view_count': int_or_none(statistic.get('play')), + } + + +class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/am10624', + 'info_dict': { + 'id': '10624', + 'title': '每日新曲推荐(每日11:00更新)', + 'description': '每天11:00更新,为你推送最新音乐', + }, + 'playlist_count': 19, + } + + def _real_extract(self, url): + am_id = self._match_id(url) + + songs = self._call_api( + 'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data'] + + entries = [] + for song in songs: + sid = str_or_none(song.get('id')) + if not sid: + continue + entries.append(self.url_result( + 'https://www.bilibili.com/audio/au' + sid, + BilibiliAudioIE.ie_key(), sid)) + + if entries: + album_data = self._call_api('menu/info', am_id) or {} + album_title = album_data.get('title') + if album_title: + for entry in entries: + entry['album'] = album_title + return self.playlist_result( + entries, am_id, album_title, album_data.get('intro')) + + return self.playlist_result(entries, am_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4adcae1e5..44120cae2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -104,6 +104,8 @@ from .bild import BildIE from .bilibili import ( BiliBiliIE, BiliBiliBangumiIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( From 4e3f1f0469b4f3c64cde7105729f9c3acfd3f679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Sep 2019 00:20:52 +0700 Subject: [PATCH 0040/1705] [youtube:playlist] Unescape playlist uploader (closes #22483) --- 
youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f002d870e..f3f0bf8db 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2740,7 +2740,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): page, 'title', default=None) _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*
  • \s*]+\bhref=' - uploader = self._search_regex( + uploader = self._html_search_regex( r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, page, 'uploader', default=None) mobj = re.search( From 7d327fea5b7a73dfa6bf0082aed5d96a7ceae9a6 Mon Sep 17 00:00:00 2001 From: ipaha Date: Mon, 23 Sep 2019 22:44:00 +0300 Subject: [PATCH 0041/1705] [jwplatfom] do not match video URLs(#20596) (#22148) --- youtube_dl/extractor/jwplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 647b905f1..2aabd98b5 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', From 666d808e7096e782aa6e4ff456120cb91d868120 Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Tue, 24 Sep 2019 23:16:46 +0700 Subject: [PATCH 0042/1705] [youtube] Add support for invidious.drycat.fr (#22451) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f3f0bf8db..05eea0e4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -387,6 +387,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| 
(?:www\.)?yt\.elukerio\.org/| From 8e9fdcbe276d760ced8ac37120cd9e4687e5aa84 Mon Sep 17 00:00:00 2001 From: smed79 <1873139+smed79@users.noreply.github.com> Date: Tue, 24 Sep 2019 17:56:12 +0100 Subject: [PATCH 0043/1705] [openload] Add support for oload.online (#22304) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 679eaf6c3..46956e550 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -246,7 +246,7 @@ class OpenloadIE(InfoExtractor): _DOMAINS = r''' (?: openload\.(?:co|io|link|pw)| - oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| + oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|press|pw|life|live|space|services|website|vip)| oladblock\.(?:services|xyz|me)|openloed\.co ) ''' @@ -362,6 +362,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.services/embed/bs1NWj1dCag/', 'only_matching': True, +}, { + 'url': 'https://oload.online/f/W8o2UfN1vNY/', + 'only_matching': True, }, { 'url': 'https://oload.press/embed/drTBl1aOTvk/', 'only_matching': True, From d06daf23dae24b0811be704283b4b63689035af3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Sep 2019 02:08:46 +0700 Subject: [PATCH 0044/1705] [YoutubeDL] Honour all --get-* options with --flat-playlist (closes #22493) --- youtube_dl/YoutubeDL.py | 62 +++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6a44bc7ba..c3d1407f9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -852,8 +852,9 @@ class YoutubeDL(object): extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): - if 
self.params.get('forcejson', False): - self.to_stdout(json.dumps(ie_result)) + self.__forced_printings( + ie_result, self.prepare_filename(ie_result), + incomplete=True) return ie_result if result_type == 'video': @@ -1693,6 +1694,36 @@ class YoutubeDL(object): subs[lang] = f return subs + def __forced_printings(self, info_dict, filename, incomplete): + def print_mandatory(field): + if (self.params.get('force%s' % field, False) + and (not incomplete or info_dict.get(field) is not None)): + self.to_stdout(info_dict[field]) + + def print_optional(field): + if (self.params.get('force%s' % field, False) + and info_dict.get(field) is not None): + self.to_stdout(info_dict[field]) + + print_mandatory('title') + print_mandatory('id') + if self.params.get('forceurl', False) and not incomplete: + if info_dict.get('requested_formats') is not None: + for f in info_dict['requested_formats']: + self.to_stdout(f['url'] + f.get('play_path', '')) + else: + # For RTMP URLs, also include the playpath + self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) + print_optional('thumbnail') + print_optional('description') + if self.params.get('forcefilename', False) and filename is not None: + self.to_stdout(filename) + if self.params.get('forceduration', False) and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) + print_mandatory('format') + if self.params.get('forcejson', False): + self.to_stdout(json.dumps(info_dict)) + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1703,9 +1734,8 @@ class YoutubeDL(object): if self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] - if len(info_dict['title']) > 200: - info_dict['title'] = info_dict['title'][:197] + '...' 
if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] @@ -1720,29 +1750,7 @@ class YoutubeDL(object): info_dict['_filename'] = filename = self.prepare_filename(info_dict) # Forced printings - if self.params.get('forcetitle', False): - self.to_stdout(info_dict['fulltitle']) - if self.params.get('forceid', False): - self.to_stdout(info_dict['id']) - if self.params.get('forceurl', False): - if info_dict.get('requested_formats') is not None: - for f in info_dict['requested_formats']: - self.to_stdout(f['url'] + f.get('play_path', '')) - else: - # For RTMP URLs, also include the playpath - self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) - if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: - self.to_stdout(info_dict['thumbnail']) - if self.params.get('forcedescription', False) and info_dict.get('description') is not None: - self.to_stdout(info_dict['description']) - if self.params.get('forcefilename', False) and filename is not None: - self.to_stdout(filename) - if self.params.get('forceduration', False) and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - if self.params.get('forceformat', False): - self.to_stdout(info_dict['format']) - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) + self.__forced_printings(info_dict, filename, incomplete=False) # Do nothing else if in simulate mode if self.params.get('simulate', False): From df63cafe497d7530d887786d5a54ca11bf5e73db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Sep 2019 02:16:25 +0700 Subject: [PATCH 0045/1705] [byutv] Fix extraction (refs #22070) Downloading of new videos does not work due to DRM --- youtube_dl/extractor/byutv.py | 53 ++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 562c83af9..0b11bf11f 100644 --- 
a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -3,7 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + determine_ext, + merge_dicts, + parse_duration, + url_or_none, +) class BYUtvIE(InfoExtractor): @@ -51,7 +56,7 @@ class BYUtvIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - info = self._download_json( + video = self._download_json( 'https://api.byutv.org/api3/catalog/getvideosforcontent', display_id, query={ 'contentid': video_id, @@ -62,7 +67,7 @@ class BYUtvIE(InfoExtractor): 'x-byutv-platformkey': 'xsaaw9c7y5', }) - ep = info.get('ooyalaVOD') + ep = video.get('ooyalaVOD') if ep: return { '_type': 'url_transparent', @@ -75,18 +80,38 @@ class BYUtvIE(InfoExtractor): 'thumbnail': ep.get('imageThumbnail'), } - ep = info['dvr'] - title = ep['title'] - formats = self._extract_m3u8_formats( - ep['videoUrl'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + info = {} + formats = [] + for format_id, ep in video.items(): + if not isinstance(ep, dict): + continue + video_url = url_or_none(ep.get('videoUrl')) + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + merge_dicts(info, { + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + }) self._sort_formats(formats) - return { + + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': ep.get('description'), - 'thumbnail': 
ep.get('imageThumbnail'), - 'duration': parse_duration(ep.get('length')), + 'title': display_id, 'formats': formats, - } + }) From a373befa25bb521f94facb01c2cef45850c1e7c9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Sep 2019 20:23:56 +0100 Subject: [PATCH 0046/1705] [nhk] fix video extraction(closes #22249)(closes #22353) --- youtube_dl/extractor/nhk.py | 4 ++-- youtube_dl/extractor/piksel.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 241412f98..cce4bb472 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -60,8 +60,8 @@ class NhkVodIE(InfoExtractor): if is_video: info.update({ '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:' + episode['vod_id'], + 'ie_key': 'Piksel', + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], }) else: audio = episode['audio'] diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index 401298cb8..88b6859b0 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -15,7 +15,7 @@ from ..utils import ( class PikselIE(InfoExtractor): - _VALID_URL = r'https?://player\.piksel\.com/v/(?P[a-z0-9]+)' + _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P[a-z0-9_]+)' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -40,6 +40,11 @@ class PikselIE(InfoExtractor): 'timestamp': 1486171129, 'upload_date': '20170204' } + }, + { + # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ + 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', + 'only_matching': True, } ] @@ -52,8 +57,11 @@ class PikselIE(InfoExtractor): return mobj.group('url') def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = 
self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-de-program-uuid=[\'"]([a-z0-9]+)', + webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' From 21d3c21e6272d6ec089fc76461151f042f51aba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Sep 2019 02:39:25 +0700 Subject: [PATCH 0047/1705] [nhk] Add support for clips --- youtube_dl/extractor/nhk.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index cce4bb472..6a2c6cb7b 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -10,6 +10,18 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ + # clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', + 'md5': '256a1be14f48d960a7e61e2532d95ec3', + 'info_dict': { + 'id': 'a95j5iza', + 'ext': 'mp4', + 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', + 'timestamp': 1565965194, + 'upload_date': '20190816', + }, + }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, }, { @@ -19,7 +31,7 @@ class NhkVodIE(InfoExtractor): 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sodesdlist/v7/episode/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() @@ -28,7 +40,10 @@ class NhkVodIE(InfoExtractor): is_video = m_type == 'video' episode = self._download_json( - self._API_URL_TEMPLATE % ('v' if 
is_video else 'r', episode_id, lang, '/all' if is_video else ''), + self._API_URL_TEMPLATE % ( + 'v' if is_video else 'r', + 'clip' if episode_id[:4] == '9999' else 'esd', + episode_id, lang, '/all' if is_video else ''), episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] title = episode.get('sub_title_clean') or episode['sub_title'] From 33c1c7d80fd99024879a5f087b55b24374385e43 Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Wed, 25 Sep 2019 02:43:34 +0700 Subject: [PATCH 0048/1705] [youtube] Add support for owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya.b32.i2p (#22292) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 05eea0e4e..a3364a14e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -397,6 +397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| + (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls (?: # the various things that can precede the ID: From 2a88a0c44d4b13eda5874e9b790891acce11ccf7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Sep 2019 11:44:57 +0100 Subject: [PATCH 0049/1705] [ted] check for resources validity and extract subtitled downloads(closes #22513) --- youtube_dl/extractor/ted.py | 82 +++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index db5a4f44e..63e2455b2 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -182,20 +182,29 @@ class TEDIE(InfoExtractor): title = talk_info['title'].strip() - native_downloads = try_get( - talk_info, - (lambda x: x['downloads']['nativeDownloads'], - lambda x: x['nativeDownloads']), - dict) or {} + downloads = talk_info.get('downloads') or {} + native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {} formats = [{ 'url': format_url, 'format_id': format_id, - 'format': format_id, } for (format_id, format_url) in native_downloads.items() if format_url is not None] + + subtitled_downloads = downloads.get('subtitledDownloads') or {} + for lang, subtitled_download in subtitled_downloads.items(): + for q in self._NATIVE_FORMATS: + q_url = subtitled_download.get(q) + if not q_url: + continue + formats.append({ + 'url': q_url, + 'format_id': '%s-%s' % (q, lang), + 'language': lang, + }) + if formats: for f in formats: - finfo = self._NATIVE_FORMATS.get(f['format_id']) + finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0]) if finfo: f.update(finfo) @@ -215,34 +224,7 @@ class TEDIE(InfoExtractor): http_url = None for format_id, resources in resources_.items(): - if format_id == 'h264': - for resource in resources: - h264_url = resource.get('file') - if not h264_url: - continue - bitrate = int_or_none(resource.get('bitrate')) - formats.append({ - 'url': h264_url, - 'format_id': '%s-%sk' % 
(format_id, bitrate), - 'tbr': bitrate, - }) - if re.search(r'\d+k', h264_url): - http_url = h264_url - elif format_id == 'rtmp': - streamer = talk_info.get('streamer') - if not streamer: - continue - for resource in resources: - formats.append({ - 'format_id': '%s-%s' % (format_id, resource.get('name')), - 'url': streamer, - 'play_path': resource['file'], - 'ext': 'flv', - 'width': int_or_none(resource.get('width')), - 'height': int_or_none(resource.get('height')), - 'tbr': int_or_none(resource.get('bitrate')), - }) - elif format_id == 'hls': + if format_id == 'hls': if not isinstance(resources, dict): continue stream_url = url_or_none(resources.get('stream')) @@ -251,6 +233,36 @@ class TEDIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if not isinstance(resources, list): + continue + if format_id == 'h264': + for resource in resources: + h264_url = resource.get('file') + if not h264_url: + continue + bitrate = int_or_none(resource.get('bitrate')) + formats.append({ + 'url': h264_url, + 'format_id': '%s-%sk' % (format_id, bitrate), + 'tbr': bitrate, + }) + if re.search(r'\d+k', h264_url): + http_url = h264_url + elif format_id == 'rtmp': + streamer = talk_info.get('streamer') + if not streamer: + continue + for resource in resources: + formats.append({ + 'format_id': '%s-%s' % (format_id, resource.get('name')), + 'url': streamer, + 'play_path': resource['file'], + 'ext': 'flv', + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + 'tbr': int_or_none(resource.get('bitrate')), + }) m3u8_formats = list(filter( lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', From cb3e4a2947a2df64f1192384e2460a0ee52fc1e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Sep 2019 23:11:02 +0700 Subject: [PATCH 0050/1705] [heise] Fix kaltura embeds extraction (closes #22514) --- youtube_dl/extractor/heise.py | 
13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index d8a2f9d76..cbe564a3c 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -105,8 +105,7 @@ class HeiseIE(InfoExtractor): webpage, default=None) or self._html_search_meta( 'description', webpage) - kaltura_url = KalturaIE._extract_url(webpage) - if kaltura_url: + def _make_kaltura_result(kaltura_url): return { '_type': 'url_transparent', 'url': smuggle_url(kaltura_url, {'source_url': url}), @@ -115,6 +114,16 @@ class HeiseIE(InfoExtractor): 'description': description, } + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return _make_kaltura_result(kaltura_url) + + kaltura_id = self._search_regex( + r'entry-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura id', + default=None, group='id') + if kaltura_id: + return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) + yt_urls = YoutubeIE._extract_urls(webpage) if yt_urls: return self.playlist_from_matches( From 8130ac42e500643e03faaae512166fe71387473d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Sep 2019 23:15:06 +0700 Subject: [PATCH 0051/1705] [openload] PEP 8 --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 46956e550..1fe581780 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -362,7 +362,7 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.services/embed/bs1NWj1dCag/', 'only_matching': True, -}, { + }, { 'url': 'https://oload.online/f/W8o2UfN1vNY/', 'only_matching': True, }, { From 6483fbd336839c45a376d08d8f4a34e97d581e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Sep 2019 00:04:52 +0700 Subject: [PATCH 0052/1705] [vk] Fix extraction (closes #22522) --- youtube_dl/extractor/vk.py | 
13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f57ed2288..8b6dc0e24 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -403,8 +403,17 @@ class VKIE(VKBaseIE): data = self._parse_json( self._search_regex( r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, - 'player params'), - video_id)['params'][0] + 'player params', default='{}'), + video_id) + if data: + data = data['params'][0] + + # - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've verified that I'm running youtube-dl version **2019.09.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.12.1 + [debug] youtube-dl version 2019.09.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9babe0360..6116acc79 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've verified that I'm running youtube-dl version **2019.09.28** - [ ] I've 
checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 72322fe26..79d1a7f3c 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've verified that I'm running youtube-dl version **2019.09.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index da7f2cf93..9bda3d440 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've verified that I'm running youtube-dl version **2019.09.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.12.1 + [debug] youtube-dl version 2019.09.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 
[debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d41022b9f..581344917 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've verified that I'm running youtube-dl version **2019.09.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 20b0b9ae2..80681a9ae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.09.28 Core * [YoutubeDL] Honour all --get-* options with --flat-playlist (#22493) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 18bddc138..35275278b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,6 +98,8 @@ - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **BilibiliAudio** + - **BilibiliAudioAlbum** - **BioBioChileTV** - **BIQLE** - **BitChute** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index df82cdf0f..c3eafb068 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.12.1' +__version__ = '2019.09.28' From 72fd4d0c6a926b4755dfb02a39c76766b0589213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Sep 2019 21:57:08 +0700 Subject: [PATCH 0055/1705] [nonktube] Fix extraction (closes #22544) --- youtube_dl/extractor/nonktube.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py index 63e58aae2..ca1424e06 100644 --- a/youtube_dl/extractor/nonktube.py +++ b/youtube_dl/extractor/nonktube.py @@ -25,9 +25,14 @@ class NonkTubeIE(NuevoBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - info 
= self._extract_nuevo( - 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' - % video_id, video_id) + webpage = self._download_webpage(url, video_id) - info['age_limit'] = 18 + title = self._og_search_title(webpage) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + info.update({ + 'id': video_id, + 'title': title, + 'age_limit': 18, + }) return info From 326ae4ff96ca5663cca273126f95bd7eecf5a012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Sep 2019 23:03:39 +0700 Subject: [PATCH 0056/1705] [viewlift] Improve extraction (closes #22545) --- youtube_dl/extractor/viewlift.py | 46 +++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index c43d1a1e8..391419d9e 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -13,11 +13,12 @@ from ..utils import ( js_to_json, parse_age_limit, parse_duration, + try_get, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' + _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): @@ -113,7 +114,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): class ViewLiftIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P%s)/(?:films/title|show|(?:news/)?videos?)/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:www\.)?(?P%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -128,7 +129,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'categories': 'mincount:3', 'age_limit': 14, 'upload_date': '20150421', - 'timestamp': 
1429656819, + 'timestamp': 1429656820, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -141,10 +142,26 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': 'mincount:2', 'timestamp': 1399478279, 'upload_date': '20140507', } + }, { + 'url': 'http://main.snagfilms.com/augie_alone/s_2_ep_12_love', + 'info_dict': { + 'id': '00000148-7b53-de26-a9fb-fbf306f70020', + 'display_id': 'augie_alone/s_2_ep_12_love', + 'ext': 'mp4', + 'title': 'Augie, Alone:S. 2 Ep. 12 - Love', + 'description': 'md5:db2a5c72d994f16a780c1eb353a8f403', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 107, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://main.snagfilms.com/films/title/the_freebie', + 'only_matching': True, }, { # Film is not playable in your area. 'url': 'http://www.snagfilms.com/films/title/inside_mecca', @@ -181,7 +198,21 @@ class ViewLiftIE(ViewLiftBaseIE): gist = content_data['gist'] film_id = gist['id'] title = gist['title'] - video_assets = content_data['streamingInfo']['videoAssets'] + video_assets = try_get( + content_data, lambda x: x['streamingInfo']['videoAssets'], dict) + if not video_assets: + token = self._download_json( + 'https://prod-api.viewlift.com/identity/anonymous-token', + film_id, 'Downloading authorization token', + query={'site': 'snagfilms'})['authorizationToken'] + video_assets = self._download_json( + 'https://prod-api.viewlift.com/entitlement/video/status', + film_id, headers={ + 'Authorization': token, + 'Referer': url, + }, query={ + 'id': film_id + })['video']['streamingInfo']['videoAssets'] formats = [] mpeg_video_assets = video_assets.get('mpeg') or [] @@ -241,8 +272,9 @@ class ViewLiftIE(ViewLiftBaseIE): if category.get('title')] break else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') + title = self._html_search_regex( + 
(r'itemprop="title">([^<]+)<', + r'(?s)itemprop="title">(.+?)(.+?)', webpage, 'description', default=None) or self._og_search_description(webpage) From 2906631e1230617883cdef8e227b369a9c98c9fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Oct 2019 23:16:46 +0700 Subject: [PATCH 0057/1705] [viewlift] Fix URL matching --- youtube_dl/extractor/viewlift.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 391419d9e..851ad936c 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -179,6 +179,10 @@ class ViewLiftIE(ViewLiftBaseIE): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) + def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() From 74bc299453884bc4e802ca225815d3134b9510cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Oct 2019 02:03:22 +0700 Subject: [PATCH 0058/1705] [teachable] Skip login when already logged in (closes #22572) --- youtube_dl/extractor/teachable.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index c1a9deafe..7d2e34b3b 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -48,6 +48,16 @@ class TeachableBaseIE(InfoExtractor): 'https://%s/sign_in' % site, None, 'Downloading %s login page' % site) + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']user-signout', + r']+\bhref=["\']/sign_out', + r'Log\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + login_url = compat_str(urlh.geturl()) login_form = self._hidden_inputs(login_page) @@ -78,10 +88,7 @@ class TeachableBaseIE(InfoExtractor): 'Go to https://%s/ and accept.' 
% (site, site), expected=True) # Successful login - if any(re.search(p, response) for p in ( - r'class=["\']user-signout', - r']+\bhref=["\']/sign_out', - r'>\s*Log out\s*<')): + if is_logged(response): self._logged_in = True return From 25e911a968f6675a2c06f0d60a09a86972aadc40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 3 Oct 2019 00:53:07 +0700 Subject: [PATCH 0059/1705] [extractor/common] Make _is_valid_url more relaxed --- youtube_dl/extractor/common.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 859786617..50d48c40d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1424,12 +1424,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError as e: - if isinstance(e.cause, compat_urllib_error.URLError): - self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) - return False - raise + except ExtractorError: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) + return False def http_scheme(self): """ Either "http:" or "https:", depending on the user's preferences """ From aaf9d904aa77bfe60714393c0ab413c32cca8a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 3 Oct 2019 00:55:46 +0700 Subject: [PATCH 0060/1705] [orf:tvthek] Make manifest requests non fatal (refs #22578) --- youtube_dl/extractor/orf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 499be0029..3425f7602 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -86,12 +86,13 @@ class ORFTVthekIE(InfoExtractor): if value: format_id_list.append(value) format_id = '-'.join(format_id_list) - if determine_ext(fd['src']) == 'm3u8': + ext = determine_ext(src) + if ext == 
'm3u8': formats.extend(self._extract_m3u8_formats( - fd['src'], video_id, 'mp4', m3u8_id=format_id)) - elif determine_ext(fd['src']) == 'f4m': + src, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': formats.extend(self._extract_f4m_formats( - fd['src'], video_id, f4m_id=format_id)) + src, video_id, f4m_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, From 894b3826f5a2e1742010c20554a6a1b9e98a51ee Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Fri, 4 Oct 2019 18:52:15 +0700 Subject: [PATCH 0061/1705] [youtube] Add support for yt.lelux.fi (#22597) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3364a14e..6bd56f340 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -391,6 +391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?yt\.lelux\.fi/| (?:www\.)?kgg2m7yk5aybusll\.onion/| (?:www\.)?qklhadlycap4cnod\.onion/| (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| From ca20b1304818a8d2d8eadfbe6f5387284e7ebc4d Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Fri, 4 Oct 2019 13:57:18 +0200 Subject: [PATCH 0062/1705] [nrktv:seriebase] Fix extraction (#22596) --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5f43e692f..60933f069 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -406,7 +406,7 @@ class NRKTVSerieBaseIE(InfoExtractor): def _extract_series(self, webpage, display_id, fatal=True): config = self._parse_json( self._search_regex( - (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), webpage, 'config', 
default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False) From 9679a62a283f1384c7572ec78f7996e1276d5d7a Mon Sep 17 00:00:00 2001 From: kr4ssi <44404263+kr4ssi@users.noreply.github.com> Date: Fri, 4 Oct 2019 13:57:51 +0200 Subject: [PATCH 0063/1705] [openload] Add support for oload.monster (#22592) --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 1fe581780..66e38cdb4 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -246,7 +246,7 @@ class OpenloadIE(InfoExtractor): _DOMAINS = r''' (?: openload\.(?:co|io|link|pw)| - oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|press|pw|life|live|space|services|website|vip)| + oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|monster|press|pw|life|live|space|services|website|vip)| oladblock\.(?:services|xyz|me)|openloed\.co ) ''' @@ -365,6 +365,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.online/f/W8o2UfN1vNY/', 'only_matching': True, + }, { + 'url': 'https://oload.monster/f/W8o2UfN1vNY/', + 'only_matching': True, }, { 'url': 'https://oload.press/embed/drTBl1aOTvk/', 'only_matching': True, From 76e510b92c4a1c4b0001f892504ba2cbb4b8d486 Mon Sep 17 00:00:00 2001 From: sofutru <54445344+sofutru@users.noreply.github.com> Date: Fri, 4 Oct 2019 19:01:03 +0700 Subject: [PATCH 0064/1705] [youtube] Remove support for invidious.enkirton.net (#22543) --- youtube_dl/extractor/youtube.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6bd56f340..5e397324b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -383,7 +383,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:(?:www|no)\.)?invidiou\.sh/| (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| 
- (?:www\.)?invidious\.enkirton\.net/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| (?:www\.)?invidious\.nixnet\.xyz/| From 4e72d02f39f0d8e9ae9bbe8233c157bef3b58bdf Mon Sep 17 00:00:00 2001 From: Stephan Date: Fri, 4 Oct 2019 14:05:35 +0200 Subject: [PATCH 0065/1705] [xvideos] Extend _VALID_URL (#22471) --- youtube_dl/extractor/xvideos.py | 39 ++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 166bcf443..8fc64914c 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -17,7 +17,8 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?xvideos\.com/video| + (?:[^/]+\.)?xvideos2?\.com/video| + (?:www\.)?xvideos\.es/video| flashservice\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) @@ -39,6 +40,42 @@ class XVideosIE(InfoExtractor): }, { 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', 'only_matching': True, + }, { + 'url': 'http://xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://www.xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://www.xvideos.es/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://fr.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://fr.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://it.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 
'https://it.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'http://de.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True + }, { + 'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl', + 'only_matching': True }] def _real_extract(self, url): From c2915de82e0ee793330d553899347ec54a4b834e Mon Sep 17 00:00:00 2001 From: Patrice Levesque Date: Fri, 4 Oct 2019 08:14:31 -0400 Subject: [PATCH 0066/1705] [telequebec] Add support for coucou.telequebec.tv (#22482) --- youtube_dl/extractor/telequebec.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index 6965c127b..911385d01 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -22,7 +22,13 @@ class TeleQuebecBaseIE(InfoExtractor): class TeleQuebecIE(TeleQuebecBaseIE): - _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + zonevideo\.telequebec\.tv/media| + coucou\.telequebec\.tv/videos + )/(?P\d+) + ''' _TESTS = [{ # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', @@ -41,6 +47,9 @@ class TeleQuebecIE(TeleQuebecBaseIE): # no description 'url': 'http://zonevideo.telequebec.tv/media/30261', 'only_matching': True, + }, { + 'url': 'https://coucou.telequebec.tv/videos/41788/idee-de-genie/l-heure-du-bain', + 'only_matching': True, }] def _real_extract(self, url): From b64045cd2a564bb44ef917803678ca362f412eb4 Mon Sep 17 00:00:00 2001 From: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com> Date: Fri, 4 Oct 2019 13:17:16 +0100 Subject: [PATCH 0067/1705] [peertube] Update instances (#22414) --- youtube_dl/extractor/peertube.py | 397 +++++++++++++++++++++++++++---- 1 file changed, 347 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/peertube.py 
b/youtube_dl/extractor/peertube.py index b50543e32..d3a83ea2b 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -18,81 +18,385 @@ from ..utils import ( class PeerTubeIE(InfoExtractor): _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances + peertube\.rainbowswingers\.net| + tube\.stanisic\.nl| + peer\.suiri\.us| + medias\.libox\.fr| + videomensoif\.ynh\.fr| + peertube\.travelpandas\.eu| + peertube\.rachetjay\.fr| + peertube\.montecsys\.fr| + tube\.eskuero\.me| + peer\.tube| + peertube\.umeahackerspace\.se| + tube\.nx-pod\.de| + video\.monsieurbidouille\.fr| tube\.openalgeria\.org| - peertube\.pointsecu\.fr| + vid\.lelux\.fi| + video\.anormallostpod\.ovh| + tube\.crapaud-fou\.org| + peertube\.stemy\.me| + lostpod\.space| + exode\.me| + peertube\.snargol\.com| + vis\.ion\.ovh| + videosdulib\.re| + v\.mbius\.io| + videos\.judrey\.eu| + peertube\.osureplayviewer\.xyz| + peertube\.mathieufamily\.ovh| + www\.videos-libr\.es| + fightforinfo\.com| + peertube\.fediverse\.ru| + peertube\.oiseauroch\.fr| + video\.nesven\.eu| + v\.bearvideo\.win| + video\.qoto\.org| + justporn\.cc| + video\.vny\.fr| + peervideo\.club| + tube\.taker\.fr| + peertube\.chantierlibre\.org| + tube\.ipfixe\.info| + tube\.kicou\.info| + tube\.dodsorf\.as| + videobit\.cc| + video\.yukari\.moe| + videos\.elbinario\.net| + hkvideo\.live| + pt\.tux\.tf| + www\.hkvideo\.live| + FIGHTFORINFO\.com| + pt\.765racing\.com| + peertube\.gnumeria\.eu\.org| + nordenmedia\.com| + peertube\.co\.uk| + tube\.darfweb\.eu| + tube\.kalah-france\.org| + 0ch\.in| + vod\.mochi\.academy| + film\.node9\.org| + peertube\.hatthieves\.es| + video\.fitchfamily\.org| + peertube\.ddns\.net| + video\.ifuncle\.kr| + video\.fdlibre\.eu| + tube\.22decembre\.eu| + peertube\.harmoniescreatives\.com| + tube\.fabrigli\.fr| + video\.thedwyers\.co| + video\.bruitbruit\.com| + peertube\.foxfam\.club| + peer\.philoxweb\.be| + videos\.bugs\.social| + peertube\.malbert\.xyz| + 
peertube\.bilange\.ca| + libretube\.net| + diytelevision\.com| + peertube\.fedilab\.app| + libre\.video| + video\.mstddntfdn\.online| + us\.tv| + peertube\.sl-network\.fr| + peertube\.dynlinux\.io| + peertube\.david\.durieux\.family| + peertube\.linuxrocks\.online| + peerwatch\.xyz| + v\.kretschmann\.social| + tube\.otter\.sh| + yt\.is\.nota\.live| + tube\.dragonpsi\.xyz| + peertube\.boneheadmedia\.com| + videos\.funkwhale\.audio| + watch\.44con\.com| + peertube\.gcaillaut\.fr| + peertube\.icu| + pony\.tube| + spacepub\.space| + tube\.stbr\.io| + v\.mom-gay\.faith| + tube\.port0\.xyz| + peertube\.simounet\.net| + play\.jergefelt\.se| + peertube\.zeteo\.me| + tube\.danq\.me| + peertube\.kerenon\.com| + tube\.fab-l3\.org| + tube\.calculate\.social| + peertube\.mckillop\.org| + tube\.netzspielplatz\.de| + vod\.ksite\.de| + peertube\.laas\.fr| + tube\.govital\.net| + peertube\.stephenson\.cc| + bistule\.nohost\.me| + peertube\.kajalinifi\.de| + video\.ploud\.jp| + video\.omniatv\.com| + peertube\.ffs2play\.fr| + peertube\.leboulaire\.ovh| + peertube\.tronic-studio\.com| + peertube\.public\.cat| + peertube\.metalbanana\.net| + video\.1000i100\.fr| + peertube\.alter-nativ-voll\.de| + tube\.pasa\.tf| + tube\.worldofhauru\.xyz| + pt\.kamp\.site| + peertube\.teleassist\.fr| + videos\.mleduc\.xyz| + conf\.tube| + media\.privacyinternational\.org| + pt\.forty-two\.nl| + video\.halle-leaks\.de| + video\.grosskopfgames\.de| + peertube\.schaeferit\.de| + peertube\.jackbot\.fr| + tube\.extinctionrebellion\.fr| + peertube\.f-si\.org| + video\.subak\.ovh| + videos\.koweb\.fr| + peertube\.zergy\.net| + peertube\.roflcopter\.fr| + peertube\.floss-marketing-school\.com| + vloggers\.social| + peertube\.iriseden\.eu| + videos\.ubuntu-paris\.org| + peertube\.mastodon\.host| + armstube\.com| + peertube\.s2s\.video| + peertube\.lol| + tube\.open-plug\.eu| + open\.tube| + peertube\.ch| + peertube\.normandie-libre\.fr| + peertube\.slat\.org| + video\.lacaveatonton\.ovh| + peertube\.uno| + 
peertube\.servebeer\.com| + peertube\.fedi\.quebec| + tube\.h3z\.jp| + tube\.plus200\.com| + peertube\.eric\.ovh| + tube\.metadocs\.cc| + tube\.unmondemeilleur\.eu| + gouttedeau\.space| + video\.antirep\.net| + nrop\.cant\.at| + tube\.ksl-bmx\.de| + tube\.plaf\.fr| + tube\.tchncs\.de| + video\.devinberg\.com| + hitchtube\.fr| + peertube\.kosebamse\.com| + yunopeertube\.myddns\.me| + peertube\.varney\.fr| + peertube\.anon-kenkai\.com| + tube\.maiti\.info| + tubee\.fr| + videos\.dinofly\.com| + toobnix\.org| + videotape\.me| + voca\.tube| + video\.heromuster\.com| + video\.lemediatv\.fr| + video\.up\.edu\.ph| + balafon\.video| + video\.ivel\.fr| + thickrips\.cloud| + pt\.laurentkruger\.fr| + video\.monarch-pass\.net| + peertube\.artica\.center| + video\.alternanet\.fr| + indymotion\.fr| + fanvid\.stopthatimp\.net| + video\.farci\.org| + v\.lesterpig\.com| + video\.okaris\.de| + tube\.pawelko\.net| + peertube\.mablr\.org| + tube\.fede\.re| + pytu\.be| + evertron\.tv| + devtube\.dev-wiki\.de| + raptube\.antipub\.org| + video\.selea\.se| + peertube\.mygaia\.org| + video\.oh14\.de| + peertube\.livingutopia\.org| + peertube\.the-penguin\.de| + tube\.thechangebook\.org| + tube\.anjara\.eu| + pt\.pube\.tk| + video\.samedi\.pm| + mplayer\.demouliere\.eu| + widemus\.de| + peertube\.me| + peertube\.zapashcanon\.fr| + video\.latavernedejohnjohn\.fr| + peertube\.pcservice46\.fr| + peertube\.mazzonetto\.eu| + video\.irem\.univ-paris-diderot\.fr| + video\.livecchi\.cloud| + alttube\.fr| + video\.coop\.tools| + video\.cabane-libre\.org| + peertube\.openstreetmap\.fr| + videos\.alolise\.org| + irrsinn\.video| + video\.antopie\.org| + scitech\.video| + tube2\.nemsia\.org| + video\.amic37\.fr| + peertube\.freeforge\.eu| + video\.arbitrarion\.com| + video\.datsemultimedia\.com| + stoptrackingus\.tv| + peertube\.ricostrongxxx\.com| + docker\.videos\.lecygnenoir\.info| + peertube\.togart\.de| + tube\.postblue\.info| + videos\.domainepublic\.net| + peertube\.cyber-tribal\.com| + 
video\.gresille\.org| + peertube\.dsmouse\.net| + cinema\.yunohost\.support| + tube\.theocevaer\.fr| + repro\.video| + tube\.4aem\.com| + quaziinc\.com| + peertube\.metawurst\.space| + videos\.wakapo\.com| + video\.ploud\.fr| + video\.freeradical\.zone| + tube\.valinor\.fr| + refuznik\.video| + pt\.kircheneuenburg\.de| + peertube\.asrun\.eu| + peertube\.lagob\.fr| + videos\.side-ways\.net| + 91video\.online| + video\.valme\.io| + video\.taboulisme\.com| + videos-libr\.es| + tv\.mooh\.fr| + nuage\.acostey\.fr| + video\.monsieur-a\.fr| + peertube\.librelois\.fr| + videos\.pair2jeux\.tube| + videos\.pueseso\.club| + peer\.mathdacloud\.ovh| + media\.assassinate-you\.net| + vidcommons\.org| + ptube\.rousset\.nom\.fr| + tube\.cyano\.at| + videos\.squat\.net| + video\.iphodase\.fr| + peertube\.makotoworkshop\.org| + peertube\.serveur\.slv-valbonne\.fr| + vault\.mle\.party| + hostyour\.tv| + videos\.hack2g2\.fr| + libre\.tube| + pire\.artisanlogiciel\.net| + videos\.numerique-en-commun\.fr| + video\.netsyms\.com| + video\.die-partei\.social| + video\.writeas\.org| + peertube\.swarm\.solvingmaz\.es| + tube\.pericoloso\.ovh| + watching\.cypherpunk\.observer| + videos\.adhocmusic\.com| + tube\.rfc1149\.net| + peertube\.librelabucm\.org| + videos\.numericoop\.fr| + peertube\.koehn\.com| + peertube\.anarchmusicall\.net| + tube\.kampftoast\.de| + vid\.y-y\.li| + peertube\.xtenz\.xyz| + diode\.zone| + tube\.egf\.mn| + peertube\.nomagic\.uk| + visionon\.tv| + videos\.koumoul\.com| + video\.rastapuls\.com| + video\.mantlepro\.com| + video\.deadsuperhero\.com| + peertube\.musicstudio\.pro| + peertube\.we-keys\.fr| + artitube\.artifaille\.fr| + peertube\.ethernia\.net| + tube\.midov\.pl| + peertube\.fr| + watch\.snoot\.tube| + peertube\.donnadieu\.fr| + argos\.aquilenet\.fr| + tube\.nemsia\.org| + tube\.bruniau\.net| + videos\.darckoune\.moe| + tube\.traydent\.info| + dev\.videos\.lecygnenoir\.info| + peertube\.nayya\.org| + peertube\.live| + peertube\.mofgao\.space| + 
video\.lequerrec\.eu| + peertube\.amicale\.net| + aperi\.tube| + tube\.ac-lyon\.fr| + video\.lw1\.at| + www\.yiny\.org| + videos\.pofilo\.fr| + tube\.lou\.lt| + choob\.h\.etbus\.ch| + tube\.hoga\.fr| + peertube\.heberge\.fr| + video\.obermui\.de| + videos\.cloudfrancois\.fr| + betamax\.video| + video\.typica\.us| + tube\.piweb\.be| + video\.blender\.org| + peertube\.cat| + tube\.kdy\.ch| + pe\.ertu\.be| + peertube\.social| + videos\.lescommuns\.org| + tv\.datamol\.org| + videonaute\.fr| + dialup\.express| peertube\.nogafa\.org| - peertube\.pl| megatube\.lilomoino\.fr| peertube\.tamanoir\.foucry\.net| - peertube\.inapurna\.org| - peertube\.netzspielplatz\.de| - video\.deadsuperhero\.com| peertube\.devosi\.org| peertube\.1312\.media| - tube\.worldofhauru\.xyz| tube\.bootlicker\.party| skeptikon\.fr| - peertube\.geekshell\.fr| - tube\.opportunis\.me| - peertube\.peshane\.net| video\.blueline\.mg| tube\.homecomputing\.fr| - videos\.cloudfrancois\.fr| - peertube\.viviers-fibre\.net| tube\.ouahpiti\.info| video\.tedomum\.net| video\.g3l\.org| fontube\.fr| peertube\.gaialabs\.ch| - peertube\.extremely\.online| - peertube\.public-infrastructure\.eu| tube\.kher\.nl| peertube\.qtg\.fr| - tube\.22decembre\.eu| - facegirl\.me| video\.migennes\.net| - janny\.moe| tube\.p2p\.legal| - video\.atlanti\.se| troll\.tv| - peertube\.geekael\.fr| - vid\.leotindall\.com| - video\.anormallostpod\.ovh| - p-tube\.h3z\.jp| - tube\.darfweb\.eu| videos\.iut-orsay\.fr| peertube\.solidev\.net| - videos\.symphonie-of-code\.fr| - testtube\.ortg\.de| videos\.cemea\.org| - peertube\.gwendalavir\.eu| video\.passageenseine\.fr| videos\.festivalparminous\.org| peertube\.touhoppai\.moe| - peertube\.duckdns\.org| sikke\.fi| - peertube\.mastodon\.host| - firedragonvideos\.com| - vidz\.dou\.bet| - peertube\.koehn\.com| peer\.hostux\.social| share\.tube| peertube\.walkingmountains\.fr| - medias\.libox\.fr| - peertube\.moe| - peertube\.xyz| - jp\.peertube\.network| videos\.benpro\.fr| - tube\.otter\.sh| - 
peertube\.angristan\.xyz| peertube\.parleur\.net| - peer\.ecutsa\.fr| peertube\.heraut\.eu| - peertube\.tifox\.fr| - peertube\.maly\.io| - vod\.mochi\.academy| - exode\.me| - coste\.video| tube\.aquilenet\.fr| peertube\.gegeweb\.eu| framatube\.org| @@ -100,18 +404,11 @@ class PeerTubeIE(InfoExtractor): tube\.conferences-gesticulees\.net| peertube\.datagueule\.tv| video\.lqdn\.fr| - meilleurtube\.delire\.party| tube\.mochi\.academy| - peertube\.dav\.li| media\.zat\.im| - pytu\.be| - peertube\.valvin\.fr| - peertube\.nsa\.ovh| video\.colibris-outilslibres\.org| - video\.hispagatos\.org| tube\.svnet\.fr| peertube\.video| - videos\.lecygnenoir\.info| peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| @@ -126,7 +423,7 @@ class PeerTubeIE(InfoExtractor): (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', 'info_dict': { 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', From fd4db1ebc231b65bea91add4cd55ce564b05eee3 Mon Sep 17 00:00:00 2001 From: axelerometer <54915681+axelerometer@users.noreply.github.com> Date: Fri, 4 Oct 2019 15:22:01 +0300 Subject: [PATCH 0068/1705] [chaturbate] Extend _VALID_URL (#22309) --- youtube_dl/extractor/chaturbate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index e2b828d8a..656e715ae 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class ChaturbateIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://www.chaturbate.com/siswet19/', 'info_dict': { @@ -21,6 +21,9 @@ class 
ChaturbateIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Room is offline', + }, { + 'url': 'https://chaturbate.com/fullvideo/?b=caylin', + 'only_matching': True, }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, @@ -32,7 +35,8 @@ class ChaturbateIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - url, video_id, headers=self.geo_verification_headers()) + 'https://chaturbate.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) m3u8_urls = [] From 0b87beefe60fb6ae52529603fd5826364146dfb7 Mon Sep 17 00:00:00 2001 From: Anh Nhan Nguyen Date: Fri, 4 Oct 2019 14:27:58 +0200 Subject: [PATCH 0069/1705] [gfycat] Extend _VALID_URL (#22225) --- youtube_dl/extractor/gfycat.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index bbe3cb283..18a30fe67 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -11,7 +11,7 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P[^-/?#]+)' + _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P[^-/?#\.]+)' _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -53,6 +53,12 @@ class GfycatIE(InfoExtractor): }, { 'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball', 'only_matching': True + }, { + 'url': 'https://thumbs.gfycat.com/acceptablehappygoluckyharborporpoise-size_restricted.gif', + 'only_matching': True + }, { + 'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4', + 'only_matching': True }] def _real_extract(self, url): From 3a37f2c3be16bb75a12d0617b5bc80ee6cab0f61 Mon Sep 17 00:00:00 2001 From: bitraid Date: Fri, 4 Oct 2019 15:48:20 +0300 Subject: [PATCH 0070/1705] [wimp] Remove extractor (closes #22088) (#22091) --- 
youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/wimp.py | 54 ------------------------------ 2 files changed, 55 deletions(-) delete mode 100644 youtube_dl/extractor/wimp.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 44120cae2..a2d6e5314 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1413,7 +1413,6 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE -from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .wsj import ( diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py deleted file mode 100644 index ea234e3c5..000000000 --- a/youtube_dl/extractor/wimp.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .youtube import YoutubeIE - - -class WimpIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P[^/]+)' - _TESTS = [{ - 'url': 'http://www.wimp.com/maru-is-exhausted/', - 'md5': 'ee21217ffd66d058e8b16be340b74883', - 'info_dict': { - 'id': 'maru-is-exhausted', - 'ext': 'mp4', - 'title': 'Maru is exhausted.', - 'description': 'md5:57e099e857c0a4ea312542b684a869b8', - } - }, { - 'url': 'http://www.wimp.com/clowncar/', - 'md5': '5c31ad862a90dc5b1f023956faec13fe', - 'info_dict': { - 'id': 'cG4CEr2aiSg', - 'ext': 'webm', - 'title': 'Basset hound clown car...incredible!', - 'description': '5 of my Bassets crawled in this dog loo! 
www.bellinghambassets.com\n\nFor licensing/usage please contact: licensing(at)jukinmediadotcom', - 'upload_date': '20140303', - 'uploader': 'Gretchen Hoey', - 'uploader_id': 'gretchenandjeff1', - }, - 'add_ie': ['Youtube'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - youtube_id = self._search_regex( - (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", - r'data-id=["\']([0-9A-Za-z_-]{11})'), - webpage, 'video URL', default=None) - if youtube_id: - return self.url_result(youtube_id, YoutubeIE.ie_key()) - - info_dict = self._extract_jwplayer_data( - webpage, video_id, require_title=False) - - info_dict.update({ - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - }) - - return info_dict From 05446d483d089d0bc7fa3037900dadc856d3e687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Oct 2019 20:14:45 +0700 Subject: [PATCH 0071/1705] [telequebec:squat] Add support for squat.telequebec.tv (closes #18503) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/telequebec.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a2d6e5314..8d3e433c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1129,6 +1129,7 @@ from .telegraaf import TelegraafIE from .telemb import TeleMBIE from .telequebec import ( TeleQuebecIE, + TeleQuebecSquatIE, TeleQuebecEmissionIE, TeleQuebecLiveIE, ) diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index 911385d01..ae9f66787 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, smuggle_url, try_get, + unified_timestamp, ) @@ -70,6 +71,52 @@ class TeleQuebecIE(TeleQuebecBaseIE): return info +class 
TeleQuebecSquatIE(InfoExtractor): + _VALID_URL = r'https://squat\.telequebec\.tv/videos/(?P\d+)' + _TESTS = [{ + 'url': 'https://squat.telequebec.tv/videos/9314', + 'info_dict': { + 'id': 'd59ae78112d542e793d83cc9d3a5b530', + 'ext': 'mp4', + 'title': 'Poupeflekta', + 'description': 'md5:2f0718f8d2f8fece1646ee25fb7bce75', + 'duration': 1351, + 'timestamp': 1569057600, + 'upload_date': '20190921', + 'series': 'Miraculous : Les Aventures de Ladybug et Chat Noir', + 'season': 'Saison 3', + 'season_number': 3, + 'episode_number': 57, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://squat.api.telequebec.tv/v1/videos/%s' % video_id, + video_id) + + media_id = video['sourceId'] + + return { + '_type': 'url_transparent', + 'url': 'http://zonevideo.telequebec.tv/media/%s' % media_id, + 'ie_key': TeleQuebecIE.ie_key(), + 'id': media_id, + 'title': video.get('titre'), + 'description': video.get('description'), + 'timestamp': unified_timestamp(video.get('datePublication')), + 'series': video.get('container'), + 'season': video.get('saison'), + 'season_number': int_or_none(video.get('noSaison')), + 'episode_number': int_or_none(video.get('episode')), + } + + class TeleQuebecEmissionIE(TeleQuebecBaseIE): _VALID_URL = r'''(?x) https?:// From 4bf568d36cf516b38e4634e07bd8b4c3d33324f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Oct 2019 21:43:31 +0700 Subject: [PATCH 0072/1705] [pornhub:uservideos:upload] Fix extraction (closes #22619) --- youtube_dl/extractor/pornhub.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 11b8cfcf7..ba0ad7da2 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -403,6 +403,15 @@ class PornHubUserIE(PornHubPlaylistBaseIE): class 
PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): + @staticmethod + def _has_more(webpage): + return re.search( + r'''(?x) + ]+\bclass=["\']page_next| + ]+\brel=["\']next| + ]+\bid=["\']moreDataBtn + ''', webpage) is not None + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') @@ -411,13 +420,11 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page = int_or_none(self._search_regex( r'\bpage=(\d+)', url, 'page', default=None)) - page_url = self._make_page_url(url) - entries = [] for page_num in (page, ) if page is not None else itertools.count(1): try: webpage = self._download_webpage( - page_url, item_id, 'Downloading page %d' % page_num, + url, item_id, 'Downloading page %d' % page_num, query={'page': page_num}) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: @@ -547,18 +554,6 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) else super(PornHubPagedVideoListIE, cls).suitable(url)) - def _make_page_url(self, url): - return url - - @staticmethod - def _has_more(webpage): - return re.search( - r'''(?x) - ]+\bclass=["\']page_next| - ]+\brel=["\']next| - ]+\bid=["\']moreDataBtn - ''', webpage) is not None - class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' @@ -572,11 +567,3 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, }] - - def _make_page_url(self, url): - mobj = re.match(self._VALID_URL, url) - return '%s/ajax' % mobj.group('url') - - @staticmethod - def _has_more(webpage): - return True From 560d3b7d7c86a0bfff36d59cb977fd3c01b10ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Oct 2019 
22:04:49 +0700 Subject: [PATCH 0073/1705] [redtube] Improve metadata extraction (closes #22492, closes #22615) --- youtube_dl/extractor/redtube.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 10311a81a..5c84028ef 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + merge_dicts, str_to_int, unified_strdate, url_or_none, @@ -45,11 +46,14 @@ class RedTubeIE(InfoExtractor): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - title = self._html_search_regex( - (r']+class="(?:video_title_text|videoTitle)[^"]*">(?P(?:(?!\1).)+)</h\1>', - r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) formats = [] sources = self._parse_json( @@ -88,28 +92,28 @@ class RedTubeIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( - r'<span[^>]+>ADDED ([^<]+)<', - webpage, 'upload date', fatal=False)) + r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', + webpage, 'upload date', default=None)) duration = int_or_none(self._og_search_property( 'video:duration', webpage, default=None) or self._search_regex( r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = 
str_to_int(self._search_regex( (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', - r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'), - webpage, 'view count', fatal=False)) + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', + r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), + webpage, 'view count', default=None)) # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 - return { + return merge_dicts(info, { 'id': video_id, 'ext': 'mp4', - 'title': title, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'age_limit': age_limit, 'formats': formats, - } + }) From d4bb825b83a87813f54d007febd79d2f3dcee7b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 9 Oct 2019 11:07:46 +0100 Subject: [PATCH 0074/1705] [globo] fix format extraction(closes #20319) --- youtube_dl/extractor/globo.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index fb8f7679b..b9c400a57 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -96,6 +96,8 @@ class GloboIE(InfoExtractor): video = self._download_json( 'http://api.globovideos.com/videos/%s/playlist' % video_id, video_id)['videos'][0] + if video.get('encrypted') is True: + raise ExtractorError('This video is DRM protected.', expected=True) title = video['title'] @@ -109,8 +111,8 @@ class GloboIE(InfoExtractor): security = self._download_json( 'http://security.video.globo.com/videos/%s/hash' % video_id, video_id, 'Downloading security hash for %s' % resource_id, query={ - 'player': 'flash', - 'version': '17.0.0.132', + 'player': 'desktop', + 'version': '5.19.1', 'resource_id': resource_id, }) @@ -122,19 +124,18 @@ class GloboIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, message), expected=True) continue - hash_code = security_hash[:2] - received_time = 
security_hash[2:12] - received_random = security_hash[12:22] - received_md5 = security_hash[22:] + assert security_hash[:2] in ('04', '14') + received_time = security_hash[3:13] + received_md5 = security_hash[24:] sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() + md5_data = (received_md5 + sign_time + padding + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 + signed_hash = security_hash[:23] + sign_time + padding + signed_md5 - signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') + signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', From 1907f06e7b0689840b75810e5ad2683581f83924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Oct 2019 00:11:41 +0700 Subject: [PATCH 0075/1705] [kaltura] Fix embed info strip (refs #22658) --- youtube_dl/extractor/kaltura.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 0a733424c..1c486c038 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -151,7 +151,8 @@ class KalturaIE(InfoExtractor): if mobj: embed_info = mobj.groupdict() for k, v in embed_info.items(): - embed_info[k] = v.strip() + if v: + embed_info[k] = v.strip() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_url = re.search( From 07b50f616e407c8b7b2c183298acbb58e2ddf09b Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Oct 2019 00:24:03 +0700 Subject: [PATCH 0076/1705] [kaltura] Fix service URL extraction (closes #22658) --- youtube_dl/extractor/kaltura.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 1c486c038..2d38b758b 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -155,11 +155,11 @@ class KalturaIE(InfoExtractor): embed_info[k] = v.strip() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) - service_url = re.search( - r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + service_mobj = re.search( + r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), webpage) - if service_url: - url = smuggle_url(url, {'service_url': service_url.group(1)}) + if service_mobj: + url = smuggle_url(url, {'service_url': service_mobj.group('id')}) return url def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): From 2765c47a8c4e7154fa0a9be0bb63f3bcba592b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 10 Oct 2019 03:40:01 +0700 Subject: [PATCH 0077/1705] [promptfile] Remove extractor (closes #6239) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/promptfile.py | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/promptfile.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d3e433c3..f393683da 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -893,7 +893,6 @@ from .puhutv import ( PuhuTVSerieIE, ) from .presstv import PressTVIE -from .promptfile import PromptFileIE from .prosiebensat1 import 
ProSiebenSat1IE from .puls4 import Puls4IE from .pyvideo import PyvideoIE diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py deleted file mode 100644 index 23ac93d7e..000000000 --- a/youtube_dl/extractor/promptfile.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - urlencode_postdata, -) - - -class PromptFileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)' - _TEST = { - 'url': 'http://www.promptfile.com/l/86D1CE8462-576CAAE416', - 'md5': '5a7e285a26e0d66d9a263fae91bc92ce', - 'info_dict': { - 'id': '86D1CE8462-576CAAE416', - 'ext': 'mp4', - 'title': 'oceans.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if re.search(r'<div.+id="not_found_msg".+>(?!We are).+</div>[^-]', webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - chash = self._search_regex( - r'val\("([^"]*)"\s*\+\s*\$\("#chash"\)', webpage, 'chash') - fields = self._hidden_inputs(webpage) - keys = list(fields.keys()) - chash_key = keys[0] if len(keys) == 1 else next( - key for key in keys if key.startswith('cha')) - fields[chash_key] = chash + fields[chash_key] - - webpage = self._download_webpage( - url, video_id, 'Downloading video page', - data=urlencode_postdata(fields), - headers={'Content-type': 'application/x-www-form-urlencoded'}) - - video_url = self._search_regex( - (r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*Download File', - r'<a[^>]+href=(["\'])(?P<url>https?://(?:www\.)?promptfile\.com/file/(?:(?!\1).)+)\1'), - webpage, 'video url', group='url') - title = self._html_search_regex( - r'<span.+title="([^"]+)">', webpage, 'title') - thumbnail = self._html_search_regex( - r'<div 
id="player_overlay">.*button>.*?<img src="([^"]+)"', - webpage, 'thumbnail', fatal=False, flags=re.DOTALL) - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': determine_ext(title), - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } From c317b6163b294f4cdc2d1dff96e1a63da1bae910 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 10 Oct 2019 00:01:37 +0100 Subject: [PATCH 0078/1705] [vessel] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 6 -- youtube_dl/extractor/vessel.py | 157 ----------------------------- 3 files changed, 164 deletions(-) delete mode 100644 youtube_dl/extractor/vessel.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f393683da..7a1e0dad6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1282,7 +1282,6 @@ from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE -from .vessel import VesselIE from .vesti import VestiIE from .vevo import ( VevoIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d1725d98b..ec43c5ae4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -77,7 +77,6 @@ from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE -from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE @@ -2491,11 +2490,6 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - # Look for Vessel embeds - vessel_urls = VesselIE._extract_urls(webpage) - if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, 
ie=VesselIE.ie_key()) - # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py deleted file mode 100644 index 31eee0ba7..000000000 --- a/youtube_dl/extractor/vessel.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_iso8601, - sanitized_Request, -) - - -class VesselIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z-_]+)' - _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' - _LOGIN_URL = 'https://www.vessel.com/api/account/login' - _NETRC_MACHINE = 'vessel' - _TESTS = [{ - 'url': 'https://www.vessel.com/videos/HDN7G5UMs', - 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', - 'info_dict': { - 'id': 'HDN7G5UMs', - 'ext': 'mp4', - 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20150317', - 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', - 'timestamp': int, - }, - }, { - 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', - 'only_matching': True, - }, { - 'url': 'https://www.vessel.com/videos/F01_dsLj1', - 'only_matching': True, - }, { - 'url': 'https://www.vessel.com/videos/RRX-sir-J', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z-_]+.*?)\1', - webpage)] - - @staticmethod - def make_json_request(url, data): - payload = json.dumps(data).encode('utf-8') - req = sanitized_Request(url, payload) - req.add_header('Content-Type', 'application/json; 
charset=utf-8') - return req - - @staticmethod - def find_assets(data, asset_type, asset_id=None): - for asset in data.get('assets', []): - if not asset.get('type') == asset_type: - continue - elif asset_id is not None and not asset.get('id') == asset_id: - continue - else: - yield asset - - def _check_access_rights(self, data): - access_info = data.get('__view', {}) - if not access_info.get('allow_access', True): - err_code = access_info.get('error_code') or '' - if err_code == 'ITEM_PAID_ONLY': - raise ExtractorError( - 'This video requires subscription.', expected=True) - else: - raise ExtractorError( - 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - self.report_login() - data = { - 'client_id': 'web', - 'type': 'password', - 'user_key': username, - 'password': password, - } - login_request = VesselIE.make_json_request(self._LOGIN_URL, data) - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) - asset_id = data['model']['data']['id'] - - req = VesselIE.make_json_request( - self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) - data = self._download_json(req, video_id) - video_asset_id = data.get('main_video_asset') - - self._check_access_rights(data) - - try: - video_asset = next( - VesselIE.find_assets(data, 'video', asset_id=video_asset_id)) - except StopIteration: - raise ExtractorError('No video assets found') - - formats = [] - for f in video_asset.get('sources', []): - location = f.get('location') - if not location: - continue - name = f.get('name') - if name == 'hls-index': - 
formats.extend(self._extract_m3u8_formats( - location, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False)) - elif name == 'dash-index': - formats.extend(self._extract_mpd_formats( - location, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': name, - 'tbr': f.get('bitrate'), - 'height': f.get('height'), - 'width': f.get('width'), - 'url': location, - }) - self._sort_formats(formats) - - thumbnails = [] - for im_asset in VesselIE.find_assets(data, 'image'): - thumbnails.append({ - 'url': im_asset['location'], - 'width': im_asset.get('width', 0), - 'height': im_asset.get('height', 0), - }) - - return { - 'id': video_id, - 'title': data['title'], - 'formats': formats, - 'thumbnails': thumbnails, - 'description': data.get('short_description'), - 'duration': data.get('duration'), - 'comment_count': data.get('comment_count'), - 'like_count': data.get('like_count'), - 'view_count': data.get('view_count'), - 'timestamp': parse_iso8601(data.get('released_at')), - } From 311ee457314359662c975cd29f2ee58ad068db49 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 14 Oct 2019 18:36:25 +0100 Subject: [PATCH 0079/1705] [nbc] switch to graphql api(closes #18581)(closes #22693)(closes #22701) --- youtube_dl/extractor/nbc.py | 39 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3282f84ee..10680b202 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( smuggle_url, - try_get, update_url_query, int_or_none, ) @@ -85,27 +84,41 @@ class NBCIE(AdobePassIE): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) response = self._download_json( - 'https://api.nbc.com/v3/videos', 
video_id, query={ - 'filter[permalink]': permalink, - 'fields[videos]': 'description,entitlement,episodeNumber,guid,keywords,seasonNumber,title,vChipRating', - 'fields[shows]': 'shortTitle', - 'include': 'show.shortTitle', + 'https://friendship.nbc.co/v2/graphql', video_id, query={ + 'query': '''{ + page(name: "%s", platform: web, type: VIDEO, userId: "0") { + data { + ... on VideoPageData { + description + episodeNumber + keywords + locked + mpxAccountId + mpxGuid + rating + seasonNumber + secondaryTitle + seriesShortTitle + } + } + } +}''' % permalink, }) - video_data = response['data'][0]['attributes'] + video_data = response['data']['page']['data'] query = { 'mbr': 'true', 'manifest': 'm3u', } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': + video_id = video_data['mpxGuid'] + title = video_data['secondaryTitle'] + if video_data.get('locked'): resource = self._get_mvpd_resource( 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) + video_data.get('rating')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'nbcentertainment', resource) theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + 'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id), query), {'force_smil_url': True}) return { '_type': 'url_transparent', @@ -117,7 +130,7 @@ class NBCIE(AdobePassIE): 'season_number': int_or_none(video_data.get('seasonNumber')), 'episode_number': int_or_none(video_data.get('episodeNumber')), 'episode': title, - 'series': try_get(response, lambda x: x['included'][0]['attributes']['shortTitle']), + 'series': video_data.get('seriesShortTitle'), 'ie_key': 'ThePlatform', } From a1ee23e98fe2ec80b8726829927fcae1267e76b1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 14 Oct 2019 18:37:35 +0100 Subject: [PATCH 0080/1705] [vimeo] fix 
VHX embed extraction --- youtube_dl/extractor/vimeo.py | 97 ++++------------------------------- 1 file changed, 9 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ddf375c6c..5dc38e243 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -23,7 +23,6 @@ from ..utils import ( NO_DEFAULT, OnDemandPagedList, parse_filesize, - qualities, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -211,6 +210,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_uploader_url = owner.get('url') return { + 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, @@ -730,7 +730,6 @@ class VimeoIE(VimeoBaseInfoExtractor): channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { - 'id': video_id, 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, @@ -1061,7 +1060,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): if source_format: info_dict['formats'].append(source_format) self._vimeo_sort_formats(info_dict['formats']) - info_dict['id'] = video_id return info_dict @@ -1115,94 +1113,17 @@ class VimeoLikesIE(VimeoChannelIE): return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id) -class VHXEmbedIE(InfoExtractor): +class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' - def _call_api(self, video_id, access_token, path='', query=None): - return self._download_json( - 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={ - 'Authorization': 'Bearer ' + access_token, - }, query=query) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - credentials = self._parse_json(self._search_regex( - 
r'(?s)credentials\s*:\s*({.+?}),', webpage, - 'config'), video_id, js_to_json) - access_token = credentials['access_token'] - - query = {} - for k, v in credentials.items(): - if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined': - if k == 'authUserToken': - query['auth_user_token'] = v - else: - query[k] = v - files = self._call_api(video_id, access_token, '/files', query) - - formats = [] - for f in files: - href = try_get(f, lambda x: x['_links']['source']['href']) - if not href: - continue - method = f.get('method') - if method == 'hls': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif method == 'dash': - formats.extend(self._extract_mpd_formats( - href, video_id, mpd_id='dash', fatal=False)) - else: - fmt = { - 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])), - 'format_id': 'http', - 'preference': 1, - 'url': href, - 'vcodec': f.get('codec'), - } - quality = f.get('quality') - if quality: - fmt.update({ - 'format_id': 'http-' + quality, - 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)), - }) - formats.append(fmt) - self._sort_formats(formats) - - video_data = self._call_api(video_id, access_token) - title = video_data.get('title') or video_data['name'] - - subtitles = {} - for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []: - lang = subtitle.get('srclang') or subtitle.get('label') - for _link in subtitle.get('_links', {}).values(): - href = _link.get('href') - if not href: - continue - subtitles.setdefault(lang, []).append({ - 'url': href, - }) - - q = qualities(['small', 'medium', 'large', 'source']) - thumbnails = [] - for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'preference': q(thumbnail_id), - }) - - return { - 'id': video_id, - 'title': title, - 'description': 
video_data.get('description'), - 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(video_data.get('created_at')), - 'view_count': int_or_none(video_data.get('plays_count')), - } + config_url = self._parse_json(self._search_regex( + r'window\.OTTData\s*=\s*({.+})', webpage, + 'ott data'), video_id, js_to_json)['config_url'] + config = self._download_json(config_url, video_id) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info From 7e05df71b7d8c0e1ea9beafff48275ef3c9e27d2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 15 Oct 2019 00:10:22 +0100 Subject: [PATCH 0081/1705] [nexx] handle result list(closes #22666) --- youtube_dl/extractor/nexx.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 82d526c22..f9aad83c4 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -295,13 +295,23 @@ class NexxIE(InfoExtractor): video = None + def find_video(result): + if isinstance(result, dict): + return result + elif isinstance(result, list): + vid = int(video_id) + for v in result: + if try_get(v, lambda x: x['general']['ID'], int) == vid: + return v + return None + response = self._download_json( 'https://arc.nexx.cloud/api/video/%s.json' % video_id, video_id, fatal=False) if response and isinstance(response, dict): result = response.get('result') - if result and isinstance(result, dict): - video = result + if result: + video = find_video(result) # not all videos work via arc, e.g. 
nexx:741:1269984 if not video: @@ -348,7 +358,7 @@ class NexxIE(InfoExtractor): request_token = hashlib.md5( ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() - video = self._call_api( + result = self._call_api( domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', 'addInteractionOptions': '1', @@ -363,6 +373,7 @@ class NexxIE(InfoExtractor): 'X-Request-CID': cid, 'X-Request-Token': request_token, }) + video = find_video(result) general = video['general'] title = general['title'] From 2af01c0293db53dc80c552df3986d0e088b65b76 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 15 Oct 2019 15:18:51 +0100 Subject: [PATCH 0082/1705] [bokecc] improve player params extraction(closes #22638) --- youtube_dl/extractor/bokecc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 86a7f4d7d..6017e8344 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -11,8 +11,8 @@ from ..utils import ExtractorError class BokeCCBaseIE(InfoExtractor): def _extract_bokecc_formats(self, webpage, video_id, format_id=None): player_params_str = self._html_search_regex( - r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', - webpage, 'player params') + r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', + webpage, 'player params', group='query') player_params = compat_parse_qs(player_params_str) @@ -36,9 +36,9 @@ class BokeCCIE(BokeCCBaseIE): _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' _TESTS = [{ - 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', 'info_dict': { - 'id': 
'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', 'ext': 'flv', 'title': 'BokeCC Video', }, From 30eb05cb41d95a73f7baff8da9ec1d6a50b08f50 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 15 Oct 2019 19:54:53 +0100 Subject: [PATCH 0083/1705] [globo] extract subtitles(closes #22713) --- youtube_dl/extractor/globo.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index b9c400a57..9ad1d95fb 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -102,10 +102,18 @@ class GloboIE(InfoExtractor): title = video['title'] formats = [] + subtitles = {} for resource in video['resources']: resource_id = resource.get('_id') resource_url = resource.get('url') - if not resource_id or not resource_url: + resource_type = resource.get('type') + if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'): + continue + + if resource_type == 'subtitle': + subtitles.setdefault(resource.get('language') or 'por', []).append({ + 'url': resource_url, + }) continue security = self._download_json( @@ -165,7 +173,8 @@ class GloboIE(InfoExtractor): 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } From 974311b5aa1a53564a00915b9228af30e2a5b40d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 15 Oct 2019 21:01:59 +0100 Subject: [PATCH 0084/1705] [vimeo] improve album videos id extraction(closes #22599) --- youtube_dl/extractor/vimeo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5dc38e243..9abd59d98 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -938,7 +938,7 @@ class VimeoAlbumIE(VimeoChannelIE): 
def _fetch_page(self, album_id, authorizaion, hashed_pass, page): api_page = page + 1 query = { - 'fields': 'link', + 'fields': 'link,uri', 'page': api_page, 'per_page': self._PAGE_SIZE, } @@ -953,7 +953,9 @@ class VimeoAlbumIE(VimeoChannelIE): link = video.get('link') if not link: continue - yield self.url_result(link, VimeoIE.ie_key(), VimeoIE._match_id(link)) + uri = video.get('uri') + video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None + yield self.url_result(link, VimeoIE.ie_key(), video_id) def _real_extract(self, url): album_id = self._match_id(url) From 173190f5e3946173daea0539cf0e749cb14acd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Oct 2019 03:25:13 +0700 Subject: [PATCH 0085/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ChangeLog b/ChangeLog index 80681a9ae..8a59398d9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,39 @@ +version <unreleased> + +Core +* [extractor/common] Make _is_valid_url more relaxed + +Extractors +* [vimeo] Improve album videos id extraction (#22599) ++ [globo] Extract subtitles (#22713) +* [bokecc] Improve player params extraction (#22638) +* [nexx] Handle result list (#22666) +* [vimeo] Fix VHX embed extraction +* [nbc] Switch to graphql API (#18581, #22693, #22701) +- [vessel] Remove extractor +- [promptfile] Remove extractor (#6239) +* [kaltura] Fix service URL extraction (#22658) +* [kaltura] Fix embed info strip (#22658) +* [globo] Fix format extraction (#20319) +* [redtube] Improve metadata extraction (#22492, #22615) +* [pornhub:uservideos:upload] Fix extraction (#22619) ++ [telequebec:squat] Add support for squat.telequebec.tv (#18503) +- [wimp] Remove extractor (#22088, #22091) ++ [gfycat] Extend URL regular expression (#22225) ++ [chaturbate] Extend URL regular expression (#22309) +* [peertube] Update instances (#22414) ++ 
[telequebec] Add support for coucou.telequebec.tv (#22482) ++ [xvideos] Extend URL regular expression (#22471) +- [youtube] Remove support for invidious.enkirton.net (#22543) ++ [openload] Add support for oload.monster (#22592) +* [nrktv:seriebase] Fix extraction (#22596) ++ [youtube] Add support for yt.lelux.fi (#22597) +* [orf:tvthek] Make manifest requests non fatal (#22578) +* [teachable] Skip login when already logged in (#22572) +* [viewlift] Improve extraction (#22545) +* [nonktube] Fix extraction (#22544) + + version 2019.09.28 Core From 7815d6b74373feb90d969b5fcde7df11702fa5d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Oct 2019 03:26:47 +0700 Subject: [PATCH 0086/1705] release 2019.10.16 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 +--- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2fea0120e..5cd9f0dc0 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.09.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. 
Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.28 + [debug] youtube-dl version 2019.10.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 6116acc79..6cc34796a 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. 
Run `youtube-dl --version` and ensure your version is 2019.09.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 79d1a7f3c..0b7911e79 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.09.28. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 9bda3d440..a6f417d38 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.09.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.09.28 + [debug] youtube-dl version 2019.10.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 581344917..3fe753b62 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.09.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.09.28** +- [ ] I've verified that I'm running youtube-dl version **2019.10.16** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 8a59398d9..dc5c32a1f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.10.16 Core * [extractor/common] Make _is_valid_url more relaxed diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 35275278b..0cbad28ea 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -694,7 +694,6 @@ - **PornoXO** - **PornTube** - **PressTV** - - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **puhutv** - **puhutv:serie** @@ -884,6 +883,7 @@ - **TeleQuebec** - **TeleQuebecEmission** - **TeleQuebecLive** + - **TeleQuebecSquat** - **TeleTask** - **Telewebion** - **TennisTV** @@ -991,7 +991,6 @@ - **VeeHD** - **Veoh** - **verystream** - - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** @@ -1090,7 +1089,6 @@ - **Weibo** - **WeiboMobile** - **WeiqiTV**: WQTV - - **Wimp** - **Wistia** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c3eafb068..53889b7cb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.09.28' +__version__ = '2019.10.16' From 6d394a66f54216cc2b0b68fadd958eaf455c2778 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 12:03:46 +0100 Subject: [PATCH 0087/1705] [atresplayer] fix extraction(closes #16277)(closes #16716) --- youtube_dl/extractor/atresplayer.py | 213 
+++++++++------------------- 1 file changed, 64 insertions(+), 149 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index ae1c09427..b96218f6c 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -1,202 +1,117 @@ from __future__ import unicode_literals -import time -import hmac -import hashlib import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - float_or_none, int_or_none, - sanitized_Request, urlencode_postdata, - xpath_text, ) class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' _TESTS = [ { - 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', - 'md5': 'efd56753cda1bb64df52a3074f62e38a', + 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', 'info_dict': { - 'id': 'capitulo-10-especial-solidario-nochebuena', + 'id': '5d4aa2c57ed1a88fc715a615', 'ext': 'mp4', - 'title': 'Especial Solidario de Nochebuena', - 'description': 'md5:e2d52ff12214fa937107d21064075bf1', - 'duration': 5527.6, - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Capítulo 7: Asuntos pendientes', + 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', + 'duration': 3413, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'This video is only available for registered users' }, { - 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', - 'md5': '6e52cbb513c405e403dbacb7aacf8747', - 'info_dict': { - 
'id': 'capitulo-112-david-bustamante', - 'ext': 'flv', - 'title': 'David Bustamante', - 'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', - 'duration': 1439.0, - 'thumbnail': r're:^https?://.*\.jpg$', - }, + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, }, { - 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', 'only_matching': True, }, ] - - _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J' - _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)' - _TIMESTAMP_SHIFT = 30000 - - _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json' - _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json' - _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s' - _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s' - - _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' - - _ERRORS = { - 'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', - 'DELETED': 'This video has expired and is no longer available for online streaming.', - 'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', - # 'PREMIUM': 'PREMIUM', - } + _API_BASE = 'https://api.atresplayer.com/' def _real_initialize(self): self._login() + def _handle_error(self, e, code): + if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: + error = self._parse_json(e.cause.read(), None) + if error.get('error') == 'required_registered': + self.raise_login_required() + raise ExtractorError(error['error_description'], 
expected=True) + raise + def _login(self): username, password = self._get_login_info() if username is None: return - login_form = { - 'j_username': username, - 'j_password': password, - } + self._request_webpage( + self._API_BASE + 'login', None, 'Downloading login page') - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in') + try: + target_url = self._download_json( + 'https://account.atresmedia.com/api/login', None, + 'Logging in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=urlencode_postdata({ + 'username': username, + 'password': password, + }))['targetUrl'] + except ExtractorError as e: + self._handle_error(e, 400) - error = self._html_search_regex( - r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>', - response, 'error', default=None) - if error: - raise ExtractorError( - 'Unable to login: %s' % error, expected=True) + self._request_webpage(target_url, None, 'Following Target URL') def _real_extract(self, url): - video_id = self._match_id(url) + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) + try: + episode = self._download_json( + self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + except ExtractorError as e: + self._handle_error(e, 403) - episode_id = self._search_regex( - r'episode="([^"]+)"', webpage, 'episode id') - - request = sanitized_Request( - self._PLAYER_URL_TEMPLATE % episode_id, - headers={'User-Agent': self._USER_AGENT}) - player = self._download_json(request, episode_id, 'Downloading player JSON') - - episode_type = player.get('typeOfEpisode') - error_message = self._ERRORS.get(episode_type) - if error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + title = episode['titulo'] formats = [] - 
video_url = player.get('urlVideo') - if video_url: - format_info = { - 'url': video_url, - 'format_id': 'http', - } - mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) - if mobj: - format_info.update({ - 'width': int_or_none(mobj.group('width')), - 'height': int_or_none(mobj.group('height')), - 'tbr': int_or_none(mobj.group('bitrate')), - }) - formats.append(format_info) - - timestamp = int_or_none(self._download_webpage( - self._TIME_API_URL, - video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) - timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) - token = hmac.new( - self._MAGIC.encode('ascii'), - (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5 - ).hexdigest() - - request = sanitized_Request( - self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), - headers={'User-Agent': self._USER_AGENT}) - - fmt_json = self._download_json( - request, video_id, 'Downloading windows video JSON') - - result = fmt_json.get('resultDes') - if result.lower() != 'ok': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, result), expected=True) - - for format_id, video_url in fmt_json['resultObject'].items(): - if format_id == 'token' or not video_url.startswith('http'): + for source in episode.get('sources', []): + src = source.get('src') + if not src: continue - if 'geodeswowsmpra3player' in video_url: - # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) - # this videos are protected by DRM, the f4m downloader doesn't support them - continue - video_url_hd = video_url.replace('free_es', 'es') - formats.extend(self._extract_f4m_formats( - video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds', - fatal=False)) - formats.extend(self._extract_mpd_formats( - video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash', - fatal=False)) + src_type = source.get('type') + if src_type 
== 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif src_type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) - path_data = player.get('pathData') - - episode = self._download_xml( - self._EPISODE_URL_TEMPLATE % path_data, video_id, - 'Downloading episode XML') - - duration = float_or_none(xpath_text( - episode, './media/asset/info/technical/contentDuration', 'duration')) - - art = episode.find('./media/asset/info/art') - title = xpath_text(art, './name', 'title') - description = xpath_text(art, './description', 'description') - thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') - - subtitles = {} - subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle_url: - subtitles['es'] = [{ - 'ext': 'srt', - 'url': subtitle_url, - }] + heartbeat = episode.get('heartbeat') or {} + omniture = episode.get('omniture') or {} + get_meta = lambda x: heartbeat.get(x) or omniture.get(x) return { + 'display_id': display_id, 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, + 'description': episode.get('descripcion'), + 'thumbnail': episode.get('imgPoster'), + 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'subtitles': subtitles, + 'channel': get_meta('channel'), + 'season': get_meta('season'), + 'episode_number': int_or_none(get_meta('episodeNumber')), } From e29e96a9f5bc390789d176d509f592e208aa30d8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 15:06:48 +0100 Subject: [PATCH 0088/1705] [dumpert] fix extraction(closes #22428)(closes #22564) --- youtube_dl/extractor/dumpert.py | 83 +++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git 
a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index be2e3d378..d9d9afdec 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -1,20 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_b64decode from ..utils import ( + int_or_none, qualities, - sanitized_Request, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?P<protocol>https?)://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)' + _VALID_URL = r'(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P<id>[0-9]+[/_][0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', + 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', 'info_dict': { 'id': '6646981/951bc60f', @@ -24,46 +21,60 @@ class DumpertIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', } }, { - 'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', + 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', + 'only_matching': True, + }, { + 'url': 'http://legacy.dumpert.nl/mediabase/6646981/951bc60f', + 'only_matching': True, + }, { + 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - protocol = mobj.group('protocol') - - url = '%s://www.dumpert.nl/mediabase/%s' % (protocol, video_id) - req = sanitized_Request(url) - req.add_header('Cookie', 'nsfw=1; cpc=10') - webpage = self._download_webpage(req, video_id) - - files_base64 = self._search_regex( - r'data-files="([^"]+)"', webpage, 'data files') - - files = self._parse_json( - compat_b64decode(files_base64).decode('utf-8'), - video_id) + video_id = self._match_id(url).replace('_', '/') + item = self._download_json( + 'http://api-live.dumpert.nl/mobile_api/json/info/' + video_id.replace('/', '_'), 
+ video_id)['items'][0] + title = item['title'] + media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') quality = qualities(['flv', 'mobile', 'tablet', '720p']) - - formats = [{ - 'url': video_url, - 'format_id': format_id, - 'quality': quality(format_id), - } for format_id, video_url in files.items() if format_id != 'still'] + formats = [] + for variant in media.get('variants', []): + uri = variant.get('uri') + if not uri: + continue + version = variant.get('version') + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': quality(version), + }) self._sort_formats(formats) - title = self._html_search_meta( - 'title', webpage) or self._og_search_title(webpage) - description = self._html_search_meta( - 'description', webpage) or self._og_search_description(webpage) - thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + thumbnails = [] + stills = item.get('stills') or {} + for t in ('thumb', 'still'): + for s in ('', '-medium', '-large'): + still_id = t + s + still_url = stills.get(still_id) + if not still_url: + continue + thumbnails.append({ + 'id': still_id, + 'url': still_url, + }) + + stats = item.get('stats') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats + 'description': item.get('description'), + 'thumbnails': thumbnails, + 'formats': formats, + 'duration': int_or_none(media.get('duration')), + 'like_count': int_or_none(stats.get('kudos_total')), + 'view_count': int_or_none(stats.get('views_total')), } From 2b115b9460502944d6088cf42810c440495128a3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 15:41:58 +0100 Subject: [PATCH 0089/1705] [servingsys] Remove extractor(closes #22639) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/servingsys.py | 72 ------------------------------ 2 files changed, 73 deletions(-) delete mode 100644 youtube_dl/extractor/servingsys.py diff --git 
a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7a1e0dad6..53d527440 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -995,7 +995,6 @@ from .scrippsnetworks import ScrippsNetworksWatchIE from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE -from .servingsys import ServingSysIE from .servus import ServusIE from .sevenplus import SevenPlusIE from .sexu import SexuIE diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py deleted file mode 100644 index c013d678f..000000000 --- a/youtube_dl/extractor/servingsys.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, -) - - -class ServingSysIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)' - - _TEST = { - 'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?', - 'info_dict': { - 'id': '5349193', - 'title': 'AdAPPter_Hyundai_demo', - }, - 'playlist': [{ - 'md5': 'baed851342df6846eb8677a60a011a0f', - 'info_dict': { - 'id': '29955898', - 'ext': 'flv', - 'title': 'AdAPPter_Hyundai_demo (1)', - 'duration': 74, - 'tbr': 1378, - 'width': 640, - 'height': 400, - }, - }, { - 'md5': '979b4da2655c4bc2d81aeb915a8c5014', - 'info_dict': { - 'id': '29907998', - 'ext': 'flv', - 'title': 'AdAPPter_Hyundai_demo (2)', - 'duration': 34, - 'width': 854, - 'height': 480, - 'tbr': 516, - }, - }], - 'params': { - 'playlistend': 2, - }, - '_skip': 'Blocked in the US [sic]', - } - - def _real_extract(self, url): - pl_id = self._match_id(url) - vast_doc = self._download_xml(url, pl_id) - - title = vast_doc.find('.//AdTitle').text - media = vast_doc.find('.//MediaFile').text - info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL') - - doc = 
self._download_xml(info_url, pl_id, 'Downloading video info') - entries = [{ - '_type': 'video', - 'id': a.attrib['id'], - 'title': '%s (%s)' % (title, a.attrib['assetID']), - 'url': a.attrib['URL'], - 'duration': int_or_none(a.attrib.get('length')), - 'tbr': int_or_none(a.attrib.get('bitrate')), - 'height': int_or_none(a.attrib.get('height')), - 'width': int_or_none(a.attrib.get('width')), - } for a in doc.findall('.//AdditionalAssets/asset')] - - return { - '_type': 'playlist', - 'id': pl_id, - 'title': title, - 'entries': entries, - } From d07866f13efac39bf3f0b331870a15e0f5e98057 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 15:45:45 +0100 Subject: [PATCH 0090/1705] [mit] Remove support for video.mit.edu(closes #22403) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/mit.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 53d527440..ea47b99f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -644,7 +644,7 @@ from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE -from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import ( MixcloudIE, diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 1aea78d11..e1506a745 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -65,30 +65,6 @@ class TechTVMITIE(InfoExtractor): } -class MITIE(TechTVMITIE): - IE_NAME = 'video.mit.edu' - _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' - - _TEST = { - 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', - 'md5': '7db01d5ccc1895fc5010e9c9e13648da', - 'info_dict': { - 'id': '21783', - 'ext': 'mp4', - 'title': 'The Government 
is Profiling You', - 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_title = mobj.group('title') - webpage = self._download_webpage(url, page_title) - embed_url = self._search_regex( - r'<iframe .*?src="(.+?)"', webpage, 'embed url') - return self.url_result(embed_url) - - class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' From bc48773ed4c068adfe67078714814035660e5ca4 Mon Sep 17 00:00:00 2001 From: MobiDotS <msaad615@gmail.com> Date: Wed, 16 Oct 2019 10:13:35 -0500 Subject: [PATCH 0091/1705] [twitch] update VOD URL matching (closes #22395) (#22727) --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0500e33a6..ca7676fe2 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -248,7 +248,7 @@ class TwitchVodIE(TwitchItemBaseIE): https?:// (?: (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| - player\.twitch\.tv/\?.*?\bvideo=v + player\.twitch\.tv/\?.*?\bvideo=v? 
) (?P<id>\d+) ''' @@ -306,6 +306,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'https://www.twitch.tv/northernlion/video/291940395', 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?video=480452374', + 'only_matching': True, }] def _real_extract(self, url): From 000115759485797be719c71716c1ac35f003ba6c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 16 Oct 2019 23:57:40 +0100 Subject: [PATCH 0092/1705] [atresplayer] Add coding cookie --- youtube_dl/extractor/atresplayer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index b96218f6c..c2cec9845 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re From 86f63633c8e7c62ce245d1352d4d381efb614466 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:20:16 +0100 Subject: [PATCH 0093/1705] [audioboom] improve metadata extraction --- youtube_dl/extractor/audioboom.py | 34 +++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 393f381c6..c51837b40 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -2,22 +2,25 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( + clean_html, + float_or_none, +) class AudioBoomIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', - 'md5': '63a8d73a055c6ed0f1e51921a10a5a76', + 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', + 'md5': '7b00192e593ff227e6a315486979a42d', 'info_dict': { - 'id': '4279833', + 'id': '7398103', 'ext': 
'mp3', - 'title': '3/09/2016 Czaban Hour 3', - 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', - 'duration': 2245.72, - 'uploader': 'SB Nation A.M.', - 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', + 'title': 'Asim Chaudhry', + 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'duration': 4000.99, + 'uploader': 'Sue Perkins: An hour or so with...', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', } }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', @@ -32,8 +35,8 @@ class AudioBoomIE(InfoExtractor): clip = None clip_store = self._parse_json( - self._search_regex( - r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, + self._html_search_regex( + r'data-new-clip-store=(["\'])(?P<json>{.+?})\1', webpage, 'clip store', default='{}', group='json'), video_id, fatal=False) if clip_store: @@ -47,14 +50,15 @@ class AudioBoomIE(InfoExtractor): audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( 'audio', webpage, 'audio url') - title = from_clip('title') or self._og_search_title(webpage) - description = from_clip('description') or self._og_search_description(webpage) + title = from_clip('title') or self._html_search_meta( + ['og:title', 'og:audio:title', 'audio_title'], webpage) + description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) duration = float_or_none(from_clip('duration') or self._html_search_meta( 'weibo:audio:duration', webpage)) - uploader = from_clip('author') or self._og_search_property( - 'audio:artist', webpage, 'uploader', fatal=False) + uploader = from_clip('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') uploader_url = from_clip('author_url') or self._html_search_meta( 'audioboo:channel', webpage, 'uploader url') From 
755541a4c8ac3dd4e8b9abd0c7df95182a1f3fd4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:21:44 +0100 Subject: [PATCH 0094/1705] [mangomolo] fix video format extraction and add support for player URLs --- youtube_dl/extractor/generic.py | 8 ++++++-- youtube_dl/extractor/mangomolo.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec43c5ae4..5ed952b29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2962,10 +2962,14 @@ class GenericIE(InfoExtractor): # Look for Mangomolo embeds mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ + r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// + (?: + admin\.mangomolo\.com/analytics/index\.php/customers/embed| + player\.mangomolo\.com/v1 + )/ (?: video\?.*?\bid=(?P<video_id>\d+)| - index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) + (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) ).+?)\1''', webpage) if mobj is not None: info = { diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py index 482175a34..acee370e9 100644 --- a/youtube_dl/extractor/mangomolo.py +++ b/youtube_dl/extractor/mangomolo.py @@ -10,18 +10,21 @@ from ..utils import int_or_none class MangomoloBaseIE(InfoExtractor): + _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + def _get_real_id(self, page_id): return page_id def _real_extract(self, url): page_id = self._get_real_id(self._match_id(url)) - webpage = self._download_webpage(url, page_id) + webpage = self._download_webpage( + 'https://player.mangomolo.com/v1/%s?%s' % (self._TYPE, url.split('?')[1]), page_id) hidden_inputs = self._hidden_inputs(webpage) m3u8_entry_protocol = 'm3u8' if 
self._IS_LIVE else 'm3u8_native' format_url = self._html_search_regex( [ - r'file\s*:\s*"(https?://[^"]+?/playlist\.m3u8)', + r'(?:file|src)\s*:\s*"(https?://[^"]+?/playlist\.m3u8)', r'<a[^>]+href="(rtsp://[^"]+)"' ], webpage, 'format url') formats = self._extract_wowza_formats( @@ -39,14 +42,16 @@ class MangomoloBaseIE(InfoExtractor): class MangomoloVideoIE(MangomoloBaseIE): - IE_NAME = 'mangomolo:video' - _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/video\?.*?\bid=(?P<id>\d+)' + _TYPE = 'video' + IE_NAME = 'mangomolo:' + _TYPE + _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)' _IS_LIVE = False class MangomoloLiveIE(MangomoloBaseIE): - IE_NAME = 'mangomolo:live' - _VALID_URL = r'https?://admin\.mangomolo\.com/analytics/index\.php/customers/embed/index\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' + _TYPE = 'live' + IE_NAME = 'mangomolo:' + _TYPE + _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' _IS_LIVE = True def _get_real_id(self, page_id): From 59296bae7ec6d15b0df37dce34bdd96381c0e743 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 13:26:45 +0100 Subject: [PATCH 0095/1705] [xfileshare] clean extractor - update the list of domains - add support for aa-encoded video data - improve jwplayer format extraction - add support for Clappr sources closes #17032 closes #17906 closes #18237 closes #18239 --- youtube_dl/extractor/xfileshare.py | 192 +++++++++++++---------------- 1 file changed, 86 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index b38c7a7b3..48ef07ed1 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,37 +4,64 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( 
decode_packed_codes, determine_ext, ExtractorError, int_or_none, - NO_DEFAULT, + js_to_json, urlencode_postdata, ) +# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 +def aa_decode(aa_code): + symbol_table = [ + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('2', '((o^_^o) - (゚Θ゚))'), + ('4', '(゚ー゚)'), + ('3', '(o^_^o)'), + ('1', '(゚Θ゚)'), + ('0', '(c^_^o)'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aa_char in aa_code.split(delim): + for val, pat in symbol_table: + aa_char = aa_char.replace(pat, val) + aa_char = aa_char.replace('+ ', '') + m = re.match(r'^\d+', aa_char) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aa_char) + if m: + ret += compat_chr(int(m.group(1), 16)) + return ret + + class XFileShareIE(InfoExtractor): _SITES = ( - (r'daclips\.(?:in|com)', 'DaClips'), - (r'filehoot\.com', 'FileHoot'), - (r'gorillavid\.(?:in|com)', 'GorillaVid'), - (r'movpod\.in', 'MovPod'), - (r'powerwatch\.pw', 'PowerWatch'), - (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'clipwatching\.com', 'ClipWatching'), + (r'gounlimited\.to', 'GoUnlimited'), + (r'govid\.me', 'GoVid'), + (r'holavid\.com', 'HolaVid'), + (r'streamty\.com', 'Streamty'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.(?:me|se)', 'Vidto'), - (r'streamin\.to', 'Streamin.To'), - (r'xvidstage\.com', 'XVIDSTAGE'), - (r'vidabc\.com', 'Vid ABC'), + (r'uqload\.com', 'Uqload'), (r'vidbom\.com', 'VidBom'), (r'vidlo\.us', 'vidlo'), - (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), - (r'fastvideo\.me', 'FastVideo.me'), + (r'vidlocker\.xyz', 'VidLocker'), + (r'vidshare\.tv', 'VidShare'), + (r'vup\.to', 'VUp'), + (r'xvideosharing\.com', 'XVideoSharing'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site 
for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( @@ -43,82 +70,14 @@ class XFileShareIE(InfoExtractor): ) _TESTS = [{ - 'url': 'http://gorillavid.in/06y9juieqpmi', - 'md5': '5ae4a3580620380619678ee4875893ba', + 'url': 'http://xvideosharing.com/fq65f94nd2ve', + 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', 'info_dict': { - 'id': '06y9juieqpmi', + 'id': 'fq65f94nd2ve', 'ext': 'mp4', - 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', + 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, - }, { - 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', - 'only_matching': True, - }, { - 'url': 'http://daclips.in/3rso4kdn6f9m', - 'md5': '1ad8fd39bb976eeb66004d3a4895f106', - 'info_dict': { - 'id': '3rso4kdn6f9m', - 'ext': 'mp4', - 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', - 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://movpod.in/0wguyyxi1yca', - 'only_matching': True, - }, { - 'url': 'http://filehoot.com/3ivfabn7573c.html', - 'info_dict': { - 'id': '3ivfabn7573c', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': r're:http://.*\.jpg', - }, - 'skip': 'Video removed', - }, { - 'url': 'http://vidto.me/ku5glz52nqe1.html', - 'info_dict': { - 'id': 'ku5glz52nqe1', - 'ext': 'mp4', - 'title': 'test' - } - }, { - 'url': 'http://powerwatch.pw/duecjibvicbu', - 'info_dict': { - 'id': 'duecjibvicbu', - 'ext': 'mp4', - 'title': 'Big Buck Bunny trailer', - }, - }, { - 'url': 'http://xvidstage.com/e0qcnl03co6z', - 'info_dict': { - 'id': 'e0qcnl03co6z', - 'ext': 'mp4', - 'title': 'Chucky Prank 2015.mp4', - }, - }, { - # removed by administrator - 'url': 'http://xvidstage.com/amfy7atlkx25', - 'only_matching': True, - }, { - 'url': 'http://vidabc.com/i8ybqscrphfv', - 'info_dict': { - 'id': 'i8ybqscrphfv', - 'ext': 'mp4', - 'title': 're:Beauty and the Beast 2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 
'http://www.rapidvideo.cool/b667kprndr8w', - 'only_matching': True, - }, { - 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True, - }, { - 'url': 'http://vidto.se/1tx1pf6t12cg.html', - 'only_matching': True, }] @staticmethod @@ -131,10 +90,9 @@ class XFileShareIE(InfoExtractor): webpage)] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - url = 'http://%s/%s' % (mobj.group('host'), video_id) + url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): @@ -142,7 +100,7 @@ class XFileShareIE(InfoExtractor): fields = self._hidden_inputs(webpage) - if fields['op'] == 'download1': + if fields.get('op') == 'download1': countdown = int_or_none(self._search_regex( r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', webpage, 'countdown', default=None)) @@ -160,13 +118,37 @@ class XFileShareIE(InfoExtractor): (r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+) ', + r'>Watch (.+)[ <]', r'<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<'), # streamin.to + r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to + r'title\s*:\s*"([^"]+)"'), # govid.me webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_formats(default=NO_DEFAULT): + for regex, func in ( + (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + + formats = [] + + 
jwplayer_data = self._search_regex( + [ + r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', + r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', + ], webpage, + 'jwplayer data', default=None) + if jwplayer_data: + jwplayer_data = self._parse_json( + jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + if jwplayer_data: + formats = self._parse_jwplayer_data( + jwplayer_data, video_id, False, + m3u8_id='hls', mpd_id='dash')['formats'] + + if not formats: urls = [] for regex in ( r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', @@ -177,6 +159,12 @@ class XFileShareIE(InfoExtractor): video_url = mobj.group('url') if video_url not in urls: urls.append(video_url) + + sources = self._search_regex( + r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) + if sources: + urls.extend(self._parse_json(sources, video_id)) + formats = [] for video_url in urls: if determine_ext(video_url) == 'm3u8': @@ -189,21 +177,13 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - if not formats and default is not NO_DEFAULT: - return default - self._sort_formats(formats) - return formats - - formats = extract_formats(default=None) - - if not formats: - webpage = decode_packed_codes(self._search_regex( - r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", - webpage, 'packed code')) - formats = extract_formats() + self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) + [ + r'<video[^>]+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ], webpage, 'thumbnail', default=None) return { 'id': video_id, From 34e3885bc9e3aecab104b96eabce03854ac8f7a2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 17 Oct 2019 15:55:44 +0100 Subject: [PATCH 0096/1705] [viewster->contv] remove viewster extractor and add support for contv.com --- youtube_dl/extractor/contv.py | 118 
++++++++++++++++ youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/viewster.py | 217 ----------------------------- 3 files changed, 119 insertions(+), 218 deletions(-) create mode 100644 youtube_dl/extractor/contv.py delete mode 100644 youtube_dl/extractor/viewster.py diff --git a/youtube_dl/extractor/contv.py b/youtube_dl/extractor/contv.py new file mode 100644 index 000000000..84b462d40 --- /dev/null +++ b/youtube_dl/extractor/contv.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) + + +class CONtvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?contv\.com/details-movie/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.contv.com/details-movie/CEG10022949/days-of-thrills-&-laughter', + 'info_dict': { + 'id': 'CEG10022949', + 'ext': 'mp4', + 'title': 'Days Of Thrills & Laughter', + 'description': 'md5:5d6b3d0b1829bb93eb72898c734802eb', + 'upload_date': '20180703', + 'timestamp': 1530634789.61, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.contv.com/details-movie/CLIP-show_fotld_bts/fight-of-the-living-dead:-behind-the-scenes-bites', + 'info_dict': { + 'id': 'CLIP-show_fotld_bts', + 'title': 'Fight of the Living Dead: Behind the Scenes Bites', + }, + 'playlist_mincount': 7, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + details = self._download_json( + 'http://metax.contv.live.junctiontv.net/metax/2.5/details/' + video_id, + video_id, query={'device': 'web'}) + + if details.get('type') == 'episodic': + seasons = self._download_json( + 'http://metax.contv.live.junctiontv.net/metax/2.5/seriesfeed/json/' + video_id, + video_id) + entries = [] + for season in seasons: + for episode in season.get('episodes', []): + episode_id = episode.get('id') + if not episode_id: + continue + entries.append(self.url_result( + 'https://www.contv.com/details-movie/' + 
episode_id, + CONtvIE.ie_key(), episode_id)) + return self.playlist_result(entries, video_id, details.get('title')) + + m_details = details['details'] + title = details['title'] + + formats = [] + + media_hls_url = m_details.get('media_hls_url') + if media_hls_url: + formats.extend(self._extract_m3u8_formats( + media_hls_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + + media_mp4_url = m_details.get('media_mp4_url') + if media_mp4_url: + formats.append({ + 'format_id': 'http', + 'url': media_mp4_url, + }) + + self._sort_formats(formats) + + subtitles = {} + captions = m_details.get('captions') or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({ + 'url': caption_url + }) + + thumbnails = [] + for image in m_details.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + description = None + for p in ('large_', 'medium_', 'small_', ''): + d = m_details.get(p + 'description') + if d: + description = d + break + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': description, + 'timestamp': float_or_none(details.get('metax_added_on'), 1000), + 'subtitles': subtitles, + 'duration': float_or_none(m_details.get('duration'), 1000), + 'view_count': int_or_none(details.get('num_watched')), + 'like_count': int_or_none(details.get('num_fav')), + 'categories': details.get('category'), + 'tags': details.get('tags'), + 'season_number': int_or_none(details.get('season')), + 'episode_number': int_or_none(details.get('episode')), + 'release_year': int_or_none(details.get('pub_year')), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ea47b99f6..1db21529f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -231,6 +231,7 @@ from 
.commonprotocols import ( RtmpIE, ) from .condenast import CondeNastIE +from .contv import CONtvIE from .corus import CorusIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -1322,7 +1323,6 @@ from .viewlift import ( ViewLiftIE, ViewLiftEmbedIE, ) -from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( VimeoIE, diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py deleted file mode 100644 index 6e318479c..000000000 --- a/youtube_dl/extractor/viewster.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote, -) -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_iso8601, - sanitized_Request, - HEADRequest, - url_basename, -) - - -class ViewsterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)' - _TESTS = [{ - # movie, Type=Movie - 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', - 'md5': 'e642d1b27fcf3a4ffa79f194f5adde36', - 'info_dict': { - 'id': '1140-11855-000', - 'ext': 'mp4', - 'title': 'The listening Project', - 'description': 'md5:bac720244afd1a8ea279864e67baa071', - 'timestamp': 1214870400, - 'upload_date': '20080701', - 'duration': 4680, - }, - }, { - # series episode, Type=Episode - 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', - 'md5': '9243079a8531809efe1b089db102c069', - 'info_dict': { - 'id': '1284-19427-001', - 'ext': 'mp4', - 'title': 'The World and a Wall', - 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3', - 'timestamp': 1428192000, - 'upload_date': '20150405', - 'duration': 1500, - }, - }, { - # serie, Type=Serie - 'url': 'http://www.viewster.com/serie/1303-19426-000/', - 'info_dict': { - 'id': '1303-19426-000', - 'title': 'Is It Wrong to Try to Pick up Girls in a 
Dungeon?', - 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11', - }, - 'playlist_count': 13, - }, { - # unfinished serie, no Type - 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/', - 'info_dict': { - 'id': '1284-19427-000', - 'title': 'Baby Steps—Season 2', - 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1', - }, - 'playlist_mincount': 16, - }, { - # geo restricted series - 'url': 'https://www.viewster.com/serie/1280-18794-002/', - 'only_matching': True, - }, { - # geo restricted video - 'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/', - 'only_matching': True, - }] - - _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): - request = sanitized_Request(url) - request.add_header('Accept', self._ACCEPT_HEADER) - request.add_header('Auth-token', self._AUTH_TOKEN) - return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) - - def _real_extract(self, url): - video_id = self._match_id(url) - # Get 'api_token' cookie - self._request_webpage( - HEADRequest('http://www.viewster.com/'), - video_id, headers=self.geo_verification_headers()) - cookies = self._get_cookies('http://www.viewster.com/') - self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) - - info = self._download_json( - 'https://public-api.viewster.com/search/%s' % video_id, - video_id, 'Downloading entry JSON') - - entry_id = info.get('Id') or info['id'] - - # unfinished serie has no Type - if info.get('Type') in ('Serie', None): - try: - episodes = self._download_json( - 'https://public-api.viewster.com/series/%s/episodes' % entry_id, - video_id, 'Downloading series JSON') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_geo_restricted() - else: - raise - entries = [ - self.url_result( - 
'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') - for episode in episodes] - title = (info.get('Title') or info['Synopsis']['Title']).strip() - description = info.get('Synopsis', {}).get('Detailed') - return self.playlist_result(entries, video_id, title, description) - - formats = [] - for language_set in info.get('LanguageSets', []): - manifest_url = None - m3u8_formats = [] - audio = language_set.get('Audio') or '' - subtitle = language_set.get('Subtitle') or '' - base_format_id = audio - if subtitle: - base_format_id += '-%s' % subtitle - - def concat(suffix, sep='-'): - return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix - - medias = self._download_json( - 'https://public-api.viewster.com/movies/%s/videos' % entry_id, - video_id, fatal=False, query={ - 'mediaTypes': ['application/f4m+xml', 'application/x-mpegURL', 'video/mp4'], - 'language': audio, - 'subtitle': subtitle, - }) - if not medias: - continue - for media in medias: - video_url = media.get('Uri') - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - manifest_url = video_url - video_url += '&' if '?' in video_url else '?' 
- video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=concat('hds'))) - elif ext == 'm3u8': - manifest_url = video_url - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=concat('hls'), - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if not qualities_basename: - continue - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if not qualities: - continue - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) - http_url_basename = url_basename(video_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) - - if not formats and not info.get('VODSettings'): - self.raise_geo_restricted() - - self._sort_formats(formats) - - synopsis = info.get('Synopsis') or {} - # Prefer title outside synopsis since it's less messy - title = (info.get('Title') or synopsis['Title']).strip() - description = synopsis.get('Detailed') or (info.get('Synopsis') or {}).get('Short') - duration = int_or_none(info.get('Duration')) - timestamp = parse_iso8601(info.get('ReleaseDate')) - - 
return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - } From 824fa51165d92ceee01589bf995ebbf009df328c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Oct 2019 04:03:53 +0700 Subject: [PATCH 0097/1705] [utils] Improve subtitles_filename (closes #22753) --- test/test_utils.py | 6 ++++++ youtube_dl/YoutubeDL.py | 2 +- youtube_dl/postprocessor/ffmpeg.py | 8 ++++---- youtube_dl/utils.py | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 659c6ece5..3920542bb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -74,6 +74,7 @@ from youtube_dl.utils import ( str_to_int, strip_jsonp, strip_or_none, + subtitles_filename, timeconvert, unescapeHTML, unified_strdate, @@ -261,6 +262,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + def test_subtitles_filename(self): + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.ext', 'en', 'vtt', 'ext'), 'abc.en.vtt') + self.assertEqual(subtitles_filename('abc.unexpected_ext', 'en', 'vtt', 'ext'), 'abc.unexpected_ext.en.vtt') + def test_remove_start(self): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c3d1407f9..f5cb46308 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1814,7 +1814,7 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - sub_filename = subtitles_filename(filename, sub_lang, sub_format) + sub_filename = subtitles_filename(filename, sub_lang, sub_format, 
info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 70416c25e..fd3f921a8 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -393,7 +393,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_ext = sub_info['ext'] if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) - sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': webm_vtt_warn = True @@ -606,9 +606,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue - old_file = subtitles_filename(filename, lang, ext) + old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) - new_file = subtitles_filename(filename, lang, new_ext) + new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( @@ -616,7 +616,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): 'which results in style information loss') dfxp_file = old_file - srt_file = subtitles_filename(filename, lang, 'srt') + srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext')) with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 798757241..53117ea90 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2906,8 +2906,8 @@ def determine_ext(url, default_ext='unknown_video'): return default_ext -def subtitles_filename(filename, sub_lang, sub_format): - return 
filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format +def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): + return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) def date_from_str(date_str): From 2297c0d7d977921dca865e6c9cbc7ee5282ba8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Oct 2019 23:56:36 +0700 Subject: [PATCH 0098/1705] [facebook] Bypass download rate limits (closes #21018) --- youtube_dl/extractor/facebook.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a3dcdca3e..a56f85c21 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -405,6 +405,11 @@ class FacebookIE(InfoExtractor): if not formats: raise ExtractorError('Cannot find video formats') + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. 
+ for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + self._sort_formats(formats) video_title = self._html_search_regex( From b4818e3c7a718428d3366c34da8e21e2f416f5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Oct 2019 00:02:22 +0700 Subject: [PATCH 0099/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index dc5c32a1f..045349b05 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version <unreleased> + +Core +* [utils] Improve subtitles_filename (#22753) + +Extractors +* [facebook] Bypass download rate limits (#21018) ++ [contv] Add support for contv.com +- [viewster] Remove extractor +* [xfileshare] Improve extractor (#17032, #17906, #18237, #18239) + * Update the list of domains + + Add support for aa-encoded video data + * Improve jwplayer format extraction + + Add support for Clappr sources +* [mangomolo] Fix video format extraction and add support for player URLs +* [audioboom] Improve metadata extraction +* [twitch] Update VOD URL matching (#22395, #22727) +- [mit] Remove support for video.mit.edu (#22403) +- [servingsys] Remove extractor (#22639) +* [dumpert] Fix extraction (#22428, #22564) +* [atresplayer] Fix extraction (#16277, #16716) + + version 2019.10.16 Core From 820215f0e34813089d559fed24a398d9e91810e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Oct 2019 00:09:02 +0700 Subject: [PATCH 0100/1705] release 2019.10.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++---- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 18 
deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 5cd9f0dc0..f1afe704c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.16 + [debug] youtube-dl version 2019.10.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 6cc34796a..a4dc9b005 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 0b7911e79..5bf86adce 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a6f417d38..7aa5534e5 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.16 + [debug] youtube-dl version 2019.10.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 3fe753b62..5d3645e3d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.16** +- [ ] I've verified that I'm running youtube-dl version **2019.10.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 045349b05..64233b03b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.10.22 Core * [utils] Improve subtitles_filename (#22753) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0cbad28ea..a1b0edeeb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -183,6 +183,7 @@ - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED + - **CONtv** - **Corus** - **Coub** - **Cracked** @@ -784,7 +785,6 @@ - **Seeker** - **SenateISVP** - **SendtoNews** - - **ServingSys** - **Servus** - **Sexu** - **SeznamZpravy** @@ -1005,7 +1005,6 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search - - **video.mit.edu** - **VideoDetective** - **videofy.me** - **videomore** @@ -1023,7 +1022,6 @@ - **vier:videos** - **ViewLift** - **ViewLiftEmbed** - - **Viewster** - **Viidea** - **viki** - **viki:channel** @@ -1097,7 +1095,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me + - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, 
VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 53889b7cb..39b355b9e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.16' +__version__ = '2019.10.22' From 0c2d10d225f61ac1fb534d8ed1788250401465b2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Oct 2019 17:49:50 +0100 Subject: [PATCH 0101/1705] [globo] handle alternative hash signing method --- youtube_dl/extractor/globo.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 9ad1d95fb..60d842d3a 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -132,18 +132,24 @@ class GloboIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, message), expected=True) continue - assert security_hash[:2] in ('04', '14') - received_time = security_hash[3:13] - received_md5 = security_hash[24:] - - sign_time = compat_str(int(received_time) + 86400) + hash_code = security_hash[:2] padding = '%010d' % random.randint(1, 10000000000) + if hash_code in ('04', '14'): + received_time = security_hash[3:13] + received_md5 = security_hash[24:] + hash_prefix = security_hash[:23] + elif hash_code in ('02', '12', '03', '13'): + received_time = security_hash[2:12] + received_md5 = security_hash[22:] + padding += '1' + hash_prefix = '05' + security_hash[:22] - md5_data = (received_md5 + sign_time + padding + '0xAC10FD').encode() + padded_sign_time = compat_str(int(received_time) + 86400) + padding + md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = security_hash[:23] + sign_time + padding + signed_md5 - + signed_hash = hash_prefix + 
padded_sign_time + signed_md5 signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '') + if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', From 07154c793065bca816793186590d8d6461e07478 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 22 Oct 2019 17:53:47 +0100 Subject: [PATCH 0102/1705] [facebook] extract subtitles(closes #22777) --- youtube_dl/extractor/ceskatelevize.py | 2 ++ youtube_dl/extractor/facebook.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 1ec58f7d8..7cb4efb74 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -147,6 +147,8 @@ class CeskaTelevizeIE(InfoExtractor): is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'drmOnly=true' in stream_url: + continue if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a56f85c21..c723726b7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -379,6 +379,7 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') + subtitles = {} formats = [] for f in video_data: format_id = f['stream_type'] @@ -402,6 +403,9 @@ class FacebookIE(InfoExtractor): if dash_manifest: formats.extend(self._parse_mpd_formats( compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + subtitles_src = f[0].get('subtitles_src') + if subtitles_src: + subtitles.setdefault('en', []).append({'url': subtitles_src}) if not formats: raise 
ExtractorError('Cannot find video formats') @@ -447,6 +451,7 @@ class FacebookIE(InfoExtractor): 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, + 'subtitles': subtitles, } return webpage, info_dict From 162bcc68dc73706699b559fffdd8bed3db6643b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 24 Oct 2019 12:53:33 +0100 Subject: [PATCH 0103/1705] [puhutv] improve extraction - fix subtitles extraction - transform HLS URLs to http URLs - improve metadata extraction --- youtube_dl/extractor/puhutv.py | 90 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index 5465e8ab7..fb704a3c4 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -25,21 +25,21 @@ class PuhuTVIE(InfoExtractor): _TESTS = [{ # film 'url': 'https://puhutv.com/sut-kardesler-izle', - 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', 'info_dict': { 'id': '5085', 'display_id': 'sut-kardesler', 'ext': 'mp4', 'title': 'Süt Kardeşler', - 'description': 'md5:405fd024df916ca16731114eb18e511a', + 'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 4832.44, 'creator': 'Arzu Film', - 'timestamp': 1469778212, - 'upload_date': '20160729', + 'timestamp': 1561062602, + 'upload_date': '20190620', 'release_year': 1976, 'view_count': int, - 'tags': ['Aile', 'Komedi', 'Klasikler'], + 'tags': list, }, }, { # episode, geo restricted, bypassable with --geo-verification-proxy @@ -64,9 +64,10 @@ class PuhuTVIE(InfoExtractor): display_id)['data'] video_id = compat_str(info['id']) - title = info.get('name') or info['title']['name'] + show = info.get('title') or {} + title = info.get('name') or show['name'] if info.get('display_name'): - title = '%s %s' % (title, info.get('display_name')) + title = '%s %s' % (title, info['display_name']) try: 
videos = self._download_json( @@ -78,17 +79,36 @@ class PuhuTVIE(InfoExtractor): self.raise_geo_restricted() raise + urls = [] formats = [] + + def add_http_from_hls(m3u8_f): + http_url = m3u8_f['url'].replace('/hls/', '/mp4/').replace('/chunklist.m3u8', '.mp4') + if http_url != m3u8_f['url']: + f = m3u8_f.copy() + f.update({ + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + 'url': http_url, + }) + formats.append(f) + for video in videos['data']['videos']: media_url = url_or_none(video.get('url')) - if not media_url: + if not media_url or media_url in urls: continue + urls.append(media_url) + playlist = video.get('is_playlist') - if video.get('stream_type') == 'hls' and playlist is True: - formats.extend(self._extract_m3u8_formats( + if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: + m3u8_formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + for m3u8_f in m3u8_formats: + formats.append(m3u8_f) + add_http_from_hls(m3u8_f) continue + quality = int_or_none(video.get('quality')) f = { 'url': media_url, @@ -96,34 +116,29 @@ class PuhuTVIE(InfoExtractor): 'height': quality } video_format = video.get('video_format') - if video_format == 'hls' and playlist is False: + is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False + if is_hls: format_id = 'hls' f['protocol'] = 'm3u8_native' elif video_format == 'mp4': format_id = 'http' - else: continue if quality: format_id += '-%sp' % quality f['format_id'] = format_id formats.append(f) + if is_hls: + add_http_from_hls(f) self._sort_formats(formats) - description = try_get( - info, lambda x: x['title']['description'], - compat_str) or info.get('description') - timestamp = unified_timestamp(info.get('created_at')) creator = try_get( - info, lambda x: x['title']['producer']['name'], compat_str) + 
show, lambda x: x['producer']['name'], compat_str) - duration = float_or_none( - try_get(info, lambda x: x['content']['duration_in_ms'], int), - scale=1000) - view_count = try_get(info, lambda x: x['content']['watch_count'], int) + content = info.get('content') or {} images = try_get( - info, lambda x: x['content']['images']['wide'], dict) or {} + content, lambda x: x['images']['wide'], dict) or {} thumbnails = [] for image_id, image_url in images.items(): if not isinstance(image_url, compat_str): @@ -137,14 +152,8 @@ class PuhuTVIE(InfoExtractor): }) thumbnails.append(t) - release_year = try_get(info, lambda x: x['title']['released_at'], int) - - season_number = int_or_none(info.get('season_number')) - season_id = str_or_none(info.get('season_id')) - episode_number = int_or_none(info.get('episode_number')) - tags = [] - for genre in try_get(info, lambda x: x['title']['genres'], list) or []: + for genre in show.get('genres') or []: if not isinstance(genre, dict): continue genre_name = genre.get('name') @@ -152,12 +161,11 @@ class PuhuTVIE(InfoExtractor): tags.append(genre_name) subtitles = {} - for subtitle in try_get( - info, lambda x: x['content']['subtitles'], list) or []: + for subtitle in content.get('subtitles') or []: if not isinstance(subtitle, dict): continue lang = subtitle.get('language') - sub_url = url_or_none(subtitle.get('url')) + sub_url = url_or_none(subtitle.get('url') or subtitle.get('file')) if not lang or not isinstance(lang, compat_str) or not sub_url: continue subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ @@ -168,15 +176,15 @@ class PuhuTVIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'season_id': season_id, - 'season_number': season_number, - 'episode_number': episode_number, - 'release_year': release_year, - 'timestamp': timestamp, + 'description': info.get('description') or show.get('description'), + 'season_id': str_or_none(info.get('season_id')), + 'season_number': 
int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'release_year': int_or_none(show.get('released_at')), + 'timestamp': unified_timestamp(info.get('created_at')), 'creator': creator, - 'view_count': view_count, - 'duration': duration, + 'view_count': int_or_none(content.get('watch_count')), + 'duration': float_or_none(content.get('duration_in_ms'), 1000), 'tags': tags, 'subtitles': subtitles, 'thumbnails': thumbnails, From 416c3ca7f53dab76b9e5ec46a0c0335698252c2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 25 Oct 2019 19:27:28 +0100 Subject: [PATCH 0104/1705] [odnoklassniki] add support for Schemeless embed extraction --- youtube_dl/extractor/generic.py | 7 ++++--- youtube_dl/extractor/odnoklassniki.py | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5ed952b29..f66cae0eb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -118,6 +118,7 @@ from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE +from .odnoklassniki import OdnoklassnikiIE class GenericIE(InfoExtractor): @@ -2627,9 +2628,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'VK') # Look for embedded Odnoklassniki player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Odnoklassniki') + odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) # Look for embedded ivi player mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 
114b93c07..7ed9fac55 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -121,6 +123,13 @@ class OdnoklassnikiIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): start_time = int_or_none(compat_parse_qs( compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) From 3c989818e7dc7706da069312bbdd040165a97517 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 25 Oct 2019 19:35:07 +0100 Subject: [PATCH 0105/1705] [vk] improve extraction - add support for Odnoklassniki embeds - update tests - extract more video from user lists(closes #4470) - fix wall post audio extraction(closes #18332) - improve error detection(closes #22568) --- youtube_dl/extractor/vk.py | 329 +++++++++++++++++++------------------ 1 file changed, 173 insertions(+), 156 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 8b6dc0e24..c289fcad3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -12,7 +12,6 @@ from ..utils import ( get_element_by_class, int_or_none, orderedSet, - remove_start, str_or_none, str_to_int, unescapeHTML, @@ -21,6 +20,7 @@ from ..utils import ( urlencode_postdata, ) from .dailymotion import DailymotionIE +from .odnoklassniki import OdnoklassnikiIE from .pladform import PladformIE from .vimeo import VimeoIE from .youtube import YoutubeIE @@ -60,6 +60,18 @@ class VKBaseIE(InfoExtractor): def _real_initialize(self): self._login() + def _download_payload(self, path, video_id, data, fatal=True): + data['al'] = 1 + code, payload = self._download_json( 
+ 'https://vk.com/%s.php' % path, video_id, + data=urlencode_postdata(data), fatal=fatal, + headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] + if code == '3': + self.raise_login_required() + elif code == '8': + raise ExtractorError(clean_html(payload[0][1:-1]), expected=True) + return payload + class VKIE(VKBaseIE): IE_NAME = 'vk' @@ -96,7 +108,6 @@ class VKIE(VKBaseIE): }, { 'url': 'http://vk.com/video205387401_165548505', - 'md5': '6c0aeb2e90396ba97035b9cbde548700', 'info_dict': { 'id': '205387401_165548505', 'ext': 'mp4', @@ -110,18 +121,18 @@ class VKIE(VKBaseIE): }, { 'note': 'Embedded video', - 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', - 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', + 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { - 'id': '32194266_162925554', + 'id': '-77521_162222515', 'ext': 'mp4', - 'uploader': 'Vladimir Gavrin', - 'title': 'Lin Dan', - 'duration': 101, - 'upload_date': '20120730', - 'view_count': int, + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', + 'title': 'ProtivoGunz - Хуёвая песня', + 'duration': 195, + 'upload_date': '20120212', + 'timestamp': 1329049880, + 'uploader_id': '-77521', }, - 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -138,18 +149,19 @@ class VKIE(VKBaseIE): 'upload_date': '20121218', 'view_count': int, }, - 'skip': 'Requires vk account credentials', + 'skip': 'Removed', }, { 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', - 'md5': '4d7a5ef8cf114dfa09577e57b2993202', 'info_dict': { 'id': '-43215063_168067957', 'ext': 'mp4', - 'uploader': 'Киномания - лучшее из мира кино', + 'uploader': 'Bro Mazter', 'title': ' ', 'duration': 7291, 'upload_date': '20140328', + 'uploader_id': '223413403', + 'timestamp': 1396018030, }, 'skip': 'Requires vk account credentials', }, @@ -165,7 +177,7 @@ 
class VKIE(VKBaseIE): 'upload_date': '20140626', 'view_count': int, }, - 'skip': 'Only works from Russia', + 'skip': 'Removed', }, { # video (removed?) only available with list id @@ -247,6 +259,9 @@ class VKIE(VKBaseIE): 'uploader_id': '-387766', 'timestamp': 1475137527, }, + 'params': { + 'skip_download': True, + }, }, { # live stream, hls and rtmp links, most likely already finished live @@ -288,80 +303,94 @@ class VKIE(VKBaseIE): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') + mv_data = {} if video_id: - info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id + data = { + 'act': 'show_inline', + 'video': video_id, + } # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: - info_url += '&list=%s' % list_id + data['list'] = list_id + + payload = self._download_payload('al_video', video_id, data) + info_page = payload[1] + opts = payload[-1] + mv_data = opts.get('mvData') or {} + player = opts.get('player') or {} else: - info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_page = self._download_webpage(info_url, video_id) + info_page = self._download_webpage( + 'http://vk.com/video_ext.php?' 
+ mobj.group('embed_query'), video_id) - error_message = self._html_search_regex( - [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', - r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], - info_page, 'error message', default=None) - if error_message: - raise ExtractorError(error_message, expected=True) + error_message = self._html_search_regex( + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) - if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): - raise ExtractorError( - 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', - expected=True) + if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): + raise ExtractorError( + 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', + expected=True) - ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' + ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' - ERRORS = { - r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': - ERROR_COPYRIGHT, + ERRORS = { + r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': + ERROR_COPYRIGHT, - r'>The video .*? was removed from public access by request of the copyright holder.<': - ERROR_COPYRIGHT, + r'>The video .*? 
was removed from public access by request of the copyright holder.<': + ERROR_COPYRIGHT, - r'<!>Please log in or <': - 'Video %s is only available for registered users, ' - 'use --username and --password options to provide account credentials.', + r'<!>Please log in or <': + 'Video %s is only available for registered users, ' + 'use --username and --password options to provide account credentials.', - r'<!>Unknown error': - 'Video %s does not exist.', + r'<!>Unknown error': + 'Video %s does not exist.', - r'<!>Видео временно недоступно': - 'Video %s is temporarily unavailable.', + r'<!>Видео временно недоступно': + 'Video %s is temporarily unavailable.', - r'<!>Access denied': - 'Access denied to video %s.', + r'<!>Access denied': + 'Access denied to video %s.', - r'<!>Видеозапись недоступна, так как её автор был заблокирован.': - 'Video %s is no longer available, because its author has been blocked.', + r'<!>Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', - r'<!>This video is no longer available, because its author has been blocked.': - 'Video %s is no longer available, because its author has been blocked.', + r'<!>This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', - r'<!>This video is no longer available, because it has been deleted.': - 'Video %s is no longer available, because it has been deleted.', + r'<!>This video is no longer available, because it has been deleted.': + 'Video %s is no longer available, because it has been deleted.', - r'<!>The video .+? is not available in your region.': - 'Video %s is not available in your region.', - } + r'<!>The video .+? 
is not available in your region.': + 'Video %s is not available in your region.', + } - for error_re, error_msg in ERRORS.items(): - if re.search(error_re, info_page): - raise ExtractorError(error_msg % video_id, expected=True) + for error_re, error_msg in ERRORS.items(): + if re.search(error_re, info_page): + raise ExtractorError(error_msg % video_id, expected=True) + + player = self._parse_json(self._search_regex( + r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', + info_page, 'player params'), video_id) youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + return self.url_result(youtube_url, YoutubeIE.ie_key()) vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: - return self.url_result(vimeo_url) + return self.url_result(vimeo_url, VimeoIE.ie_key()) pladform_url = PladformIE._extract_url(info_page) if pladform_url: - return self.url_result(pladform_url) + return self.url_result(pladform_url, PladformIE.ie_key()) m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) @@ -374,6 +403,10 @@ class VKIE(VKBaseIE): if dailymotion_urls: return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) @@ -383,38 +416,7 @@ class VKIE(VKBaseIE): opts_url = 'http:' + opts_url return self.url_result(opts_url) - # vars does not look to be served anymore since 24.10.2016 - data = self._parse_json( - self._search_regex( - r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'), - video_id, fatal=False) - - # <!json> is served instead - if not data: - data = self._parse_json( - self._search_regex( - 
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'], - info_page, 'json', default='{}'), - video_id) - if data: - data = data['player']['params'][0] - - if not data: - data = self._parse_json( - self._search_regex( - r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page, - 'player params', default='{}'), - video_id) - if data: - data = data['params'][0] - - # <!--{...} - if not data: - data = self._parse_json( - self._search_regex( - r'<!--\s*({.+})', info_page, 'payload'), - video_id)['payload'][-1][-1]['player']['params'][0] - + data = player['params'][0] title = unescapeHTML(data['md_title']) # 2 = live @@ -463,12 +465,12 @@ class VKIE(VKBaseIE): 'title': title, 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id')), - 'duration': data.get('duration'), + 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), + 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), 'timestamp': timestamp, 'view_count': view_count, - 'like_count': int_or_none(data.get('liked')), - 'dislike_count': int_or_none(data.get('nolikes')), + 'like_count': int_or_none(mv_data.get('likes')), + 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, } @@ -482,7 +484,6 @@ class VKUserVideosIE(VKBaseIE): 'url': 'http://vk.com/videos205387401', 'info_dict': { 'id': '205387401', - 'title': "Tom Cruise's Videos", }, 'playlist_mincount': 4, }, { @@ -498,22 +499,25 @@ class VKUserVideosIE(VKBaseIE): 'url': 'http://new.vk.com/videos205387401', 'only_matching': True, }] + _VIDEO = collections.namedtuple( + 'Video', ['owner_id', 'id', 'thumb', 'title', 'flags', 'duration', 'hash', 'moder_acts', 'owner', 'date', 'views', 'platform', 'blocked', 'music_video_meta']) def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + l = self._download_payload('al_video', page_id, { + 'act': 'load_videos_silent', + 'oid': page_id, + 
})[0]['']['list'] - entries = [ - self.url_result( - 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] + entries = [] + for video in l: + v = self._VIDEO._make(video) + video_id = '%d_%d' % (v.owner_id, v.id) + entries.append(self.url_result( + 'http://vk.com/video' + video_id, 'VK', video_id=video_id)) - title = unescapeHTML(self._search_regex( - r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', - webpage, 'title', default=page_id)) - - return self.playlist_result(entries, page_id, title) + return self.playlist_result(entries, page_id) class VKWallPostIE(VKBaseIE): @@ -523,15 +527,15 @@ class VKWallPostIE(VKBaseIE): # public page URL, audio playlist 'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'info_dict': { - 'id': '23538238_35', - 'title': 'Black Shadow - Wall post 23538238_35', + 'id': '-23538238_35', + 'title': 'Black Shadow - Wall post -23538238_35', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', }, 'playlist': [{ 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'info_dict': { 'id': '135220665_111806521', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Слепое Верование', 'duration': 370, 'uploader': 'Black Shadow', @@ -542,18 +546,16 @@ class VKWallPostIE(VKBaseIE): 'md5': '4cc7e804579122b17ea95af7834c9233', 'info_dict': { 'id': '135220665_111802303', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'duration': 423, 'uploader': 'Black Shadow', 'artist': 'Black Shadow', 'track': 'Война - Негасимое Бездны Пламя!', }, - 'params': { - 'skip_download': True, - }, }], 'params': { + 'skip_download': True, 'usenetrc': True, }, 'skip': 'Requires vk account credentials', @@ -562,7 +564,7 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://vk.com/wall85155021_6319', 'info_dict': { 'id': '85155021_6319', - 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + 'title': 'Сергей Горбунов - Wall post 85155021_6319', }, 
'playlist_count': 1, 'params': { @@ -578,58 +580,73 @@ class VKWallPostIE(VKBaseIE): 'url': 'https://m.vk.com/wall-23538238_35', 'only_matching': True, }] + _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/=' + _AUDIO = collections.namedtuple( + 'Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads', 'subtitle', 'main_artists', 'feat_artists', 'album', 'track_code', 'restriction', 'album_part', 'new_stats', 'access_key']) + + def _decode(self, enc): + dec = '' + e = n = 0 + for c in enc: + r = self._BASE64_CHARS.index(c) + cond = n % 4 + e = 64 * e + r if cond else r + n += 1 + if cond: + dec += chr(255 & e >> (-2 * n & 6)) + return dec + + def _unmask_url(self, mask_url, vk_id): + if 'audio_api_unavailable' in mask_url: + extra = mask_url.split('?extra=')[1].split('#') + func, base = self._decode(extra[1]).split(chr(11)) + assert (func == 'i') + mask_url = list(self._decode(extra[0])) + url_len = len(mask_url) + indexes = [None] * url_len + index = int(base) ^ vk_id + for n in range(url_len - 1, -1, -1): + index = (url_len * (n + 1) ^ index + n) % url_len + indexes[n] = index + for n in range(1, url_len): + c = mask_url[n] + index = indexes[url_len - 1 - n] + mask_url[n] = mask_url[index] + mask_url[index] = c + mask_url = ''.join(mask_url) + return mask_url def _real_extract(self, url): post_id = self._match_id(url) - wall_url = 'https://vk.com/wall%s' % post_id - - post_id = remove_start(post_id, '-') - - webpage = self._download_webpage(wall_url, post_id) - - error = self._html_search_regex( - r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)', - webpage, 'error', default=None) - if error: - raise ExtractorError('VK said: %s' % error, expected=True) + webpage = self._download_payload('wkview', post_id, { + 'act': 'show', + 'w': 'wall' + post_id, + })[1] description = 
clean_html(get_element_by_class('wall_post_text', webpage)) uploader = clean_html(get_element_by_class('author', webpage)) - thumbnail = self._og_search_thumbnail(webpage) entries = [] - audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) - if audio_ids: - al_audio = self._download_webpage( - 'https://vk.com/al_audio.php', post_id, - note='Downloading audio info', fatal=False, - data=urlencode_postdata({ - 'act': 'reload_audio', - 'al': '1', - 'ids': ','.join(audio_ids) - })) - if al_audio: - Audio = collections.namedtuple( - 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) - audios = self._parse_json( - self._search_regex( - r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'), - post_id, fatal=False, transform_source=unescapeHTML) - if isinstance(audios, list): - for audio in audios: - a = Audio._make(audio[:6]) - entries.append({ - 'id': '%s_%s' % (a.user_id, a.id), - 'url': a.url, - 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, - 'thumbnail': thumbnail, - 'duration': a.duration, - 'uploader': uploader, - 'artist': a.artist, - 'track': a.track, - }) + for audio in re.findall(r'data-audio="([^"]+)', webpage): + audio = self._parse_json(unescapeHTML(audio), post_id) + a = self._AUDIO._make(audio) + if not a.url: + continue + title = unescapeHTML(a.title) + entries.append({ + 'id': '%s_%s' % (a.owner_id, a.id), + 'url': self._unmask_url(a.url, a.ads['vk_id']), + 'title': '%s - %s' % (a.performer, title) if a.performer else title, + 'thumbnail': a.cover_url.split(',') if a.cover_url else None, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.performer, + 'track': title, + 'ext': 'mp4', + 'protocol': 'm3u8', + }) for video in re.finditer( r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): From 42cd0824b3975e6ce500d8cecd60e1fc077a758b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 00:06:05 +0100 Subject: [PATCH 0106/1705] [vk] remove 
assert statement --- youtube_dl/extractor/vk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index c289fcad3..4c8ca4f41 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -600,7 +600,6 @@ class VKWallPostIE(VKBaseIE): if 'audio_api_unavailable' in mask_url: extra = mask_url.split('?extra=')[1].split('#') func, base = self._decode(extra[1]).split(chr(11)) - assert (func == 'i') mask_url = list(self._decode(extra[0])) url_len = len(mask_url) indexes = [None] * url_len From 235dbb434bfa724718c37d8af0a61baf93b775be Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 14:57:42 +0100 Subject: [PATCH 0107/1705] [discoverynetworks] add support for dplay.co.uk --- youtube_dl/extractor/discoverynetworks.py | 63 +++++++---------------- 1 file changed, 19 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py index fba1ef221..607a54948 100644 --- a/youtube_dl/extractor/discoverynetworks.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -3,63 +3,38 @@ from __future__ import unicode_literals import re -from .brightcove import BrightcoveLegacyIE from .dplay import DPlayIE -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) -from ..utils import smuggle_url class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/ - (?: - .*\#(?P<id>\d+)| - (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)| - programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+) - )''' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' _TESTS = [{ - 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', + 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', 'info_dict': { - 'id': 
'3235167922001', + 'id': '78867', 'ext': 'mp4', - 'title': 'Breaking Amish: Die Welt da draußen', - 'description': ( - 'Vier Amische und eine Mennonitin wagen in New York' - ' den Sprung in ein komplett anderes Leben. Begleitet sie auf' - ' ihrem spannenden Weg.'), - 'timestamp': 1396598084, - 'upload_date': '20140404', - 'uploader_id': '1659832546', + 'title': 'Die Welt da draußen', + 'description': 'md5:61033c12b73286e409d99a41742ef608', + 'timestamp': 1554069600, + 'upload_date': '20190331', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, }, }, { - 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/', + 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', 'only_matching': True, }, { - 'url': 'http://www.discovery.de/#5332316765001', + 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - alternate_id = mobj.group('alternate_id') - if alternate_id: - self._initialize_geo_bypass({ - 'countries': ['DE'], - }) - return self._get_disco_api_info( - url, '%s/%s' % (mobj.group('programme'), alternate_id), - 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de') - brightcove_id = mobj.group('id') - if not brightcove_id: - title = mobj.group('title') - webpage = self._download_webpage(url, title) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse( - brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}), - 'BrightcoveNew', brightcove_id) + domain, programme, alternate_id = re.match(self._VALID_URL, 
url).groups() + country = 'GB' if domain == 'dplay.co.uk' else 'DE' + realm = 'questuk' if country == 'GB' else domain.replace('.', '') + return self._get_disco_api_info( + url, '%s/%s' % (programme, alternate_id), + 'sonic-eu1-prod.disco-api.com', realm, country) From 0b98f3a7517601b7d2aabc789997016b9c3c24f2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 26 Oct 2019 14:58:29 +0100 Subject: [PATCH 0108/1705] [dplay] improve extraction - add support for dplay.fi, dplay.jp and es.dplay.com(closes #16969) - fix it.dplay.com extraction(closes #22826) - update tests - extract creator, tags and thumbnails - handle playback API call errors --- youtube_dl/extractor/dplay.py | 397 ++++++++++------------------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 133 insertions(+), 269 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index ebf59512c..d9c3d59cd 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,74 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re -import time from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urlparse, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, ExtractorError, float_or_none, int_or_none, - remove_end, - try_get, - unified_strdate, unified_timestamp, - update_url_query, - urljoin, - USER_AGENTS, ) class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?P<domain>www\.(?P<host>dplay\.(?P<country>dk|se|no)))/(?:video(?:er|s)/)?(?P<id>[^/]+/[^/?#]+)' + _VALID_URL = r'''(?x)https?:// + (?P<domain> + (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))| + (?P<subdomain_country>es|it)\.dplay\.com + )/[^/]+/(?P<id>[^/]+/[^/?#]+)''' _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL - 'url': 
'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', + 'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', 'info_dict': { - 'id': '3172', - 'display_id': 'nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet', + 'id': '13628', + 'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', 'ext': 'mp4', 'title': 'Svensken lär sig njuta av livet', 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', - 'duration': 2650, - 'timestamp': 1365454320, + 'duration': 2649.856, + 'timestamp': 1365453720, 'upload_date': '20130408', - 'creator': 'Kanal 5 (Home)', + 'creator': 'Kanal 5', 'series': 'Nugammalt - 77 händelser som format Sverige', 'season_number': 1, 'episode_number': 1, - 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, }, }, { # geo restricted, via secure api, unsigned download hls URL - 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', 'info_dict': { - 'id': '70816', - 'display_id': 'mig-og-min-mor/season-6-episode-12', + 'id': '104465', + 'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', 'ext': 'mp4', - 'title': 'Episode 12', - 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', - 'duration': 2563, - 'timestamp': 1429696800, - 'upload_date': '20150422', - 'creator': 'Kanal 4 (Home)', - 'series': 'Mig og min mor', - 'season_number': 6, - 'episode_number': 12, - 'age_limit': 0, + 'title': 'Ted Bundy: Mind Of A Monster', + 'description': 'md5:8b780f6f18de4dae631668b8a9637995', + 'duration': 5290.027, + 'timestamp': 1570694400, + 'upload_date': '20191010', + 'creator': 'ID - Investigation Discovery', + 'series': 'Ted Bundy: Mind Of A Monster', + 'season_number': 1, + 'episode_number': 1, + }, + 
'params': { + 'format': 'bestvideo', + 'skip_download': True, }, - }, { - # geo restricted, via direct unsigned hls URL - 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', - 'only_matching': True, }, { # disco-api 'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7', @@ -89,19 +83,59 @@ class DPlayIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, + 'skip': 'Available for Premium users', }, { - - 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3', + 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', + 'md5': '2b808ffb00fc47b884a172ca5d13053c', + 'info_dict': { + 'id': '6918', + 'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'ext': 'mp4', + 'title': 'Luigi Di Maio: la psicosi di Stanislawskij', + 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'upload_date': '20160524', + 'timestamp': 1464076800, + 'series': 'Biografie imbarazzanti', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + }, { + 'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/', + 'info_dict': { + 'id': '21652', + 'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1', + 'ext': 'mp4', + 'title': 'Episodio 1', + 'description': 'md5:b9dcff2071086e003737485210675f69', + 'thumbnail': r're:^https?://.*\.png', + 'upload_date': '20180709', + 'timestamp': 1531173540, + 'series': 'La fiebre del oro', + 'season_number': 8, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', 'only_matching': True, }, { - 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001', + 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, }] - def _get_disco_api_info(self, url, display_id, disco_host, 
realm): - disco_base = 'https://' + disco_host + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): + geo_countries = [country.upper()] + self._initialize_geo_bypass({ + 'countries': geo_countries, + }) + disco_base = 'https://%s/' % disco_host token = self._download_json( - '%s/token' % disco_base, display_id, 'Downloading token', + disco_base + 'token', display_id, 'Downloading token', query={ 'realm': realm, })['data']['attributes']['token'] @@ -110,17 +144,30 @@ class DPlayIE(InfoExtractor): 'Authorization': 'Bearer ' + token, } video = self._download_json( - '%s/content/videos/%s' % (disco_base, display_id), display_id, + disco_base + 'content/videos/' + display_id, display_id, headers=headers, query={ - 'include': 'show' + 'include': 'images,primaryChannel,show,tags' }) video_id = video['data']['id'] info = video['data']['attributes'] - title = info['name'] + title = info['name'].strip() formats = [] - for format_id, format_dict in self._download_json( - '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id), - display_id, headers=headers)['data']['attributes']['streaming'].items(): + try: + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + display_id, headers=headers)['data']['attributes']['streaming'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code == 'access.denied.missingpackage': + self.raise_login_required() + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise + for format_id, format_dict in streaming.items(): if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') @@ -142,235 +189,55 @@ class DPlayIE(InfoExtractor): }) 
self._sort_formats(formats) - series = None - try: - included = video.get('included') - if isinstance(included, list): - show = next(e for e in included if e.get('type') == 'show') - series = try_get( - show, lambda x: x['attributes']['name'], compat_str) - except StopIteration: - pass + creator = series = None + tags = [] + thumbnails = [] + included = video.get('included') or [] + if isinstance(included, list): + for e in included: + attributes = e.get('attributes') + if not attributes: + continue + e_type = e.get('type') + if e_type == 'channel': + creator = attributes.get('name') + elif e_type == 'image': + src = attributes.get('src') + if src: + thumbnails.append({ + 'url': src, + 'width': int_or_none(attributes.get('width')), + 'height': int_or_none(attributes.get('height')), + }) + if e_type == 'show': + series = attributes.get('name') + elif e_type == 'tag': + name = attributes.get('name') + if name: + tags.append(name) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': info.get('description'), - 'duration': float_or_none( - info.get('videoDuration'), scale=1000), + 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, 'season_number': int_or_none(info.get('seasonNumber')), 'episode_number': int_or_none(info.get('episodeNumber')), 'age_limit': int_or_none(info.get('minimum_age')), + 'creator': creator, + 'tags': tags, + 'thumbnails': thumbnails, 'formats': formats, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') - domain = mobj.group('domain') - - self._initialize_geo_bypass({ - 'countries': [mobj.group('country').upper()], - }) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id', default=None) - - if not video_id: - host = mobj.group('host') - return self._get_disco_api_info( - url, display_id, 'disco-api.' 
+ host, host.replace('.', '')) - - info = self._download_json( - 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), - video_id)['data'][0] - - title = info['title'] - - PROTOCOLS = ('hls', 'hds') - formats = [] - - def extract_formats(protocol, manifest_url): - if protocol == 'hls': - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) - # Sometimes final URLs inside m3u8 are unsigned, let's fix this - # ourselves. Also fragments' URLs are only served signed for - # Safari user agent. - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) - for m3u8_format in m3u8_formats: - m3u8_format.update({ - 'url': update_url_query(m3u8_format['url'], query), - 'http_headers': { - 'User-Agent': USER_AGENTS['Safari'], - }, - }) - formats.extend(m3u8_formats) - elif protocol == 'hds': - formats.extend(self._extract_f4m_formats( - manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', - video_id, f4m_id=protocol, fatal=False)) - - domain_tld = domain.split('.')[-1] - if domain_tld in ('se', 'dk', 'no'): - for protocol in PROTOCOLS: - # Providing dsc-geo allows to bypass geo restriction in some cases - self._set_cookie( - 'secure.dplay.%s' % domain_tld, 'dsc-geo', - json.dumps({ - 'countryCode': domain_tld.upper(), - 'expiry': (time.time() + 20 * 60) * 1000, - })) - stream = self._download_json( - 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s' - % (domain_tld, video_id, protocol), video_id, - 'Downloading %s stream JSON' % protocol, fatal=False) - if stream and stream.get(protocol): - extract_formats(protocol, stream[protocol]) - - # The last resort is to try direct unsigned hls/hds URLs from info dictionary. - # Sometimes this does work even when secure API with dsc-geo has failed (e.g. - # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/). 
- if not formats: - for protocol in PROTOCOLS: - if info.get(protocol): - extract_formats(protocol, info[protocol]) - - self._sort_formats(formats) - - subtitles = {} - for lang in ('se', 'sv', 'da', 'nl', 'no'): - for format_id in ('web_vtt', 'vtt', 'srt'): - subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id)) - if subtitle_url: - subtitles.setdefault(lang, []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': info.get('video_metadata_longDescription'), - 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), - 'timestamp': int_or_none(info.get('video_publish_date')), - 'creator': info.get('video_metadata_homeChannel'), - 'series': info.get('video_metadata_show'), - 'season_number': int_or_none(info.get('season')), - 'episode_number': int_or_none(info.get('episode')), - 'age_limit': int_or_none(info.get('minimum_age')), - 'formats': formats, - 'subtitles': subtitles, - } - - -class DPlayItIE(InfoExtractor): - _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)' - _GEO_COUNTRIES = ['IT'] - _TEST = { - 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', - 'md5': '2b808ffb00fc47b884a172ca5d13053c', - 'info_dict': { - 'id': '6918', - 'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij', - 'ext': 'mp4', - 'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij', - 'description': 'md5:3c7a4303aef85868f867a26f5cc14813', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'upload_date': '20160524', - 'series': 'Biografie imbarazzanti', - 'season_number': 1, - 'episode': 'Luigi Di Maio: la psicosi di Stanislawskij', - 'episode_number': 1, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = remove_end(self._og_search_title(webpage), ' | Dplay') - - video_id = None - - info = self._search_regex( - 
r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', - webpage, 'playback JSON', default=None) - if info: - for _ in range(2): - info = self._parse_json(info, display_id, fatal=False) - if not info: - break - else: - video_id = try_get(info, lambda x: x['data']['id']) - - if not info: - info_url = self._search_regex( - (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', - r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'), - webpage, 'info url', group='url') - - info_url = urljoin(url, info_url) - video_id = info_url.rpartition('/')[-1] - - try: - info = self._download_json( - info_url, display_id, headers={ - 'Authorization': 'Bearer %s' % self._get_cookies(url).get( - 'dplayit_token').value, - 'Referer': url, - }) - if isinstance(info, compat_str): - info = self._parse_json(info, display_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - if error.get('code') == 'access.denied.geoblocked': - self.raise_geo_restricted( - msg=error.get('detail'), countries=self._GEO_COUNTRIES) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - raise - - hls_url = info['data']['attributes']['streaming']['hls']['url'] - - formats = self._extract_m3u8_formats( - hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) - - series = self._html_search_regex( - r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', - webpage, 'series', fatal=False) - episode = self._search_regex( - r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)', - webpage, 'episode', fatal=False) - - mobj = re.search( - r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})', - webpage) - if mobj: - season_number = 
int(mobj.group('season_number')) - episode_number = int(mobj.group('episode_number')) - upload_date = unified_strdate(mobj.group('upload_date')) - else: - season_number = episode_number = upload_date = None - - return { - 'id': compat_str(video_id or display_id), - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'upload_date': upload_date, - 'formats': formats, - } + domain = mobj.group('domain').lstrip('www.') + country = mobj.group('country') or mobj.group('subdomain_country') + host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + return self._get_disco_api_info( + url, display_id, host, 'dplay' + country, country) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1db21529f..a8fe0de1a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -277,10 +277,7 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import ( - DPlayIE, - DPlayItIE, -) +from .dplay import DPlayIE from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE From 548c395716b1d5aa215e526fcb052a03926c1573 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 27 Oct 2019 17:52:46 +0100 Subject: [PATCH 0109/1705] [soundcloud] improve extraction - improve format extraction(closes #22123) - extract uploader_id and uploader_url(closes #21916) - extract all known thumbnails(closes #19071)(closes #20659) - fix extration for private playlists(closes #20976) - add support for playlist embeds(#20976) - skip preview formats(closes #22806) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 6 +- youtube_dl/extractor/soundcloud.py | 497 ++++++++++++++--------------- 3 files changed, 
248 insertions(+), 256 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a8fe0de1a..388c1ebe6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1033,6 +1033,7 @@ from .snotr import SnotrIE from .sohu import SohuIE from .sonyliv import SonyLIVIE from .soundcloud import ( + SoundcloudEmbedIE, SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f66cae0eb..1c0780e98 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -80,7 +80,7 @@ from .theplatform import ThePlatformIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE -from .soundcloud import SoundcloudIE +from .soundcloud import SoundcloudEmbedIE from .tunein import TuneInBaseIE from .vbox7 import Vbox7IE from .dbtv import DBTVIE @@ -2749,9 +2749,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - soundcloud_urls = SoundcloudIE._extract_urls(webpage) + soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) # Look for tunein player tunein_urls = TuneInBaseIE._extract_urls(webpage) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 05538f3d6..875b9d887 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -11,14 +11,13 @@ from .common import ( from ..compat import ( compat_str, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, float_or_none, + HEADRequest, int_or_none, KNOWN_EXTENSIONS, - merge_dicts, mimetype2ext, str_or_none, try_get, @@ 
-28,6 +27,20 @@ from ..utils import ( ) +class SoundcloudEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?url=(?P<id>.*)' + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + + def _real_extract(self, url): + return self.url_result(compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query)['url'][0]) + + class SoundcloudIE(InfoExtractor): """Information extractor for soundcloud.com To access the media, the uid of the song and a stream token @@ -44,9 +57,8 @@ class SoundcloudIE(InfoExtractor): (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) - |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) + |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) (?:/?\?secret_token=(?P<secret_token>[^&]+))?) - |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = 'soundcloud' @@ -60,6 +72,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. 
ExTerrestrial Music', + 'uploader_id': '1571244', 'timestamp': 1349920598, 'upload_date': '20121011', 'duration': 143.216, @@ -79,6 +92,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', + 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', 'duration': 30, @@ -92,6 +106,7 @@ class SoundcloudIE(InfoExtractor): # rtmp 'skip_download': True, }, + 'skip': 'Preview', }, # private link { @@ -103,6 +118,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Youtube - Dl Test Video \'\' Ä↭', 'description': 'test chars: \"\'/\\ä↭', 'uploader': 'jaimeMF', + 'uploader_id': '69767071', 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9.927, @@ -123,6 +139,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Youtube - Dl Test Video \'\' Ä↭', 'description': 'test chars: \"\'/\\ä↭', 'uploader': 'jaimeMF', + 'uploader_id': '69767071', 'timestamp': 1386604920, 'upload_date': '20131209', 'duration': 9.927, @@ -143,6 +160,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Bus Brakes', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'uploader': 'oddsamples', + 'uploader_id': '73680509', 'timestamp': 1389232924, 'upload_date': '20140109', 'duration': 17.346, @@ -163,6 +181,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', 'uploader': 'Ori Uplift Music', + 'uploader_id': '12563093', 'timestamp': 1504206263, 'upload_date': '20170831', 'duration': 7449.096, @@ -183,6 +202,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Sideways (Prod. 
Mad Real)', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'uploader': 'garyvee', + 'uploader_id': '2366352', 'timestamp': 1488152409, 'upload_date': '20170226', 'duration': 207.012, @@ -207,6 +227,7 @@ class SoundcloudIE(InfoExtractor): 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', 'uploader': 'Giovanni Sarani', + 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', 'duration': 180.157, @@ -221,114 +242,81 @@ class SoundcloudIE(InfoExtractor): } ] + _API_BASE = 'https://api.soundcloud.com/' + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', - webpage)] + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } @classmethod def _resolv_url(cls, url): - return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): + def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): track_id = compat_str(info['id']) title = info['title'] - name = full_title or track_id - if quiet: - self.report_extraction(name) - thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') - if isinstance(thumbnail, compat_str): - thumbnail = thumbnail.replace('-large', '-t500x500') - username = try_get(info, lambda x: x['user']['username'], compat_str) - - def extract_count(key): - return int_or_none(info.get('%s_count' % key)) - - like_count = 
extract_count('favoritings') - if like_count is None: - like_count = extract_count('likes') - - result = { - 'id': track_id, - 'uploader': username, - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, - 'description': info.get('description'), - 'thumbnail': thumbnail, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': extract_count('playback'), - 'like_count': like_count, - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), - } + track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] query = {'client_id': self._CLIENT_ID} - if secret_token is not None: + if secret_token: query['secret_token'] = secret_token - if info.get('downloadable', False): - # We can build a direct link to the song + + if info.get('downloadable'): format_url = update_url_query( - 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) + info.get('download_url') or track_base_url + '/download', query) format_urls.add(format_url) + if version == 2: + v1_info = self._download_json( + track_base_url, track_id, query=query, fatal=False) or {} + else: + v1_info = info formats.append({ 'format_id': 'download', - 'ext': info.get('original_format', 'mp3'), + 'ext': v1_info.get('original_format') or 'mp3', + 'filesize': int_or_none(v1_info.get('original_content_size')), 'url': format_url, - 'vcodec': 'none', 'preference': 10, }) - # Old API, does not work for some tracks (e.g. 
- # https://soundcloud.com/giovannisarani/mezzo-valzer) - format_dict = self._download_json( - 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query=query, fatal=False) + def invalid_url(url): + return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url) - if format_dict: - for key, stream_url in format_dict.items(): - if stream_url in format_urls: - continue - format_urls.add(stream_url) - ext, abr = 'mp3', None - mobj = re.search(r'_([^_]+)_(\d+)_url', key) - if mobj: - ext, abr = mobj.groups() - abr = int(abr) - if key.startswith('http'): - stream_formats = [{ - 'format_id': key, - 'ext': ext, - 'url': stream_url, - }] - elif key.startswith('rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - stream_formats = [{ - 'format_id': key, - 'url': url, - 'play_path': 'mp3:' + path, - 'ext': 'flv', - }] - elif key.startswith('hls'): - stream_formats = self._extract_m3u8_formats( - stream_url, track_id, ext, entry_protocol='m3u8_native', - m3u8_id=key, fatal=False) - else: - continue - - if abr: - for f in stream_formats: - f['abr'] = abr - - formats.extend(stream_formats) + def add_format(f, protocol): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + }) + formats.append(f) # New API transcodings = try_get( @@ -337,129 +325,165 @@ class SoundcloudIE(InfoExtractor): if not isinstance(t, dict): continue format_url = url_or_none(t.get('url')) - if not format_url: + if not format_url or t.get('snipped') 
or '/preview/' in format_url: continue stream = self._download_json( - update_url_query(format_url, query), track_id, fatal=False) + format_url, track_id, query=query, fatal=False) if not isinstance(stream, dict): continue stream_url = url_or_none(stream.get('url')) - if not stream_url: - continue - if stream_url in format_urls: + if invalid_url(stream_url): continue format_urls.add(stream_url) - protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') if protocol != 'hls' and '/hls' in format_url: protocol = 'hls' ext = None preset = str_or_none(t.get('preset')) if preset: ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - mimetype = try_get( - t, lambda x: x['format']['mime_type'], compat_str) - ext = mimetype2ext(mimetype) or 'mp3' - format_id_list = [] - if protocol: - format_id_list.append(protocol) - format_id_list.append(ext) - format_id = '_'.join(format_id_list) - formats.append({ + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ 'url': stream_url, - 'format_id': format_id, 'ext': ext, - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - }) + }, 'http' if protocol == 'progressive' else protocol) + + if not formats: + # Old API, does not work for some tracks (e.g. + # https://soundcloud.com/giovannisarani/mezzo-valzer) + # and might serve preview URLs (e.g. 
+ # http://www.soundcloud.com/snbrn/ele) + format_dict = self._download_json( + track_base_url + '/streams', track_id, + 'Downloading track url', query=query, fatal=False) or {} + + for key, stream_url in format_dict.items(): + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) + if mobj: + protocol, ext, abr = mobj.groups() + add_format({ + 'abr': abr, + 'ext': ext, + 'url': stream_url, + }, protocol) if not formats: # We fallback to the stream_url in the original info, this # cannot be always used, sometimes it can give an HTTP 404 error - formats.append({ - 'format_id': 'fallback', - 'url': update_url_query(info['stream_url'], query), - 'ext': 'mp3', - }) - self._check_formats(formats, track_id) + urlh = self._request_webpage( + HEADRequest(info.get('stream_url') or track_base_url + '/stream'), + track_id, query=query, fatal=False) + if urlh: + stream_url = urlh.geturl() + if not invalid_url(stream_url): + add_format({'url': stream_url}, 'http') for f in formats: f['vcodec'] = 'none' self._sort_formats(formats) - result['formats'] = formats - return result + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 
'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) + mobj = re.match(self._VALID_URL, url) track_id = mobj.group('track_id') - new_info = {} - if track_id is not None: - info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + query = { + 'client_id': self._CLIENT_ID, + } + if track_id: + info_json_url = self._API_V2_BASE + 'tracks/' + track_id full_title = track_id token = mobj.group('secret_token') if token: - info_json_url += '&secret_token=' + token - elif mobj.group('player'): - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - real_url = query['url'][0] - # If the token is in the query of the original url we have to - # manually add it - if 'secret_token' in query: - real_url += '?secret_token=' + query['secret_token'][0] - return self.url_result(real_url) + query['secret_token'] = token else: - # extract uploader (which is in the url) - uploader = mobj.group('uploader') - # extract simple title (uploader + slug of song title) - slug_title = mobj.group('title') + full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') token = mobj.group('token') - full_title = resolve_title = '%s/%s' % (uploader, slug_title) if token: resolve_title += '/%s' % token + info_json_url = self._resolv_url(self._BASE_URL + 
resolve_title) - webpage = self._download_webpage(url, full_title, fatal=False) - if webpage: - entries = self._parse_json( - self._search_regex( - r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage, - 'data', default='[]'), full_title, fatal=False) - if entries: - for e in entries: - if not isinstance(e, dict): - continue - if e.get('id') != 67: - continue - data = try_get(e, lambda x: x['data'][0], dict) - if data: - new_info = data - break - info_json_url = self._resolv_url( - 'https://soundcloud.com/%s' % resolve_title) - - # Contains some additional info missing from new_info + version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON') + info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) + if not info: + info = self._download_json( + info_json_url.replace(self._API_V2_BASE, self._API_BASE), + full_title, 'Downloading info JSON', query=query) + version = 1 - return self._extract_info_dict( - merge_dicts(info, new_info), full_title, secret_token=token) + return self._extract_info_dict(info, full_title, token, version) class SoundcloudPlaylistBaseIE(SoundcloudIE): - @staticmethod - def _extract_id(e): - return compat_str(e['id']) if e.get('id') else None - - def _extract_track_entries(self, tracks): - return [ - self.url_result( - track['permalink_url'], SoundcloudIE.ie_key(), - video_id=self._extract_id(track)) - for track in tracks if track.get('permalink_url')] + def _extract_track_entries(self, tracks, token=None): + entries = [] + for track in tracks: + track_id = str_or_none(track.get('id')) + url = track.get('permalink_url') + if not url: + if not track_id: + continue + url = self._API_V2_BASE + 'tracks/' + track_id + if token: + url += '?secret_token=' + token + entries.append(self.url_result( + url, SoundcloudIE.ie_key(), track_id)) + return entries class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -480,41 +504,28 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): def _real_extract(self, 
url): mobj = re.match(self._VALID_URL, url) - # extract uploader (which is in the url) - uploader = mobj.group('uploader') - # extract simple title (uploader + slug of song title) - slug_title = mobj.group('slug_title') - full_title = '%s/sets/%s' % (uploader, slug_title) - url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title) - + full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') token = mobj.group('token') if token: full_title += '/' + token - url += '/' + token - resolv_url = self._resolv_url(url) - info = self._download_json(resolv_url, full_title) + info = self._download_json(self._resolv_url( + self._BASE_URL + full_title), full_title) if 'errors' in info: msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = self._extract_track_entries(info['tracks']) + entries = self._extract_track_entries(info['tracks'], token) - return { - '_type': 'playlist', - 'entries': entries, - 'id': '%s' % info['id'], - 'title': info['title'], - } + return self.playlist_result( + entries, str_or_none(info.get('id')), info.get('title')) class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): - _API_V2_BASE = 'https://api-v2.soundcloud.com' - def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { - 'limit': 50, + 'limit': 2000000000, 'client_id': self._CLIENT_ID, 'linked_partitioning': '1', } @@ -522,12 +533,13 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): query = COMMON_QUERY.copy() query['offset'] = 0 - next_href = base_url + '?' 
+ compat_urllib_parse_urlencode(query) + next_href = base_url entries = [] for i in itertools.count(): response = self._download_json( - next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + next_href, playlist_id, + 'Downloading track page %s' % (i + 1), query=query) collection = response['collection'] @@ -546,9 +558,8 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): continue return self.url_result( permalink_url, - ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, - video_id=self._extract_id(cand), - video_title=cand.get('title')) + SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + str_or_none(cand.get('id')), cand.get('title')) for e in collection: entry = resolve_entry((e, e.get('track'), e.get('playlist'))) @@ -559,11 +570,10 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): if not next_href: break - parsed_next_href = compat_urlparse.urlparse(response['next_href']) - qs = compat_urlparse.parse_qs(parsed_next_href.query) - qs.update(COMMON_QUERY) - next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + next_href = response['next_href'] + parsed_next_href = compat_urlparse.urlparse(next_href) + query = compat_urlparse.parse_qs(parsed_next_href.query) + query.update(COMMON_QUERY) return { '_type': 'playlist', @@ -609,7 +619,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): 'url': 'https://soundcloud.com/jcv246/sets', 'info_dict': { 'id': '12982173', - 'title': 'Jordi / cv (Playlists)', + 'title': 'Jordi / cv (Sets)', }, 'playlist_mincount': 2, }, { @@ -636,39 +646,29 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): }] _BASE_URL_MAP = { - 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'sets': 
'%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, - } - - _TITLE_MAP = { - 'all': 'All', - 'tracks': 'Tracks', - 'albums': 'Albums', - 'sets': 'Playlists', - 'reposts': 'Reposts', - 'likes': 'Likes', - 'spotlight': 'Spotlight', + 'all': 'stream/users/%s', + 'tracks': 'users/%s/tracks', + 'albums': 'users/%s/albums', + 'sets': 'users/%s/playlists', + 'reposts': 'stream/users/%s/reposts', + 'likes': 'users/%s/likes', + 'spotlight': 'users/%s/spotlight', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - url = 'https://soundcloud.com/%s/' % uploader - resolv_url = self._resolv_url(url) user = self._download_json( - resolv_url, uploader, 'Downloading user info') + self._resolv_url(self._BASE_URL + uploader), + uploader, 'Downloading user info') resource = mobj.group('rsrc') or 'all' return self._extract_playlist( - self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), - '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) + self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], + str_or_none(user.get('id')), + '%s (%s)' % (user['username'], resource.capitalize())) class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): @@ -678,7 +678,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', 'info_dict': { 'id': '286017854', - 'title': 'Track station: your-text', + 'title': 'Track station: your text', }, 'playlist_mincount': 47, }] @@ -686,19 +686,17 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): track_name = self._match_id(url) - webpage = self._download_webpage(url, track_name) - + 
track = self._download_json(self._resolv_url(url), track_name) track_id = self._search_regex( - r'soundcloud:track-stations:(\d+)', webpage, 'track id') + r'soundcloud:track-stations:(\d+)', track['id'], 'track id') return self._extract_playlist( - '%s/stations/soundcloud:track-stations:%s/tracks' - % (self._API_V2_BASE, track_id), - track_id, 'Track station: %s' % track_name) + self._API_V2_BASE + 'stations/%s/tracks' % track['id'], + track_id, 'Track station: %s' % track['title']) class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): - _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' + _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' _TESTS = [{ 'url': 'https://api.soundcloud.com/playlists/4110309', @@ -713,29 +711,22 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') - base_url = '%s//api.soundcloud.com/playlists/%s.json?' 
% (self.http_scheme(), playlist_id) - data_dict = { + query = { 'client_id': self._CLIENT_ID, } token = mobj.group('token') - if token: - data_dict['secret_token'] = token + query['secret_token'] = token - data = compat_urllib_parse_urlencode(data_dict) data = self._download_json( - base_url + data, playlist_id, 'Downloading playlist') + self._API_V2_BASE + 'playlists/' + playlist_id, + playlist_id, 'Downloading playlist', query=query) - entries = self._extract_track_entries(data['tracks']) + entries = self._extract_track_entries(data['tracks'], token) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': data.get('title'), - 'description': data.get('description'), - 'entries': entries, - } + return self.playlist_result( + entries, playlist_id, data.get('title'), data.get('description')) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): @@ -753,18 +744,18 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): _SEARCH_KEY = 'scsearch' _MAX_RESULTS_PER_PAGE = 200 _DEFAULT_RESULTS_PER_PAGE = 50 - _API_V2_BASE = 'https://api-v2.soundcloud.com' def _get_collection(self, endpoint, collection_id, **query): limit = min( query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), self._MAX_RESULTS_PER_PAGE) - query['limit'] = limit - query['client_id'] = self._CLIENT_ID - query['linked_partitioning'] = '1' - query['offset'] = 0 - data = compat_urllib_parse_urlencode(query) - next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) + query.update({ + 'limit': limit, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': 1, + 'offset': 0, + }) + next_url = update_url_query(self._API_V2_BASE + endpoint, query) collected_results = 0 @@ -791,5 +782,5 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): break def _get_n_results(self, query, n): - tracks = self._get_collection('/search/tracks', query, limit=n, q=query) + tracks = self._get_collection('search/tracks', query, limit=n, q=query) return self.playlist_result(tracks, 
playlist_title=query) From dd90451f0f4867480c5ed8cb3588b30312204e3f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 27 Oct 2019 22:02:46 +0100 Subject: [PATCH 0110/1705] [tenplay] Add new extractor(closes #21446) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tenplay.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/tenplay.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 388c1ebe6..339a141a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1133,6 +1133,7 @@ from .telequebec import ( from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .tennistv import TennisTVIE +from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py new file mode 100644 index 000000000..dff44a4e2 --- /dev/null +++ b/youtube_dl/extractor/tenplay.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_iso8601, + smuggle_url, +) + + +class TenPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/[^/]+/episodes/[^/]+/[^/]+/(?P<id>tpv\d{6}[a-z]{5})' + _TEST = { + 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga', + 'info_dict': { + 'id': '6060533435001', + 'ext': 'mp4', + 'title': 'MasterChef - S1 Ep. 
1', + 'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c', + 'age_limit': 10, + 'timestamp': 1240828200, + 'upload_date': '20090427', + 'uploader_id': '2199827728001', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + } + BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + + def _real_extract(self, url): + content_id = self._match_id(url) + data = self._download_json( + 'https://10play.com.au/api/video/' + content_id, content_id) + video = data.get('video') or {} + metadata = data.get('metaData') or {} + brightcove_id = video.get('videoId') or metadata['showContentVideoId'] + brightcove_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['AU']}) + + return { + '_type': 'url_transparent', + 'url': brightcove_url, + 'id': content_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + 'description': video.get('description'), + 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), + 'series': metadata.get('showName'), + 'season': metadata.get('showContentSeason'), + 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), + 'ie_key': 'BrightcoveNew', + } From 71fa0b04f9099090f43f6747632a9bdc3a4b1015 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 13:30:30 +0100 Subject: [PATCH 0111/1705] [makertv] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/makertv.py | 32 ------------------------------ 2 files changed, 33 deletions(-) delete mode 100644 youtube_dl/extractor/makertv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 339a141a5..4229518fd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -604,7 +604,6 @@ from .mailru import ( 
MailRuMusicIE, MailRuMusicSearchIE, ) -from .makertv import MakerTVIE from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py deleted file mode 100644 index 8eda69cfc..000000000 --- a/youtube_dl/extractor/makertv.py +++ /dev/null @@ -1,32 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class MakerTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer\.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' - _TEST = { - 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', - 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', - 'info_dict': { - 'id': 'Fh3QgymL9gsc', - 'ext': 'mp4', - 'title': 'Maze Runner: The Scorch Trials Official Movie Review', - 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', - 'upload_date': '20150918', - 'timestamp': 1442549540, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'jwplatform:%s' % jwplatform_id, - 'ie_key': 'JWPlatform', - } From 80c2126e80bc41f7b66d325c4c67c61887c58fb0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 13:32:35 +0100 Subject: [PATCH 0112/1705] [thesun] fix extraction(closes #16966) --- youtube_dl/extractor/thesun.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py index 22d003776..15d4a6932 100644 --- a/youtube_dl/extractor/thesun.py +++ b/youtube_dl/extractor/thesun.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .ooyala import OoyalaIE +from ..utils import extract_attributes class TheSunIE(InfoExtractor): @@ -16,6 
+16,7 @@ class TheSunIE(InfoExtractor): }, 'playlist_count': 2, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): article_id = self._match_id(url) @@ -23,10 +24,15 @@ class TheSunIE(InfoExtractor): webpage = self._download_webpage(url, article_id) entries = [] - for ooyala_id in re.findall( - r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)', + for video in re.findall( + r'<video[^>]+data-video-id-pending=[^>]+>', webpage): - entries.append(OoyalaIE._build_url_result(ooyala_id)) + attrs = extract_attributes(video) + video_id = attrs['data-video-id-pending'] + account_id = attrs.get('data-account', '5067014667001') + entries.append(self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), + 'BrightcoveNew', video_id)) return self.playlist_result( entries, article_id, self._og_search_title(webpage, fatal=False)) From 0f9d53566a5956854af77173c0e910ed7454aadf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 15:17:06 +0100 Subject: [PATCH 0113/1705] [la7] update Kaltura service URL(closes #22358) --- youtube_dl/extractor/la7.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index 6373268c4..c3b4ffa7e 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -20,7 +20,7 @@ class LA7IE(InfoExtractor): 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': 'inccool8-02-10-2015-163722', + 'id': '0_42j6wd36', 'ext': 'mp4', 'title': 'Inc.Cool8', 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. 
dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', @@ -57,7 +57,7 @@ class LA7IE(InfoExtractor): return { '_type': 'url_transparent', 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { - 'service_url': 'http://kdam.iltrovatore.it', + 'service_url': 'http://nkdam.iltrovatore.it', }), 'id': video_id, 'title': player_data['title'], From 3e252cca0e81aef55b0288f86991bb566878a9fc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 28 Oct 2019 17:39:01 +0100 Subject: [PATCH 0114/1705] [macgamestore] remove extractor Covered by generic extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/macgamestore.py | 42 ---------------------------- 2 files changed, 43 deletions(-) delete mode 100644 youtube_dl/extractor/macgamestore.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4229518fd..1807744be 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -598,7 +598,6 @@ from .lynda import ( LyndaCourseIE ) from .m6 import M6IE -from .macgamestore import MacGameStoreIE from .mailru import ( MailRuIE, MailRuMusicIE, diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py deleted file mode 100644 index 43db9929c..000000000 --- a/youtube_dl/extractor/macgamestore.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class MacGameStoreIE(InfoExtractor): - IE_NAME = 'macgamestore' - IE_DESC = 'MacGameStore trailers' - _VALID_URL = r'https?://(?:www\.)?macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', - 'md5': '8649b8ea684b6666b4c5be736ecddc61', - 'info_dict': { - 'id': '2450', - 'ext': 'm4v', - 'title': 'Crow', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - 
webpage = self._download_webpage( - url, video_id, 'Downloading trailer page') - - if '>Missing Media<' in webpage: - raise ExtractorError( - 'Trailer %s does not exist' % video_id, expected=True) - - video_title = self._html_search_regex( - r'<title>MacGameStore: (.*?) Trailer', webpage, 'title') - - video_url = self._html_search_regex( - r'(?s)', - webpage, 'video URL') - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title - } From 831b732da1d0796a1927af8767d76af780cc90f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:41:17 +0100 Subject: [PATCH 0115/1705] [learnr] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/learnr.py | 33 ------------------------------ 2 files changed, 34 deletions(-) delete mode 100644 youtube_dl/extractor/learnr.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1807744be..9f3a5f8a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -546,7 +546,6 @@ from .lcp import ( LcpPlayIE, LcpIE, ) -from .learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioIE, diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py deleted file mode 100644 index 1435e090e..000000000 --- a/youtube_dl/extractor/learnr.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class LearnrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript', - 'md5': '3719fdf0a68397f49899e82c308a89de', - 'info_dict': { - 'id': '51624', - 'ext': 'mp4', - 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript', - 'description': 
'md5:b36dbfa92350176cdf12b4d388485503', - 'uploader': 'LearnCode.academy', - 'uploader_id': 'learncodeacademy', - 'upload_date': '20131021', - }, - 'add_ie': ['Youtube'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - return { - '_type': 'url_transparent', - 'url': self._search_regex( - r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'), - 'id': video_id, - } From b3c2fa6dad607da6455a13d232461d4380e4b53c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:42:33 +0100 Subject: [PATCH 0116/1705] [tutv] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/tutv.py | 36 ------------------------------ 2 files changed, 37 deletions(-) delete mode 100644 youtube_dl/extractor/tutv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f3a5f8a5..39282b785 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1183,7 +1183,6 @@ from .tunein import ( ) from .tunepk import TunePkIE from .turbo import TurboIE -from .tutv import TutvIE from .tv2 import ( TV2IE, TV2ArticleIE, diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py deleted file mode 100644 index 362318b24..000000000 --- a/youtube_dl/extractor/tutv.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_parse_qs, -) - - -class TutvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P[^/?]+)' - _TEST = { - 'url': 'http://tu.tv/videos/robots-futbolistas', - 'md5': '0cd9e28ad270488911b0d2a72323395d', - 'info_dict': { - 'id': '2973058', - 'ext': 'mp4', - 'title': 'Robots futbolistas', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 
'internal video ID') - - data_content = self._download_webpage( - 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') - - return { - 'id': internal_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - } From 702984eca955f61811078c33337faf9eebeb48c8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 17:49:05 +0100 Subject: [PATCH 0117/1705] [hark] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/hark.py | 33 ------------------------------ 2 files changed, 34 deletions(-) delete mode 100644 youtube_dl/extractor/hark.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 39282b785..114ede8b9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -428,7 +428,6 @@ from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE -from .hark import HarkIE from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py deleted file mode 100644 index 342a6130e..000000000 --- a/youtube_dl/extractor/hark.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class HarkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P.+?)-.+' - _TEST = { - 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - 'md5': '6783a58491b47b92c7c1af5a77d4cbee', - 'info_dict': { - 'id': 'mmbzyhkgny', - 'ext': 'mp3', - 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', - 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed 
at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', - 'duration': 11, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'http://www.hark.com/clips/%s.json' % video_id, video_id) - - return { - 'id': video_id, - 'url': data['url'], - 'title': data['name'], - 'description': data.get('description'), - 'thumbnail': data.get('image_original'), - 'duration': data.get('duration'), - } From 895e5c03db310ee97d585360ef8e6ae117e4cbd6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:31:20 +0100 Subject: [PATCH 0118/1705] [nbcnews] fix extraction closes #12569 closes #12576 closes #21703 closes #21923 --- youtube_dl/extractor/nbc.py | 86 +++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 10680b202..5bc39d002 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,9 +9,13 @@ from .theplatform import ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( - smuggle_url, - update_url_query, int_or_none, + js_to_json, + parse_duration, + smuggle_url, + try_get, + unified_timestamp, + update_url_query, ) @@ -285,13 +289,12 @@ class NBCNewsIE(ThePlatformIE): _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'af1adfa51312291a017720403826bb64', + 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', 'info_dict': { 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', - 'uploader': 'NBCU-NEWS', 'timestamp': 1401363060, 'upload_date': '20140529', }, @@ -309,28 +312,26 @@ class NBCNewsIE(ThePlatformIE): }, { 'url': 
'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '73135a2e0ef819107bbb55a5a9b2a802', + 'md5': '8eb831eca25bfa7d25ddd83e85946548', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, - 'uploader': 'NBCU-NEWS', 'upload_date': '20150205', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', 'info_dict': { - 'id': '529953347624', + 'id': 'n431456', 'ext': 'mp4', - 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', - 'description': 'md5:c8be487b2d80ff0594c005add88d8351', + 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'", + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, - 'uploader': 'NBCU-NEWS', }, }, { @@ -343,7 +344,6 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, - 'uploader': 'NBCU-NEWS', }, }, { @@ -357,7 +357,6 @@ class NBCNewsIE(ThePlatformIE): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', - 'uploader': 'NBCU-NEWS', }, }, { @@ -373,20 +372,61 @@ class NBCNewsIE(ThePlatformIE): def _real_extract(self, url): video_id = self._match_id(url) - if not video_id.isdigit(): - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id) - video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] + data = self._parse_json(self._search_regex( + 
r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id, js_to_json) + video_data = try_get(data, lambda x: x['video']['current'], dict) + if not video_data: + video_data = data['article']['content'][0]['primaryMedia']['video'] + title = video_data['headline']['primary'] + + formats = [] + for va in video_data.get('videoAssets', []): + public_url = va.get('publicUrl') + if not public_url: + continue + if '://link.theplatform.com/' in public_url: + public_url = update_url_query(public_url, {'format': 'redirect'}) + format_id = va.get('format') + if format_id == 'M3U': + formats.extend(self._extract_m3u8_formats( + public_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + continue + tbr = int_or_none(va.get('bitrate'), 1000) + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': public_url, + 'width': int_or_none(va.get('width')), + 'height': int_or_none(va.get('height')), + 'tbr': tbr, + 'ext': 'mp4', + }) + self._sort_formats(formats) + + subtitles = {} + closed_captioning = video_data.get('closedCaptioning') + if closed_captioning: + for cc_url in closed_captioning.values(): + if not cc_url: + continue + subtitles.setdefault('en', []).append({ + 'url': cc_url, + }) return { - '_type': 'url_transparent', 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), - 'ie_key': 'ThePlatformFeed', + 'title': title, + 'description': try_get(video_data, lambda x: x['description']['primary']), + 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datePublished')), + 'formats': formats, + 'subtitles': subtitles, } From 83e49259bfd4e0b54a4b53c30742109555087e3a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:45:42 +0100 Subject: 
[PATCH 0119/1705] [internetvideoarchive] fix extraction --- youtube_dl/extractor/internetvideoarchive.py | 92 ++++++-------------- 1 file changed, 28 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 76cc5ec3e..59b0a90c3 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,15 +1,13 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urlparse, ) -from ..utils import ( - determine_ext, - int_or_none, - xpath_text, -) class InternetVideoArchiveIE(InfoExtractor): @@ -20,7 +18,7 @@ class InternetVideoArchiveIE(InfoExtractor): 'info_dict': { 'id': '194487', 'ext': 'mp4', - 'title': 'KICK-ASS 2', + 'title': 'Kick-Ass 2', 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', }, 'params': { @@ -33,68 +31,34 @@ class InternetVideoArchiveIE(InfoExtractor): def _build_json_url(query): return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query - @staticmethod - def _build_xml_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query - def _real_extract(self, url): - query = compat_urlparse.urlparse(url).query - query_dic = compat_parse_qs(query) - video_id = query_dic['publishedid'][0] - - if '/player/' in url: - configuration = self._download_json(url, video_id) - - # There are multiple videos in the playlist whlie only the first one - # matches the video played in browsers - video_info = configuration['playlist'][0] - title = video_info['title'] - - formats = [] - for source in video_info['sources']: - file_url = source['file'] - if determine_ext(file_url) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) - file_url = m3u8_formats[0]['url'] - formats.extend(self._extract_f4m_formats( - file_url.replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_mpd_formats( - file_url.replace('.m3u8', '.mpd'), - video_id, mpd_id='dash', fatal=False)) - else: - a_format = { - 'url': file_url, - } - - if source.get('label') and source['label'][-4:] == ' kbs': - tbr = int_or_none(source['label'][:-4]) - a_format.update({ - 'tbr': tbr, - 'format_id': 'http-%d' % tbr, - }) - formats.append(a_format) - - self._sort_formats(formats) - - description = video_info.get('description') - thumbnail = video_info.get('image') - else: - configuration = self._download_xml(url, video_id) - formats = [{ - 'url': xpath_text(configuration, './file', 'file URL', fatal=True), - }] - thumbnail = xpath_text(configuration, './image', 'thumbnail') - title = 'InternetVideoArchive video %s' % video_id - description = None + query = compat_parse_qs(compat_urlparse.urlparse(url).query) + video_id = query['publishedid'][0] + data = self._download_json( + 'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx', + video_id, data=json.dumps({ + 'customerid': query['customerid'][0], + 'publishedid': video_id, + }).encode()) + title = 
data['Title'] + formats = self._extract_m3u8_formats( + data['VideoUrl'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + file_url = formats[0]['url'] + if '.ism/' in file_url: + replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url) + formats.extend(self._extract_f4m_formats( + replace_url('.f4m'), video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_ism_formats( + replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) + self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, + 'thumbnail': data.get('PosterUrl'), + 'description': data.get('Description'), } From 0086726e8674e9edec0682e7a84275c3c25ce646 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 19:48:34 +0100 Subject: [PATCH 0120/1705] [videodetective] fix extraction --- youtube_dl/extractor/videodetective.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index a19411a05..fe70db713 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse from .internetvideoarchive import InternetVideoArchiveIE @@ -13,7 +12,7 @@ class VideoDetectiveIE(InfoExtractor): 'info_dict': { 'id': '194487', 'ext': 'mp4', - 'title': 'KICK-ASS 2', + 'title': 'Kick-Ass 2', 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', }, 'params': { @@ -24,7 +23,7 @@ class VideoDetectiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - og_video = self._og_search_video_url(webpage) - query = compat_urlparse.urlparse(og_video).query 
- return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) + query = 'customerid=69249&publishedid=' + video_id + return self.url_result( + InternetVideoArchiveIE._build_json_url(query), + ie=InternetVideoArchiveIE.ie_key()) From cfabc505984acb3830aeac7759d913bb885d64b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 22:55:01 +0100 Subject: [PATCH 0121/1705] [mtv] fix extraction for mtv.de (closes #22113) --- youtube_dl/extractor/mtv.py | 51 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 7a3b57abd..7e95ca18e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -425,14 +425,14 @@ class MTVVideoIE(MTVServicesInfoExtractor): class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' - _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P\d+)-[^/#?]+/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P[0-9a-z]+)' _TESTS = [{ - 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum', 'info_dict': { - 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'flv', - 'title': 'MusicVideo_cro-traum', - 'description': 'Cro - Traum', + 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5', + 'ext': 'mp4', + 'title': 'Traum', + 'description': 'Traum', }, 'params': { # rtmp download @@ -441,11 +441,12 @@ class MTVDEIE(MTVServicesInfoExtractor): 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. 
http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) - 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', + 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1', 'info_dict': { - 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'flv', - 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', + 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Teen Mom 2', + 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7', }, 'params': { # rtmp download @@ -453,7 +454,7 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Blocked at Travis CI', }, { - 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', + 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', 'ext': 'mp4', @@ -466,25 +467,11 @@ class MTVDEIE(MTVServicesInfoExtractor): }, 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] + _GEO_COUNTRIES = ['DE'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist = self._parse_json( - self._search_regex( - r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), - video_id) - - def _mrss_url(item): - return item['mrss'] + item.get('mrssvars', '') - - # news pages contain single video in playlist with different id - if len(playlist) == 1: - return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) - - for item in playlist: - item_id = item.get('id') - if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(_mrss_url(item), video_id) + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.de', + 'mgid': uri, + } From 3cdcebf5470a56df7d52e6f8acbcde5b4b9f0241 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 
23:31:14 +0100 Subject: [PATCH 0122/1705] [mtv] add support for mtvjapan.com --- youtube_dl/extractor/mtv.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 7e95ca18e..fedd5f46b 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -349,33 +350,29 @@ class MTVIE(MTVServicesInfoExtractor): }] -class MTV81IE(InfoExtractor): - IE_NAME = 'mtv81' - _VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P[^/?#.]+)' +class MTVJapanIE(MTVServicesInfoExtractor): + IE_NAME = 'mtvjapan' + _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P[0-9a-z]+)' _TEST = { - 'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/', - 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade', 'info_dict': { - 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5', 'ext': 'mp4', - 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', - 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', - 'timestamp': 1468846800, - 'upload_date': '20160718', + 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition', + }, + 'params': { + 'skip_download': True, }, } + _GEO_COUNTRIES = ['JP'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' - def _extract_mgid(self, webpage): - return self._search_regex( - r'getTheVideo\((["\'])(?Pmgid:.+?)\1', webpage, - 'mgid', group='id') - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtvjapan.com', + 'mgid': uri, + } 
class MTVVideoIE(MTVServicesInfoExtractor): From 01358b9fc198cafb619a03ed5ad7865a74805611 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Oct 2019 23:34:31 +0100 Subject: [PATCH 0123/1705] [extractors] add import for MTVJapanIE --- youtube_dl/extractor/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 114ede8b9..c10bcbcc1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -666,7 +666,7 @@ from .mtv import ( MTVVideoIE, MTVServicesEmbeddedIE, MTVDEIE, - MTV81IE, + MTVJapanIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE From dd90a21c28cb1ec592e5961a5f67556edfb3ce87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:49:36 +0700 Subject: [PATCH 0124/1705] [go] Add support for abc.com and freeform.com (closes #22823, closes #22864) --- youtube_dl/extractor/go.py | 44 ++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 03e48f4ea..107059023 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -40,8 +40,8 @@ class GoIE(AdobePassIE): 'resource_id': 'Disney', } } - _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pdisneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ - % '|'.join(list(_SITE_INFO.keys()) + ['disneynow']) + _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pabc|freeform|disneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ + % '|'.join(list(_SITE_INFO.keys())) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { @@ -54,6 +54,7 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + 'skip': 'This content is no longer available.', }, { 'url': 'http://watchdisneyxd.go.com/doraemon', 'info_dict': { @@ -61,6 
+62,34 @@ class GoIE(AdobePassIE): 'id': 'SH55574025', }, 'playlist_mincount': 51, + }, { + 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', + 'info_dict': { + 'id': 'VDKA3609139', + 'ext': 'mp4', + 'title': 'This Guilty Blood', + 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'info_dict': { + 'id': 'VDKA13435179', + 'ext': 'mp4', + 'title': 'The Bet', + 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'age_limit': 14, + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -95,10 +124,13 @@ class GoIE(AdobePassIE): if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) video_id = self._search_regex( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', - default=video_id) + ( + # There may be inner quotes, e.g. 
data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From aef9f87ea4dcfe483c5b776f1c37310766ad818d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:52:15 +0700 Subject: [PATCH 0125/1705] [go] Improve and beautify _VALID_URL --- youtube_dl/extractor/go.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 107059023..03cfba91f 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -40,8 +40,17 @@ class GoIE(AdobePassIE): 'resource_id': 'Disney', } } - _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pabc|freeform|disneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ - % '|'.join(list(_SITE_INFO.keys())) + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?P%s)\.)?go| + (?Pabc|freeform|disneynow) + )\.com/ + (?: + (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| + (?:[^/]+/)*(?P[^/?\#]+) + ) + ''' % '|'.join(list(_SITE_INFO.keys())) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { From 0d7392e68b7ebb7215651da0784e859d7bdff826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 05:54:32 +0700 Subject: [PATCH 0126/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/ChangeLog b/ChangeLog index 64233b03b..b664368a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,48 @@ +version + +Extractors ++ [go] Add support for abc.com and freeform.com (#22823, #22864) ++ [mtv] Add support for mtvjapan.com +* 
[mtv] Fix extraction for mtv.de (#22113) +* [videodetective] Fix extraction +* [internetvideoarchive] Fix extraction +* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) +- [hark] Remove extractor +- [tutv] Remove extractor +- [learnr] Remove extractor +- [macgamestore] Remove extractor +* [la7] Update Kaltura service URL (#22358) +* [thesun] Fix extraction (#16966) +- [makertv] Remove extractor ++ [tenplay] Add support for 10play.com.au (#21446) +* [soundcloud] Improve extraction + * Improve format extraction (#22123) + + Extract uploader_id and uploader_url (#21916) + + Extract all known thumbnails (#19071, #20659) + * Fix extration for private playlists (#20976) + + Add support for playlist embeds (#20976) + * Skip preview formats (#22806) +* [dplay] Improve extraction + + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) + * Fix it.dplay.com extraction (#22826) + + Extract creator, tags and thumbnails + * Handle playback API call errors ++ [discoverynetworks] Add support for dplay.co.uk +* [vk] Improve extraction + + Add support for Odnoklassniki embeds + + Extract more videos from user lists (#4470) + + Fix wall post audio extraction (#18332) + * Improve error detection (#22568) ++ [odnoklassniki] Add support for embeds +* [puhutv] Improve extraction + * Fix subtitles extraction + * Transform HLS URLs to HTTP URLs + * Improve metadata extraction +* [ceskatelevize] Skip DRM media ++ [facebook] Extract subtitles (#22777) +* [globo] Handle alternative hash signing method + + version 2019.10.22 Core From 53896ca5be9a629c2cbaceb3fe43c707bb217437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:10:20 +0700 Subject: [PATCH 0127/1705] [utils] Actualize major IPv4 address blocks per country --- youtube_dl/utils.py | 71 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 53117ea90..aed988b88 100644 --- 
a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4979,7 +4979,7 @@ class ISO3166Utils(object): class GeoUtils(object): # Major IPv4 address blocks per country _country_ip_map = { - 'AD': '85.94.160.0/19', + 'AD': '46.172.224.0/19', 'AE': '94.200.0.0/13', 'AF': '149.54.0.0/17', 'AG': '209.59.64.0/18', @@ -4987,28 +4987,30 @@ class GeoUtils(object): 'AL': '46.99.0.0/16', 'AM': '46.70.0.0/15', 'AO': '105.168.0.0/13', - 'AP': '159.117.192.0/21', + 'AP': '182.50.184.0/21', + 'AQ': '23.154.160.0/24', 'AR': '181.0.0.0/12', 'AS': '202.70.112.0/20', - 'AT': '84.112.0.0/13', + 'AT': '77.116.0.0/14', 'AU': '1.128.0.0/11', 'AW': '181.41.0.0/18', - 'AZ': '5.191.0.0/16', + 'AX': '185.217.4.0/22', + 'AZ': '5.197.0.0/16', 'BA': '31.176.128.0/17', 'BB': '65.48.128.0/17', 'BD': '114.130.0.0/16', 'BE': '57.0.0.0/8', - 'BF': '129.45.128.0/17', + 'BF': '102.178.0.0/15', 'BG': '95.42.0.0/15', 'BH': '37.131.0.0/17', 'BI': '154.117.192.0/18', 'BJ': '137.255.0.0/16', - 'BL': '192.131.134.0/24', + 'BL': '185.212.72.0/23', 'BM': '196.12.64.0/18', 'BN': '156.31.0.0/16', 'BO': '161.56.0.0/16', 'BQ': '161.0.80.0/20', - 'BR': '152.240.0.0/12', + 'BR': '191.128.0.0/12', 'BS': '24.51.64.0/18', 'BT': '119.2.96.0/19', 'BW': '168.167.0.0/16', @@ -5016,20 +5018,20 @@ class GeoUtils(object): 'BZ': '179.42.192.0/18', 'CA': '99.224.0.0/11', 'CD': '41.243.0.0/16', - 'CF': '196.32.200.0/21', - 'CG': '197.214.128.0/17', + 'CF': '197.242.176.0/21', + 'CG': '160.113.0.0/16', 'CH': '85.0.0.0/13', - 'CI': '154.232.0.0/14', + 'CI': '102.136.0.0/14', 'CK': '202.65.32.0/19', 'CL': '152.172.0.0/14', - 'CM': '165.210.0.0/15', + 'CM': '102.244.0.0/14', 'CN': '36.128.0.0/10', 'CO': '181.240.0.0/12', 'CR': '201.192.0.0/12', 'CU': '152.206.0.0/15', 'CV': '165.90.96.0/19', 'CW': '190.88.128.0/17', - 'CY': '46.198.0.0/15', + 'CY': '31.153.0.0/16', 'CZ': '88.100.0.0/14', 'DE': '53.0.0.0/8', 'DJ': '197.241.0.0/17', @@ -5046,6 +5048,7 @@ class GeoUtils(object): 'EU': '2.16.0.0/13', 'FI': '91.152.0.0/13', 'FJ': 
'144.120.0.0/16', + 'FK': '80.73.208.0/21', 'FM': '119.252.112.0/20', 'FO': '88.85.32.0/19', 'FR': '90.0.0.0/9', @@ -5055,8 +5058,8 @@ class GeoUtils(object): 'GE': '31.146.0.0/16', 'GF': '161.22.64.0/18', 'GG': '62.68.160.0/19', - 'GH': '45.208.0.0/14', - 'GI': '85.115.128.0/19', + 'GH': '154.160.0.0/12', + 'GI': '95.164.0.0/16', 'GL': '88.83.0.0/19', 'GM': '160.182.0.0/15', 'GN': '197.149.192.0/18', @@ -5085,13 +5088,13 @@ class GeoUtils(object): 'JE': '87.244.64.0/18', 'JM': '72.27.0.0/17', 'JO': '176.29.0.0/16', - 'JP': '126.0.0.0/8', + 'JP': '133.0.0.0/8', 'KE': '105.48.0.0/12', 'KG': '158.181.128.0/17', 'KH': '36.37.128.0/17', 'KI': '103.25.140.0/22', 'KM': '197.255.224.0/20', - 'KN': '198.32.32.0/19', + 'KN': '198.167.192.0/19', 'KP': '175.45.176.0/22', 'KR': '175.192.0.0/10', 'KW': '37.36.0.0/14', @@ -5099,10 +5102,10 @@ class GeoUtils(object): 'KZ': '2.72.0.0/13', 'LA': '115.84.64.0/18', 'LB': '178.135.0.0/16', - 'LC': '192.147.231.0/24', + 'LC': '24.92.144.0/20', 'LI': '82.117.0.0/19', 'LK': '112.134.0.0/15', - 'LR': '41.86.0.0/19', + 'LR': '102.183.0.0/16', 'LS': '129.232.0.0/17', 'LT': '78.56.0.0/13', 'LU': '188.42.0.0/16', @@ -5127,7 +5130,7 @@ class GeoUtils(object): 'MT': '46.11.0.0/16', 'MU': '105.16.0.0/12', 'MV': '27.114.128.0/18', - 'MW': '105.234.0.0/16', + 'MW': '102.70.0.0/15', 'MX': '187.192.0.0/11', 'MY': '175.136.0.0/13', 'MZ': '197.218.0.0/15', @@ -5158,23 +5161,23 @@ class GeoUtils(object): 'PW': '202.124.224.0/20', 'PY': '181.120.0.0/14', 'QA': '37.210.0.0/15', - 'RE': '139.26.0.0/16', + 'RE': '102.35.0.0/16', 'RO': '79.112.0.0/13', - 'RS': '178.220.0.0/14', + 'RS': '93.86.0.0/15', 'RU': '5.136.0.0/13', - 'RW': '105.178.0.0/15', + 'RW': '41.186.0.0/16', 'SA': '188.48.0.0/13', 'SB': '202.1.160.0/19', 'SC': '154.192.0.0/11', - 'SD': '154.96.0.0/13', + 'SD': '102.120.0.0/13', 'SE': '78.64.0.0/12', - 'SG': '152.56.0.0/14', + 'SG': '8.128.0.0/10', 'SI': '188.196.0.0/14', 'SK': '78.98.0.0/15', - 'SL': '197.215.0.0/17', + 'SL': 
'102.143.0.0/17', 'SM': '89.186.32.0/19', 'SN': '41.82.0.0/15', - 'SO': '197.220.64.0/19', + 'SO': '154.115.192.0/18', 'SR': '186.179.128.0/17', 'SS': '105.235.208.0/21', 'ST': '197.159.160.0/19', @@ -5197,15 +5200,15 @@ class GeoUtils(object): 'TV': '202.2.96.0/19', 'TW': '120.96.0.0/11', 'TZ': '156.156.0.0/14', - 'UA': '93.72.0.0/13', - 'UG': '154.224.0.0/13', - 'US': '3.0.0.0/8', + 'UA': '37.52.0.0/14', + 'UG': '102.80.0.0/13', + 'US': '6.0.0.0/8', 'UY': '167.56.0.0/13', - 'UZ': '82.215.64.0/18', + 'UZ': '84.54.64.0/18', 'VA': '212.77.0.0/19', - 'VC': '24.92.144.0/20', + 'VC': '207.191.240.0/21', 'VE': '186.88.0.0/13', - 'VG': '172.103.64.0/18', + 'VG': '66.81.192.0/20', 'VI': '146.226.0.0/16', 'VN': '14.160.0.0/11', 'VU': '202.80.32.0/20', @@ -5214,8 +5217,8 @@ class GeoUtils(object): 'YE': '134.35.0.0/16', 'YT': '41.242.116.0/22', 'ZA': '41.0.0.0/11', - 'ZM': '165.56.0.0/13', - 'ZW': '41.85.192.0/19', + 'ZM': '102.144.0.0/13', + 'ZW': '102.177.192.0/18', } @classmethod From cae0bbc53831eed38c4af3755de43e223c503270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:11:09 +0700 Subject: [PATCH 0128/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index b664368a1..2957b7ced 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ version +Core +* [utils] Actualize major IPv4 address blocks per country + Extractors + [go] Add support for abc.com and freeform.com (#22823, #22864) + [mtv] Add support for mtvjapan.com From c4bd9cb7bb57c6e4bbc04fb054dfea14d4ecb171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Oct 2019 06:12:33 +0700 Subject: [PATCH 0129/1705] release 2019.10.29 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- 
.github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 10 +++------- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f1afe704c..f82502bd1 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.22 + [debug] youtube-dl version 2019.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a4dc9b005..5ef983d43 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked 
that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 5bf86adce..8f05aa79f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7aa5534e5..e90900d8d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.22 + [debug] youtube-dl version 2019.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md 
b/.github/ISSUE_TEMPLATE/5_feature_request.md index 5d3645e3d..7021d7397 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.22** +- [ ] I've verified that I'm running youtube-dl version **2019.10.29** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 2957b7ced..fcab1102c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.10.29 Core * [utils] Actualize major IPv4 address blocks per country diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a1b0edeeb..af905db5a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -232,7 +232,6 @@ - **DouyuShow** - **DouyuTV**: 斗鱼 - **DPlay** - - **DPlayIt** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -339,7 +338,6 @@ - **Goshgay** - **GPUTechConf** - **Groupon** - - **Hark** - **hbo** - **HearThisAt** - **Heise** @@ -432,7 +430,6 @@ - **Lcp** - **LcpPlay** - **Le**: 乐视网 - - **Learnr** - **Lecture2Go** - **Lecturio** - **LecturioCourse** @@ -466,11 +463,9 @@ - **lynda**: lynda.com videos - **lynda:course**: lynda.com online courses - **m6** - - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - - **MakerTV** - **MallTV** - **mangomolo:live** - **mangomolo:video** @@ -526,8 +521,8 @@ - **mtg**: MTG services - **mtv** - **mtv.de** - - **mtv81** - **mtv:video** + - **mtvjapan** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -815,6 +810,7 @@ - **soundcloud:set** - **soundcloud:trackstation** - **soundcloud:user** + - **SoundcloudEmbed** - **soundgasm** - **soundgasm:profile** - **southpark.cc.com** @@ -887,6 +883,7 @@ - **TeleTask** - **Telewebion** - **TennisTV** + - **TenPlay** - 
**TF1** - **TFO** - **TheIntercept** @@ -925,7 +922,6 @@ - **tunein:topic** - **TunePk** - **Turbo** - - **Tutv** - **tv.dfb.de** - **TV2** - **tv2.hu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 39b355b9e..924f26ca8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.22' +__version__ = '2019.10.29' From 7455832f311843663b416968b9e5a0a0c6134d8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Oct 2019 09:43:17 +0100 Subject: [PATCH 0130/1705] [fox9] fix extraction --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/fox9.py | 43 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c10bcbcc1..15f96fb8f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -367,7 +367,10 @@ from .fourtube import ( FuxIE, ) from .fox import FOXIE -from .fox9 import FOX9IE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) from .foxgay import FoxgayIE from .foxnews import ( FoxNewsIE, diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py index 17dfffa7b..91f8f7b8a 100644 --- a/youtube_dl/extractor/fox9.py +++ b/youtube_dl/extractor/fox9.py @@ -1,13 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -from .anvato import AnvatoIE +from .common import InfoExtractor -class FOX9IE(AnvatoIE): - _VALID_URL = r'https?://(?:www\.)?fox9\.com/(?:[^/]+/)+(?P\d+)-story' - _TESTS = [{ - 'url': 'http://www.fox9.com/news/215123287-story', +class FOX9IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fox9\.com/video/(?P\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'anvato:anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b:' + video_id, + 'Anvato', video_id) + + +class FOX9NewsIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?fox9\.com/news/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.fox9.com/news/black-bear-in-tree-draws-crowd-in-downtown-duluth-minnesota', 'md5': 'd6e1b2572c3bab8a849c9103615dd243', 'info_dict': { 'id': '314473', @@ -21,22 +31,11 @@ class FOX9IE(AnvatoIE): 'categories': ['News', 'Sports'], 'tags': ['news', 'video'], }, - }, { - 'url': 'http://www.fox9.com/news/investigators/214070684-story', - 'only_matching': True, - }] + } def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_id = self._parse_json( - self._search_regex( - r"this\.videosJson\s*=\s*'(\[.+?\])';", - webpage, 'anvato playlist'), - video_id)[0]['video'] - - return self._get_anvato_videos( - 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b', - video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + anvato_id = self._search_regex( + r'anvatoId\s*:\s*[\'"](\d+)', webpage, 'anvato id') + return self.url_result('https://www.fox9.com/video/' + anvato_id, 'FOX9') From 8989349e6dcaa98204f77fb9f1e15a86eecb823d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Oct 2019 09:44:07 +0100 Subject: [PATCH 0131/1705] [onet] improve extraction - add support for onet100.vod.pl domain - extract m3u8 formats - correct audio only format info --- youtube_dl/extractor/onet.py | 54 ++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 58da1bc27..e55b2ac89 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -20,6 +20,8 @@ from ..utils import ( class OnetBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/' + def _search_mvp_id(self, webpage): return self._search_regex( r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') @@ -45,7 +47,7 @@ class OnetBaseIE(InfoExtractor): video = 
response['result'].get('0') formats = [] - for _, formats_dict in video['formats'].items(): + for format_type, formats_dict in video['formats'].items(): if not isinstance(formats_dict, dict): continue for format_id, format_list in formats_dict.items(): @@ -56,21 +58,31 @@ class OnetBaseIE(InfoExtractor): if not video_url: continue ext = determine_ext(video_url) - if format_id == 'ism': + if format_id.startswith('ism'): formats.extend(self._extract_ism_formats( video_url, video_id, 'mss', fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) + elif format_id.startswith('hls'): + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - formats.append({ + http_f = { 'url': video_url, 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) + } + if format_type == 'audio': + http_f['vcodec'] = 'none' + else: + http_f.update({ + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + formats.append(http_f) self._sort_formats(formats) meta = video.get('meta', {}) @@ -105,12 +117,12 @@ class OnetMVPIE(OnetBaseIE): class OnetIE(OnetBaseIE): - _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' - _TEST = { + _TESTS = [{ 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', - 'md5': 'e3ffbf47590032ac3f27249204173d50', + 'md5': '436102770fb095c75b8bb0392d3da9ff', 'info_dict': { 'id': 'qbpyqc', 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', @@ -120,7 
+132,10 @@ class OnetIE(OnetBaseIE): 'upload_date': '20160705', 'timestamp': 1467721580, }, - } + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -140,18 +155,21 @@ class OnetIE(OnetBaseIE): class OnetChannelIE(OnetBaseIE): - _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P[a-z]+)(?:[?#]|$)' + _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P[a-z]+)(?:[?#]|$)' IE_NAME = 'onet.tv:channel' - _TEST = { + _TESTS = [{ 'url': 'http://onet.tv/k/openerfestival', 'info_dict': { 'id': 'openerfestival', - 'title': 'Open\'er Festival Live', - 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.', + 'title': "Open'er Festival", + 'description': "Tak było na Open'er Festival 2016! Oglądaj nasze reportaże i wywiady z artystami.", }, - 'playlist_mincount': 46, - } + 'playlist_mincount': 35, + }, { + 'url': 'https://onet100.vod.pl/k/openerfestival', + 'only_matching': True, + }] def _real_extract(self, url): channel_id = self._match_id(url) @@ -173,7 +191,7 @@ class OnetChannelIE(OnetBaseIE): 'Downloading channel %s - add --no-playlist to just download video %s' % ( channel_id, video_name)) matches = re.findall( - r']+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', + r']+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, webpage) entries = [ self.url_result(video_link, OnetIE.ie_key()) From c56b2ac43ca27b32fb4f7b230d851a61b5fc7cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Oct 2019 02:21:03 +0700 Subject: [PATCH 0132/1705] [tv2dk] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tv2dk.py | 82 ++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 youtube_dl/extractor/tv2dk.py diff --git a/youtube_dl/extractor/extractors.py 
b/youtube_dl/extractor/extractors.py index 15f96fb8f..5d20ba863 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1189,6 +1189,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv2dk import TV2DKIE from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py new file mode 100644 index 000000000..eb39424df --- /dev/null +++ b/youtube_dl/extractor/tv2dk.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class TV2DKIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + tvsyd| + tv2ostjylland| + tvmidtvest| + tv2fyn| + tv2east| + tv2lorry| + tv2nord + )\.dk/ + (:[^/]+/)* + (?P[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player', + 'info_dict': { + 'id': '0_52jmwa0p', + 'ext': 'mp4', + 'title': '19:30 - 28. okt. 
2019', + 'timestamp': 1572290248, + 'upload_date': '20191028', + 'uploader_id': 'tvsyd', + 'duration': 1347, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi', + 'only_matching': True, + }, { + 'url': 'https://www.tv2ostjylland.dk/nyheder/28-10-2019/22/2200-nyhederne-mandag-d-28-oktober-2019?autoplay=1#player', + 'only_matching': True, + }, { + 'url': 'https://www.tvmidtvest.dk/nyheder/27-10-2019/1930/1930-27-okt-2019', + 'only_matching': True, + }, { + 'url': 'https://www.tv2fyn.dk/artikel/fyn-kan-faa-landets-foerste-fabrik-til-groent-jetbraendstof', + 'only_matching': True, + }, { + 'url': 'https://www.tv2east.dk/artikel/gods-faar-indleveret-tonsvis-af-aebler-100-kilo-aebler-gaar-til-en-aeblebrandy', + 'only_matching': True, + }, { + 'url': 'https://www.tv2lorry.dk/koebenhavn/rasmus-paludan-evakueret-til-egen-demonstration#player', + 'only_matching': True, + }, { + 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + entries = [] + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): + video = extract_attributes(video_el) + kaltura_id = video.get('data-entryid') + if not kaltura_id: + continue + partner_id = video.get('data-partnerid') + if not partner_id: + continue + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + return self.playlist_result(entries) From 9a621ddc3a42769f107f8bd0d67b2c7073ea8256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Oct 2019 02:21:52 +0700 Subject: [PATCH 0133/1705] [tv2] Fix and improve extraction (closes #22787) --- youtube_dl/extractor/tv2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index d5071e8a5..1b6590767 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -11,6 +11,7 @@ from ..utils import ( js_to_json, parse_iso8601, remove_end, + try_get, ) @@ -44,7 +45,14 @@ class TV2IE(InfoExtractor): data = self._download_json( 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), video_id, 'Downloading play JSON')['playback'] - for item in data['items']['item']: + items = try_get(data, lambda x: x['items']['item']) + if not items: + continue + if not isinstance(items, list): + items = [items] + for item in items: + if not isinstance(item, dict): + continue video_url = item.get('url') if not video_url or video_url in format_urls: continue From 45f4a433894556301204b704caca7d6a14286287 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Oct 2019 23:07:35 +0100 Subject: [PATCH 0134/1705] [yahoo] improve extraction - add support for live streams(closes #3597)(closes #3779)(closes #22178) - bypass cookie consent page for european domains(closes #16948)(closes #22576) - add generic support for embeds(closes #20332) --- youtube_dl/extractor/yahoo.py | 672 +++++++++++++--------------------- 1 file changed, 264 insertions(+), 408 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e5ebdd180..ee68096d0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -3,453 +3,309 @@ from __future__ import unicode_literals import hashlib import itertools -import json import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_str, compat_urllib_parse, - compat_urlparse, ) from ..utils import ( clean_html, - determine_ext, - ExtractorError, - extract_attributes, int_or_none, mimetype2ext, + parse_iso8601, smuggle_url, try_get, - unescapeHTML, url_or_none, ) -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) 
-from .nbc import NBCSportsVPlayerIE +from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P.+)?-)?(?P[0-9]+)(?:-[a-z]+)?(?:\.html)?' - _TESTS = [ - { - 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'info_dict': { - 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', - 'ext': 'mp4', - 'title': 'Julian Smith & Travis Legg Watch Julian Smith', - 'description': 'Julian and Travis watch Julian Smith', - 'duration': 6863, - }, + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)\.html)' + _TESTS = [{ + 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', + 'title': 'Julian Smith & Travis Legg Watch Julian Smith', + 'description': 'Julian and Travis watch Julian Smith', + 'duration': 6863, + 'timestamp': 1369812016, + 'upload_date': '20130529', }, - { - 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': '251af144a19ebc4a033e8ba91ac726bb', - 'info_dict': { - 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', - 'ext': 'mp4', - 'title': 'Codefellas - The Cougar Lies with Spanish Moss', - 'description': 'md5:66b627ab0a282b26352136ca96ce73c1', - 'duration': 151, - }, - 'skip': 'HTTP Error 404', + }, { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '7993e572fac98e044588d0b5260f4352', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + 'duration': 170, + 'timestamp': 1406838636, + 'upload_date': '20140731', }, - { - 'url': 
'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '7993e572fac98e044588d0b5260f4352', - 'info_dict': { - 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', - 'ext': 'mp4', - 'title': "Yahoo Saves 'Community'", - 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', - 'duration': 170, - } - }, - { - 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', - 'info_dict': { - 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', - 'ext': 'mp4', - 'title': '敢問市長/黃秀霜批賴清德「非常高傲」', - 'description': '直言台南沒捷運 交通居五都之末', - 'duration': 396, - }, - }, - { - 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '71298482f7c64cbb7fa064e4553ff1c1', - 'info_dict': { - 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'webm', - 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', - 'description': 'md5:f66c890e1490f4910a9953c941dee944', - 'duration': 97, - } - }, - { - 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html', - 'md5': '57e06440778b1828a6079d2f744212c4', - 'info_dict': { - 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73', - 'ext': 'mp4', - 'title': 'Program that makes hockey more affordable not offered in Manitoba', - 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', - 'duration': 121, - }, - 'skip': 'Video gone', - }, { - 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', - 'info_dict': { - 'id': '154609075', - }, - 'playlist': [{ - 'md5': '000887d0dc609bc3a47c974151a40fb8', - 'info_dict': { - 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: War', - 'description': 'The Interview', - 'duration': 30, - }, - }, { - 'md5': '81bc74faf10750fe36e4542f9a184c66', - 
'info_dict': { - 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: Guys', - 'description': 'The Interview', - 'duration': 30, - }, - }], - }, { - 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '88e209b417f173d86186bef6e4d1f160', - 'info_dict': { - 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', - 'ext': 'mp4', - 'title': 'China Moses Is Crazy About the Blues', - 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', - 'duration': 128, - } - }, { - 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html', - 'md5': 'd9a083ccf1379127bf25699d67e4791b', - 'info_dict': { - 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c', - 'ext': 'mp4', - 'title': 'Connect the Dots: Dark Side of Virgo', - 'description': 'md5:1428185051cfd1949807ad4ff6d3686a', - 'duration': 201, - }, - 'skip': 'Domain name in.lifestyle.yahoo.com gone', - }, { - 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '989396ae73d20c6f057746fb226aa215', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': '\'True Story\' Trailer', - 'description': 'True Story', - 'duration': 150, - }, - }, { - 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', - 'only_matching': True, - }, { - 'note': 'NBC Sports embeds', - 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', - 'info_dict': { - 'id': '9CsDKds0kvHI', - 'ext': 'flv', - 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', - 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', - 'upload_date': '20150313', - 'uploader': 'NBCU-SPORTS', - 'timestamp': 1426270238, - } - }, { - 'url': 'https://tw.news.yahoo.com/-100120367.html', - 'only_matching': True, - }, { - # Query result is embedded in webpage, but 
explicit request to video API fails with geo restriction - 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', - 'info_dict': { - 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', - 'ext': 'mp4', - 'title': 'Communitary - Community Episode 1: Ladders', - 'description': 'md5:8fc39608213295748e1e289807838c97', - 'duration': 1646, - }, - }, { - # it uses an alias to get the video_id - 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html', - 'info_dict': { - 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737', - 'ext': 'mp4', - 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking', - 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', - }, - }, - { - # config['models']['applet_model']['data']['sapi'] has no query - 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016', - 'md5': 'dac0c72d502bc5facda80c9e6d5c98db', - 'info_dict': { - 'id': 'a6015640-e9e5-3efb-bb60-05589a183919', - 'ext': 'mp4', - 'description': 'Galactic', - 'title': 'Dolla Diva (feat. 
Maggie Koerner)', - }, - 'skip': 'redirect to https://www.yahoo.com/music', - }, - { - # yahoo://article/ - 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html', - 'info_dict': { - 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', - 'ext': 'mp4', - 'title': "'True Story' Trailer", - 'description': 'True Story', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # ytwnews://cavideo/ - 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', - 'info_dict': { - 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', - 'ext': 'mp4', - 'title': '單車天使 - 中文版預', - 'description': '中文版預', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/', - 'info_dict': { - 'id': '5575377707001', - 'ext': 'mp4', - 'title': "Clown entertainers say 'It' is hurting their business", - 'description': 'Stephen King s horror film has much to answer for. 
Jelby and Mr Loopy the Clowns join us.', - 'timestamp': 1505341164, - 'upload_date': '20170913', - 'uploader_id': '2376984109001', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # custom brightcove, geo-restricted to Australia, bypassable - 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/', - 'only_matching': True, + }, { + 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', + 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'info_dict': { + 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', + 'ext': 'mp4', + 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', + 'description': 'md5:f66c890e1490f4910a9953c941dee944', + 'duration': 97, + 'timestamp': 1414489862, + 'upload_date': '20141028', } - ] + }, { + 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + 'md5': '88e209b417f173d86186bef6e4d1f160', + 'info_dict': { + 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', + 'ext': 'mp4', + 'title': 'China Moses Is Crazy About the Blues', + 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', + 'duration': 128, + 'timestamp': 1385722202, + 'upload_date': '20131129', + } + }, { + 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', + 'info_dict': { + 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', + 'ext': 'mp4', + 'title': '\'True Story\' Trailer', + 'description': 'True Story', + 'duration': 150, + 'timestamp': 1418919206, + 'upload_date': '20141218', + }, + }, { + 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', + 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 
'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, + }, + }, { + 'url': 'https://tw.news.yahoo.com/-100120367.html', + 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + 'timestamp': 1440436550, + 'upload_date': '20150824', + 'series': 'Communitary', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + # ytwnews://cavideo/ + 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', + 'info_dict': { + 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', + 'ext': 'mp4', + 'title': '單車天使 - 中文版預', + 'description': '中文版預', + 'timestamp': 1476696196, + 'upload_date': '20161017', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Contains both a Yahoo hosted video and multiple Youtube embeds + 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', + 'info_dict': { + 'id': '46c5d95a-528f-3d03-b732-732fcadd51de', + 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', + 'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', + }, + 'playlist': [{ + 'info_dict': { + 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', + 'ext': 'mp4', + 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', + 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor 
Swift which Sia hit was almost hers.', + 'timestamp': 1572406500, + 'upload_date': '20191030', + }, + }, { + 'info_dict': { + 'id': '352CFDOQrKg', + 'ext': 'mp4', + 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', + 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11', + 'uploader': 'The Voice', + 'uploader_id': 'NBCTheVoice', + 'upload_date': '20191029', + }, + }], + 'params': { + 'playlistend': 2, + }, + }, { + 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', + 'only_matching': True, + }, { + 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - display_id = mobj.group('display_id') or page_id - host = mobj.group('host') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'err=404' in urlh.geturl(): - raise ExtractorError('Video gone', expected=True) - - # Look for iframed media first - entries = [] - iframe_urls = re.findall(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) - for idx, iframe_url in enumerate(iframe_urls): - entries.append(self.url_result(host + iframe_url, 'Yahoo')) - if entries: - return self.playlist_result(entries, page_id) - - # Look for NBCSports iframes - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) - - # Look for Brightcove Legacy Studio embeds - bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if bc_url: - return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - - def brightcove_url_result(bc_url): - return self.url_result( - smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}), - BrightcoveNewIE.ie_key()) - - # Look for Brightcove New Studio embeds - bc_url = BrightcoveNewIE._extract_url(self, webpage) - if 
bc_url: - return brightcove_url_result(bc_url) - - brightcove_iframe = self._search_regex( - r'(]+data-video-id=["\']\d+[^>]+>)', webpage, - 'brightcove iframe', default=None) - if brightcove_iframe: - attr = extract_attributes(brightcove_iframe) - src = attr.get('src') - if src: - parsed_src = compat_urlparse.urlparse(src) - qs = compat_urlparse.parse_qs(parsed_src.query) - account_id = qs.get('accountId', ['2376984109001'])[0] - brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0] - if account_id and brightcove_id: - return brightcove_url_result( - 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - % (account_id, brightcove_id)) - - # Query result is often embedded in webpage as JSON. Sometimes explicit requests - # to video API results in a failure with geo restriction reason therefore using - # embedded query result when present sounds reasonable. - config_json = self._search_regex( - r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:|$)', - webpage, 'videoplayer applet', default=None) - if config_json: - config = self._parse_json(config_json, display_id, fatal=False) - if config: - sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') - if sapi and 'query' in sapi: - info = self._extract_info(display_id, sapi, webpage) - self._sort_formats(info['formats']) - return info - - items_json = self._search_regex( - r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, - default=None) - if items_json is None: - alias = self._search_regex( - r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None) - if alias is not None: - alias_info = self._download_json( - 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias, - display_id, 'Downloading alias info') - video_id = alias_info[0]['id'] - else: - CONTENT_ID_REGEXES = [ - r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', - 
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', - r'"first_videoid"\s*:\s*"([^"]+)"', - r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), - r']data-uuid=["\']([^"\']+)', - r']+yahoo://article/view\?.*\buuid=([^&"\']+)', - r']+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']', - ] - video_id = self._search_regex( - CONTENT_ID_REGEXES, webpage, 'content ID') + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' else: - items = json.loads(items_json) - info = items['mediaItems']['query']['results']['mediaObj'][0] - # The 'meta' field is not always in the video webpage, we request it - # from another page - video_id = info['id'] - return self._get_info(video_id, display_id, webpage) + country = country.split('-')[0] + api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - def _extract_info(self, display_id, query, webpage): - info = query['query']['results']['mediaObj'][0] - meta = info.get('meta') - video_id = info.get('id') + for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): + content = self._download_json( + api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, + display_id, 'Downloading content JSON metadata', fatal=i == 1) + if content: + item = content['items'][0] + break - if not meta: - msg = info['status'].get('msg') - if msg: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unable to extract media object meta') + if item.get('type') != 'video': + entries = [] + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in item.get('body', []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, 
item.get('uuid'), + item.get('title'), item.get('summary')) + + video_id = item['uuid'] + video = self._download_json( + api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + video_id, 'Downloading video JSON metadata')[0] + title = video['title'] + + if country == 'malaysia': + country = 'my' + + is_live = video.get('live_state') == 'live' + fmts = ('m3u8',) if is_live else ('web', 'mp4') + + urls = [] formats = [] - for s in info['streams']: - tbr = int_or_none(s.get('bitrate')) - format_info = { - 'width': int_or_none(s.get('width')), - 'height': int_or_none(s.get('height')), - 'tbr': tbr, - } - - host = s['host'] - path = s['path'] - if host.startswith('rtmp'): - fmt = 'rtmp' - format_info.update({ - 'url': host, - 'play_path': path, - 'ext': 'flv', - }) - else: - if s.get('format') == 'm3u8_playlist': - fmt = 'hls' - format_info.update({ - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - else: - fmt = format_info['ext'] = determine_ext(path) - format_url = compat_urlparse.urljoin(host, path) - format_info['url'] = format_url - format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') - formats.append(format_info) - - closed_captions = self._html_search_regex( - r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', - default='[]') - - cc_json = self._parse_json(closed_captions, video_id, fatal=False) subtitles = {} - if cc_json: - for closed_caption in cc_json: - lang = closed_caption['lang'] - if lang not in subtitles: - subtitles[lang] = [] - subtitles[lang].append({ - 'url': closed_caption['url'], - 'ext': mimetype2ext(closed_caption['content_type']), + for fmt in fmts: + media_obj = self._download_json( + 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + video_id, 'Downloading %s JSON metadata' % fmt, + headers=self.geo_verification_headers(), query={ + 'format': fmt, + 'region': country.upper(), + })['query']['results']['mediaObj'][0] + msg = media_obj.get('status', {}).get('msg') + + for s in 
media_obj.get('streams', []): + host = s.get('host') + path = s.get('path') + if not host or not path: + continue + s_url = host + path + if s.get('format') == 'm3u8': + formats.extend(self._extract_m3u8_formats( + s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + continue + tbr = int_or_none(s.get('bitrate')) + formats.append({ + 'url': s_url, + 'format_id': fmt + ('-%d' % tbr if tbr else ''), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'tbr': tbr, + 'fps': int_or_none(s.get('framerate')), }) + for cc in media_obj.get('closedcaptions', []): + cc_url = cc.get('url') + if not cc_url or cc_url in urls: + continue + urls.append(cc_url) + subtitles.setdefault(cc.get('lang') or 'en-US', []).append({ + 'url': cc_url, + 'ext': mimetype2ext(cc.get('content_type')), + }) + + streaming_url = video.get('streaming_url') + if streaming_url and not is_live: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats and msg == 'geo restricted': + self.raise_geo_restricted() + + self._sort_formats(formats) + + thumbnails = [] + for thumb in video.get('thumbnails', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'id': thumb.get('tag'), + 'url': thumb.get('url'), + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + series_info = video.get('series_info') or {} + return { 'id': video_id, - 'display_id': display_id, - 'title': unescapeHTML(meta['title']), + 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'description': clean_html(meta['description']), - 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), - 'duration': int_or_none(meta.get('duration')), + 'display_id': display_id, + 'thumbnails': thumbnails, + 'description': clean_html(video.get('description')), + 'timestamp': 
parse_iso8601(video.get('publish_time')), 'subtitles': subtitles, + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('view_count')), + 'is_live': is_live, + 'series': video.get('show_name'), + 'season_number': int_or_none(series_info.get('season_number')), + 'episode_number': int_or_none(series_info.get('episode_number')), } - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US').upper() - formats = [] - info = {} - for fmt in ('webm', 'mp4'): - query_result = self._download_json( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, - display_id, 'Downloading %s video info' % fmt, query={ - 'protocol': 'http', - 'region': region, - 'format': fmt, - }) - info = self._extract_info(display_id, query_result, webpage) - formats.extend(info['formats']) - formats.extend(self._extract_m3u8_formats( - 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - info['formats'] = formats - return info - class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From 8040a0d35e11f7b2bf6d698175ab0b12424d696f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Oct 2019 23:52:09 +0100 Subject: [PATCH 0135/1705] [yahoo] fix typo --- youtube_dl/extractor/yahoo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index ee68096d0..6c6bd76e8 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -51,10 +51,10 @@ class YahooIE(InfoExtractor): }, }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 
'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -164,6 +164,7 @@ class YahooIE(InfoExtractor): 'params': { 'playlistend': 2, }, + 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', 'only_matching': True, @@ -219,7 +220,7 @@ class YahooIE(InfoExtractor): country = 'my' is_live = video.get('live_state') == 'live' - fmts = ('m3u8',) if is_live else ('web', 'mp4') + fmts = ('m3u8',) if is_live else ('webm', 'mp4') urls = [] formats = [] From 237513e801671a51cc45d6a2fe5e7df69517958e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 07:38:53 +0100 Subject: [PATCH 0136/1705] [yahoo] restore support for cbs suffixed URLs --- test/test_all_urls.py | 6 ------ youtube_dl/extractor/yahoo.py | 5 ++++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 465ce0050..81056a999 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -123,12 +123,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs']) - def test_yahoo_https(self): - # https://github.com/ytdl-org/youtube-dl/issues/2701 - self.assertMatch( - 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', - ['Yahoo']) - def test_no_duplicated_ie_names(self): name_accu = collections.defaultdict(list) for ie in self.ies: diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 6c6bd76e8..f041cf5de 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -25,7 +25,7 @@ from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen 
and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)\.html)' + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)(?:-[a-z]+)?\.html)' _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { @@ -171,6 +171,9 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', 'only_matching': True, + }, { + 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', + 'only_matching': True, }] def _real_extract(self, url): From 3cf70bf1590ce364dc223197ba804cb70e704760 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 07:44:21 +0100 Subject: [PATCH 0137/1705] [yahoo] make cbs URL suffix part of the media alias --- youtube_dl/extractor/yahoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index f041cf5de..b9a9e88a0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -25,7 +25,7 @@ from .brightcove import BrightcoveNewIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+)(?:-[a-z]+)?\.html)' + _VALID_URL = r'(?Phttps?://(?:(?P[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { From e993f1a0959fc04507b1cb2efeb610ae628d6d98 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Oct 2019 08:13:10 +0100 Subject: [PATCH 0138/1705] [mixcloud] fix cloudcast data extraction(closes #22821) --- 
youtube_dl/extractor/mixcloud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index bf5353ef9..e5f631506 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -86,9 +86,10 @@ class MixcloudIE(InfoExtractor): r'', webpage, 'play info'), 'play info') for item in full_info_json: - item_data = try_get( - item, lambda x: x['cloudcast']['data']['cloudcastLookup'], - dict) + item_data = try_get(item, [ + lambda x: x['cloudcast']['data']['cloudcastLookup'], + lambda x: x['cloudcastLookup']['data']['cloudcastLookup'], + ], dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break From 274bf5e4c58bceed4ff8c283d77457bf1cb76d3e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 11:37:41 +0100 Subject: [PATCH 0139/1705] [kakao] improve extraction - support embed URLs - support Kakao Legacy vid based embed URLs - only extract fields used for extraction - strip description and extract tags --- youtube_dl/extractor/kakao.py | 45 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 7fa140b0c..96f918b75 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -6,14 +6,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + strip_or_none, unified_timestamp, update_url_query, ) class KakaoIE(InfoExtractor): - _VALID_URL = r'https?://tv\.kakao\.com/channel/(?P\d+)/cliplink/(?P\d+)' - _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' + _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P\d+|[^?#&]+@my)' + _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/' _TESTS = [{ 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', @@ -36,7 +37,7 @@ class 
KakaoIE(InfoExtractor): 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', 'uploader_id': 2653210, - 'uploader': '쇼 음악중심', + 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } @@ -44,6 +45,8 @@ class KakaoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + display_id = video_id.rstrip('@my') + api_base = self._API_BASE_TMPL % video_id player_header = { 'Referer': update_url_query( @@ -55,20 +58,22 @@ class KakaoIE(InfoExtractor): }) } - QUERY_COMMON = { + query = { 'player': 'monet_html5', 'referer': url, 'uuid': '', 'service': 'kakao_tv', 'section': '', 'dteType': 'PC', + 'fields': ','.join([ + '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', + 'description', 'channelId', 'createTime', 'duration', 'playCount', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', + 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault']) } - query = QUERY_COMMON.copy() - query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' impress = self._download_json( - '%s/%s/impress' % (self._API_BASE, video_id), - video_id, 'Downloading video info', + api_base + 'impress', display_id, 'Downloading video info', query=query, headers=player_header) clip_link = impress['clipLink'] @@ -78,30 +83,27 @@ class KakaoIE(InfoExtractor): tid = impress.get('tid', '') - query = QUERY_COMMON.copy() query.update({ + 'fields': '-*,outputList,profile,width,height,label,filesize', 'tid': tid, 'profile': 'HIGH', }) raw = self._download_json( - '%s/%s/raw' % (self._API_BASE, video_id), - video_id, 'Downloading video formats info', + api_base + 'raw', display_id, 'Downloading video formats info', query=query, headers=player_header) formats = [] for fmt in raw.get('outputList', []): try: profile_name = fmt['profile'] + query.update({ + 'profile': profile_name, + 'fields': '-*,url', + }) fmt_url_json = 
self._download_json( - '%s/%s/raw/videolocation' % (self._API_BASE, video_id), - video_id, + api_base + 'raw/videolocation', display_id, 'Downloading video URL for profile %s' % profile_name, - query={ - 'service': 'kakao_tv', - 'section': '', - 'tid': tid, - 'profile': profile_name - }, headers=player_header, fatal=False) + query=query, headers=player_header, fatal=False) if fmt_url_json is None: continue @@ -134,9 +136,9 @@ class KakaoIE(InfoExtractor): }) return { - 'id': video_id, + 'id': display_id, 'title': title, - 'description': clip.get('description'), + 'description': strip_or_none(clip.get('description')), 'uploader': clip_link.get('channel', {}).get('name'), 'uploader_id': clip_link.get('channelId'), 'thumbnails': thumbs, @@ -146,4 +148,5 @@ class KakaoIE(InfoExtractor): 'like_count': int_or_none(clip.get('likeCount')), 'comment_count': int_or_none(clip.get('commentCount')), 'formats': formats, + 'tags': clip.get('tagList'), } From d439989215fcb1672bc2ac18d4fb6206e12c387a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 11:43:18 +0100 Subject: [PATCH 0140/1705] [daum] fix VOD and Clip extracton(closes #15015) --- youtube_dl/extractor/daum.py | 106 +++++++++++------------------------ 1 file changed, 32 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 76f021892..137095577 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -2,25 +2,21 @@ from __future__ import unicode_literals -import re import itertools from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, compat_urlparse, ) -from ..utils import ( - int_or_none, - str_to_int, - xpath_text, - unescapeHTML, -) -class DaumIE(InfoExtractor): +class DaumBaseIE(InfoExtractor): + _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/' + + +class DaumIE(DaumBaseIE): _VALID_URL = 
r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P[^?#&]+)' IE_NAME = 'daum.net' @@ -36,6 +32,9 @@ class DaumIE(InfoExtractor): 'duration': 2117, 'view_count': int, 'comment_count': int, + 'uploader_id': 186139, + 'uploader': '콘간지', + 'timestamp': 1387310323, }, }, { 'url': 'http://m.tvpot.daum.net/v/65139429', @@ -44,11 +43,14 @@ class DaumIE(InfoExtractor): 'ext': 'mp4', 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', 'description': 'md5:79794514261164ff27e36a21ad229fc5', - 'upload_date': '20150604', + 'upload_date': '20150118', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 154, 'view_count': int, 'comment_count': int, + 'uploader': 'MBC 예능', + 'uploader_id': 132251, + 'timestamp': 1421604228, }, }, { 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', @@ -59,12 +61,15 @@ class DaumIE(InfoExtractor): 'id': 'vwIpVpCQsT8$', 'ext': 'flv', 'title': '01-Korean War ( Trouble on the horizon )', - 'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름', + 'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름', 'upload_date': '20080223', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 249, 'view_count': int, 'comment_count': int, + 'uploader': '까칠한 墮落始祖 황비홍님의', + 'uploader_id': 560824, + 'timestamp': 1203770745, }, }, { # Requires dte_type=WEB (#9972) @@ -73,60 +78,24 @@ class DaumIE(InfoExtractor): 'info_dict': { 'id': 's3794Uf1NZeZ1qMpGpeqeRU', 'ext': 'mp4', - 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', - 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', - 'upload_date': '20160611', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20170129', + 'uploader': '쇼! 
음악중심', + 'uploader_id': 2653210, + 'timestamp': 1485684628, }, }] def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - movie_data = self._download_json( - 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', - video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'}) - - # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid - if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): - return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) - - info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, - 'Downloading video info', query={'vid': video_id}) - - formats = [] - for format_el in movie_data['output_list']['output_list']: - profile = format_el['profile'] - format_query = compat_urllib_parse_urlencode({ - 'vid': video_id, - 'profile': profile, - }) - url_doc = self._download_xml( - 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, - video_id, note='Downloading video data for %s format' % profile) - format_url = url_doc.find('result/url').text - formats.append({ - 'url': format_url, - 'format_id': profile, - 'width': int_or_none(format_el.get('width')), - 'height': int_or_none(format_el.get('height')), - 'filesize': int_or_none(format_el.get('filesize')), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info.find('TITLE').text, - 'formats': formats, - 'thumbnail': xpath_text(info, 'THUMB_URL'), - 'description': xpath_text(info, 'CONTENTS'), - 'duration': int_or_none(xpath_text(info, 'DURATION')), - 'upload_date': info.find('REGDTTM').text[:8], - 'view_count': str_to_int(xpath_text(info, 'PLAY_CNT')), - 'comment_count': str_to_int(xpath_text(info, 'COMMENT_CNT')), - } + if not video_id.isdigit(): + video_id += '@my' + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) -class DaumClipIE(InfoExtractor): +class DaumClipIE(DaumBaseIE): _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P\d+)' IE_NAME = 'daum.net:clip' _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s' @@ -142,6 +111,9 @@ class DaumClipIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'duration': 3868, 'view_count': int, + 'uploader': 'GOMeXP', + 'uploader_id': 6667, + 'timestamp': 1377911092, }, }, { 'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', @@ -154,22 +126,8 @@ class DaumClipIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - clip_info = self._download_json( - 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?clipid=%s' % video_id, - video_id, 'Downloading clip info')['clip_bean'] - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'], - 'title': unescapeHTML(clip_info['title']), - 'thumbnail': clip_info.get('thumb_url'), - 'description': 
clip_info.get('contents'), - 'duration': int_or_none(clip_info.get('duration')), - 'upload_date': clip_info.get('up_date')[:8], - 'view_count': int_or_none(clip_info.get('play_count')), - 'ie_key': 'Daum', - } + return self.url_result( + self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) class DaumListIE(InfoExtractor): From e987ce4bda476a387937e4af5b46f4a412a67830 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 12:40:41 +0100 Subject: [PATCH 0141/1705] [kakao] remove raw request and extract format total bitrate --- youtube_dl/extractor/kakao.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 96f918b75..32935bb28 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -69,7 +69,8 @@ class KakaoIE(InfoExtractor): '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault']) + 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } impress = self._download_json( @@ -81,21 +82,14 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - tid = impress.get('tid', '') - - query.update({ - 'fields': '-*,outputList,profile,width,height,label,filesize', - 'tid': tid, - 'profile': 'HIGH', - }) - raw = self._download_json( - api_base + 'raw', display_id, 'Downloading video formats info', - query=query, headers=player_header) + query['tid'] = impress.get('tid', '') formats = [] - for fmt in raw.get('outputList', []): + for fmt in clip.get('videoOutputList', []): try: profile_name = fmt['profile'] + if profile_name == 'AUDIO': + continue query.update({ 'profile': profile_name, 'fields': '-*,url', @@ -115,7 +109,8 
@@ class KakaoIE(InfoExtractor): 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), 'format_note': fmt.get('label'), - 'filesize': int_or_none(fmt.get('filesize')) + 'filesize': int_or_none(fmt.get('filesize')), + 'tbr': int_or_none(fmt.get('kbps')), }) except KeyError: pass From 20cc7c082b82e82050a4e1f1bb815fee51f6c1c2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 16:36:35 +0100 Subject: [PATCH 0142/1705] [go90] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/go90.py | 149 ----------------------------- 2 files changed, 150 deletions(-) delete mode 100644 youtube_dl/extractor/go90.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5d20ba863..e9b59ce52 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -422,7 +422,6 @@ from .globo import ( GloboArticleIE, ) from .go import GoIE -from .go90 import Go90IE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py deleted file mode 100644 index c3ea717bc..000000000 --- a/youtube_dl/extractor/go90.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) - - -class Go90IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'https://www.go90.com/videos/84BUqjLpf9D', - 'md5': 'efa7670dbbbf21a7b07b360652b24a32', - 'info_dict': { - 'id': '84BUqjLpf9D', - 'ext': 'mp4', - 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention', - 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong 
anti-porn stance. Then, VICE Sports explains NFL contracts.', - 'timestamp': 1491868800, - 'upload_date': '20170411', - 'age_limit': 14, - } - }, { - 'url': 'https://www.go90.com/embed/261MflWkD3N', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - headers = self.geo_verification_headers() - headers.update({ - 'Content-Type': 'application/json; charset=utf-8', - }) - video_data = self._download_json( - 'https://www.go90.com/api/view/items/' + video_id, video_id, - headers=headers, data=b'{"client":"web","device_type":"pc"}') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - message = self._parse_json(e.cause.read().decode(), None)['error']['message'] - if 'region unavailable' in message: - self.raise_geo_restricted(countries=['US']) - raise ExtractorError(message, expected=True) - raise - - if video_data.get('requires_drm'): - raise ExtractorError('This video is DRM protected.', expected=True) - main_video_asset = video_data['main_video_asset'] - - episode_number = int_or_none(video_data.get('episode_number')) - series = None - season = None - season_id = None - season_number = None - for metadata in video_data.get('__children', {}).get('Item', {}).values(): - if metadata.get('type') == 'show': - series = metadata.get('title') - elif metadata.get('type') == 'season': - season = metadata.get('title') - season_id = metadata.get('id') - season_number = int_or_none(metadata.get('season_number')) - - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - thumbnails = [] - formats = [] - subtitles = {} - for asset in video_data.get('assets'): - if asset.get('id') == main_video_asset: - for source in asset.get('sources', []): - source_location = source.get('location') - if not source_location: - continue - source_type = source.get('type') - if source_type == 'hls': - 
m3u8_formats = self._extract_m3u8_formats( - source_location, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - for f in m3u8_formats: - mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url']) - if mobj: - height, tbr = mobj.groups() - height = int_or_none(height) - f.update({ - 'height': f.get('height') or height, - 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None), - 'tbr': f.get('tbr') or int_or_none(tbr), - }) - formats.extend(m3u8_formats) - elif source_type == 'dash': - formats.extend(self._extract_mpd_formats( - source_location, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': source.get('name'), - 'url': source_location, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('bitrate')), - }) - - for caption in asset.get('caption_metadata', []): - caption_url = caption.get('source_url') - if not caption_url: - continue - subtitles.setdefault(caption.get('language', 'en'), []).append({ - 'url': caption_url, - 'ext': determine_ext(caption_url, 'vtt'), - }) - elif asset.get('type') == 'image': - asset_location = asset.get('location') - if not asset_location: - continue - thumbnails.append({ - 'url': asset_location, - 'width': int_or_none(asset.get('width')), - 'height': int_or_none(asset.get('height')), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': video_data.get('short_description'), - 'like_count': int_or_none(video_data.get('like_count')), - 'timestamp': parse_iso8601(video_data.get('released_at')), - 'series': series, - 'episode': episode, - 'season': season, - 'season_id': season_id, - 'season_number': season_number, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'age_limit': parse_age_limit(video_data.get('rating')), - } From 152f22920d73bb0dc24fa357d5904a8dd97a5bf6 Mon Sep 17 00:00:00 2001 From: 
Remita Amine Date: Fri, 1 Nov 2019 17:44:34 +0100 Subject: [PATCH 0143/1705] [wistia] reduce embed extraction false positives and support inline embeds(closes #22931) --- youtube_dl/extractor/wistia.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index fa142b974..0fbc888ec 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -12,7 +12,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): - _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]+)' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]{10})' _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' @@ -43,25 +43,26 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + # https://wistia.com/support/embed-and-share/video-on-your-website @staticmethod def _extract_url(webpage): match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage) if match: return unescapeHTML(match.group('url')) - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) - if match: - return 'wistia:%s' % match.group('id') - match = re.search( r'''(?sx) ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
- ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]{10})\b.*?\2 ''', webpage) if match: return 'wistia:%s' % match.group('id') + match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) From 4c95fcf9e8fa2ed113698d13df55df4aaecd8433 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Nov 2019 21:16:47 +0100 Subject: [PATCH 0144/1705] [bambuser] remove extractor https://web.archive.org/web/20190808014227/https://go.bambuser.com/shutdown-announcement --- youtube_dl/extractor/bambuser.py | 142 ----------------------------- youtube_dl/extractor/extractors.py | 1 - 2 files changed, 143 deletions(-) delete mode 100644 youtube_dl/extractor/bambuser.py diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py deleted file mode 100644 index 4400ff9c1..000000000 --- a/youtube_dl/extractor/bambuser.py +++ /dev/null @@ -1,142 +0,0 @@ -from __future__ import unicode_literals - -import re -import itertools - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - sanitized_Request, - urlencode_postdata, -) - - -class BambuserIE(InfoExtractor): - IE_NAME = 'bambuser' - _VALID_URL = r'https?://bambuser\.com/v/(?P\d+)' - _API_KEY = '005f64509e19a868399060af746a00aa' - _LOGIN_URL = 'https://bambuser.com/user' - _NETRC_MACHINE = 'bambuser' - - _TEST = { - 'url': 'http://bambuser.com/v/4050584', - # MD5 seems to be flaky, see https://travis-ci.org/ytdl-org/youtube-dl/jobs/14051016#L388 - # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641', - 'info_dict': { - 'id': '4050584', - 'ext': 'flv', - 'title': 'Education engineering days - lightning talks', - 'duration': 3741, - 'uploader': 'pixelversity', - 'uploader_id': '344706', - 'timestamp': 1382976692, - 'upload_date': 
'20131028', - 'view_count': int, - }, - 'params': { - # It doesn't respect the 'Range' header, it would download the whole video - # caused the travis builds to fail: https://travis-ci.org/ytdl-org/youtube-dl/jobs/14493845#L59 - 'skip_download': True, - }, - } - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_form = { - 'form_id': 'user_login', - 'op': 'Log in', - 'name': username, - 'pass': password, - } - - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) - response = self._download_webpage( - request, None, 'Logging in') - - login_error = self._html_search_regex( - r'(?s)
    (.+?)
    ', - response, 'login error', default=None) - if login_error: - raise ExtractorError( - 'Unable to login: %s' % login_error, expected=True) - - def _real_initialize(self): - self._login() - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s' - % (self._API_KEY, video_id), video_id) - - error = info.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - result = info['result'] - - return { - 'id': video_id, - 'title': result['title'], - 'url': result['url'], - 'thumbnail': result.get('preview'), - 'duration': int_or_none(result.get('length')), - 'uploader': result.get('username'), - 'uploader_id': compat_str(result.get('owner', {}).get('uid')), - 'timestamp': int_or_none(result.get('created')), - 'fps': float_or_none(result.get('framerate')), - 'view_count': int_or_none(result.get('views_total')), - 'comment_count': int_or_none(result.get('comment_count')), - } - - -class BambuserChannelIE(InfoExtractor): - IE_NAME = 'bambuser:channel' - _VALID_URL = r'https?://bambuser\.com/channel/(?P.*?)(?:/|#|\?|$)' - # The maximum number we can get with each request - _STEP = 50 - _TEST = { - 'url': 'http://bambuser.com/channel/pixelversity', - 'info_dict': { - 'title': 'pixelversity', - }, - 'playlist_mincount': 60, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - urls = [] - last_id = '' - for i in itertools.count(1): - req_url = ( - 'http://bambuser.com/xhr-api/index.php?username={user}' - '&sort=created&access_mode=0%2C1%2C2&limit={count}' - '&method=broadcast&format=json&vid_older_than={last}' - ).format(user=user, count=self._STEP, last=last_id) - req = sanitized_Request(req_url) - # Without setting this header, we wouldn't get any result - req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) - data = self._download_json( 
- req, user, 'Downloading page %d' % i) - results = data['result'] - if not results: - break - last_id = results[-1]['vid'] - urls.extend(self.url_result(v['page'], 'Bambuser') for v in results) - - return { - '_type': 'playlist', - 'title': user, - 'entries': urls, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e9b59ce52..af3fff601 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -80,7 +80,6 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE -from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, From 836bfcb54e4d1664815ebffb753a9dc7c9c7d72c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 11:08:51 +0100 Subject: [PATCH 0145/1705] [flipagram] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/flipagram.py | 115 ----------------------------- 2 files changed, 116 deletions(-) delete mode 100644 youtube_dl/extractor/flipagram.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index af3fff601..33fb461a0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -355,7 +355,6 @@ from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .flickr import FlickrIE -from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py deleted file mode 100644 index b7be40f1b..000000000 --- a/youtube_dl/extractor/flipagram.py +++ /dev/null @@ -1,115 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - float_or_none, - try_get, 
- unified_timestamp, -) - - -class FlipagramIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://flipagram.com/f/nyvTSJMKId', - 'md5': '888dcf08b7ea671381f00fab74692755', - 'info_dict': { - 'id': 'nyvTSJMKId', - 'ext': 'mp4', - 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', - 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', - 'duration': 35.571, - 'timestamp': 1461244995, - 'upload_date': '20160421', - 'uploader': 'kitty juria', - 'uploader_id': 'sjuria101', - 'creator': 'kitty juria', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - 'comments': list, - 'formats': 'mincount:2', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), - video_id) - - flipagram = video_data['flipagram'] - video = flipagram['video'] - - json_ld = self._search_json_ld(webpage, video_id, default={}) - title = json_ld.get('title') or flipagram['captionText'] - description = json_ld.get('description') or flipagram.get('captionText') - - formats = [{ - 'url': video['url'], - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'filesize': int_or_none(video_data.get('size')), - }] - - preview_url = try_get( - flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) - if preview_url: - formats.append({ - 'url': preview_url, - 'ext': 'm4a', - 'vcodec': 'none', - }) - - self._sort_formats(formats) - - counts = flipagram.get('counts', {}) - user = flipagram.get('user', {}) - video_data = flipagram.get('video', {}) - - thumbnails = [{ - 'url': self._proto_relative_url(cover['url']), - 'width': int_or_none(cover.get('width')), - 'height': int_or_none(cover.get('height')), - 'filesize': int_or_none(cover.get('size')), - } for 
cover in flipagram.get('covers', []) if cover.get('url')] - - # Note that this only retrieves comments that are initially loaded. - # For videos with large amounts of comments, most won't be retrieved. - comments = [] - for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): - text = comment.get('comment') - if not text or not isinstance(text, list): - continue - comments.append({ - 'author': comment.get('user', {}).get('name'), - 'author_id': comment.get('user', {}).get('username'), - 'id': comment.get('id'), - 'text': text[0], - 'timestamp': unified_timestamp(comment.get('created')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': float_or_none(flipagram.get('duration'), 1000), - 'thumbnails': thumbnails, - 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), - 'uploader': user.get('name'), - 'uploader_id': user.get('username'), - 'creator': user.get('name'), - 'view_count': int_or_none(counts.get('plays')), - 'like_count': int_or_none(counts.get('likes')), - 'repost_count': int_or_none(counts.get('reflips')), - 'comment_count': int_or_none(counts.get('comments')), - 'comments': comments, - 'formats': formats, - } From 79b35e7c15f4a285525b5ec52035ff0f8fc6150d Mon Sep 17 00:00:00 2001 From: geditorit <52565706+geditorit@users.noreply.github.com> Date: Sat, 2 Nov 2019 18:32:49 +0700 Subject: [PATCH 0146/1705] [gameone] Remove extractor (#21778) --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/gameone.py | 134 ----------------------------- 2 files changed, 138 deletions(-) delete mode 100644 youtube_dl/extractor/gameone.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 33fb461a0..dce08e077 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -401,10 +401,6 @@ from .fusion import FusionIE from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import 
GameInformerIE -from .gameone import ( - GameOneIE, - GameOnePlaylistIE, -) from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py deleted file mode 100644 index a07d69841..000000000 --- a/youtube_dl/extractor/gameone.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - xpath_with_ns, - parse_iso8601, - float_or_none, - int_or_none, -) - -NAMESPACE_MAP = { - 'media': 'http://search.yahoo.com/mrss/', -} - -# URL prefix to download the mp4 files directly instead of streaming via rtmp -# Credits go to XBox-Maniac -# http://board.jdownloader.org/showpost.php?p=185835&postcount=31 -RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/' - - -class GameOneIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P\d+)' - _TESTS = [ - { - 'url': 'http://www.gameone.de/tv/288', - 'md5': '136656b7fb4c9cb4a8e2d500651c499b', - 'info_dict': { - 'id': '288', - 'ext': 'mp4', - 'title': 'Game One - Folge 288', - 'duration': 1238, - 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg', - 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1', - 'age_limit': 16, - 'upload_date': '20140513', - 'timestamp': 1399980122, - } - }, - { - 'url': 'http://gameone.de/tv/220', - 'md5': '5227ca74c4ae6b5f74c0510a7c48839e', - 'info_dict': { - 'id': '220', - 'ext': 'mp4', - 'upload_date': '20120918', - 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker', - 'timestamp': 1347971451, - 'title': 'Game One - Folge 220', - 'duration': 896.62, - 'age_limit': 16, - } - } - - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - og_video = 
self._og_search_video_url(webpage, secure=False) - description = self._html_search_meta('description', webpage) - age_limit = int( - self._search_regex( - r'age=(\d+)', - self._html_search_meta( - 'age-de-meta-label', - webpage), - 'age_limit', - '0')) - mrss_url = self._search_regex(r'mrss=([^&]+)', og_video, 'mrss') - - mrss = self._download_xml(mrss_url, video_id, 'Downloading mrss') - title = mrss.find('.//item/title').text - thumbnail = mrss.find('.//item/image').get('url') - timestamp = parse_iso8601(mrss.find('.//pubDate').text, delimiter=' ') - content = mrss.find(xpath_with_ns('.//media:content', NAMESPACE_MAP)) - content_url = content.get('url') - - content = self._download_xml( - content_url, - video_id, - 'Downloading media:content') - rendition_items = content.findall('.//rendition') - duration = float_or_none(rendition_items[0].get('duration')) - formats = [ - { - 'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text), - 'width': int_or_none(r.get('width')), - 'height': int_or_none(r.get('height')), - 'tbr': int_or_none(r.get('bitrate')), - } - for r in rendition_items - ] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'description': description, - 'age_limit': age_limit, - 'timestamp': timestamp, - } - - -class GameOnePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$' - IE_NAME = 'gameone:playlist' - _TEST = { - 'url': 'http://www.gameone.de/tv', - 'info_dict': { - 'title': 'GameOne', - }, - 'playlist_mincount': 294, - } - - def _real_extract(self, url): - webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') - max_id = max(map(int, re.findall(r' Date: Sat, 2 Nov 2019 13:09:44 +0100 Subject: [PATCH 0147/1705] [keek] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/keek.py | 39 ------------------------------ 2 files changed, 40 deletions(-) delete mode 
100644 youtube_dl/extractor/keek.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dce08e077..08facf8d3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -515,7 +515,6 @@ from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .kinopoisk import KinoPoiskIE -from .keek import KeekIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py deleted file mode 100644 index 94a03d277..000000000 --- a/youtube_dl/extractor/keek.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class KeekIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P\w+)' - IE_NAME = 'keek' - _TEST = { - 'url': 'https://www.keek.com/keek/NODfbab', - 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83', - 'info_dict': { - 'id': 'NODfbab', - 'ext': 'mp4', - 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896', - 'uploader': 'ytdl', - 'uploader_id': 'eGT5bab', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - return { - 'id': video_id, - 'url': self._og_search_video_url(webpage), - 'ext': 'mp4', - 'title': self._og_search_description(webpage).strip(), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._search_regex( - r'data-username=(["\'])(?P.+?)\1', webpage, - 'uploader', fatal=False, group='uploader'), - 'uploader_id': self._search_regex( - r'data-user-id=(["\'])(?P.+?)\1', webpage, - 'uploader id', fatal=False, group='uploader_id'), - } From 5e36b63486794750aca0ee6b9b83f27abf6332dc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 13:25:39 +0100 Subject: [PATCH 0148/1705] [iconosquare] remove extractor --- 
youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/iconosquare.py | 85 ----------------------------- 2 files changed, 86 deletions(-) delete mode 100644 youtube_dl/extractor/iconosquare.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 08facf8d3..dd5f68ca3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -455,7 +455,6 @@ from .hungama import ( HungamaSongIE, ) from .hypem import HypemIE -from .iconosquare import IconosquareIE from .ign import ( IGNIE, OneUPIE, diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py deleted file mode 100644 index a39f422e9..000000000 --- a/youtube_dl/extractor/iconosquare.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - get_element_by_id, - remove_end, -) - - -class IconosquareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P[^/]+)' - _TEST = { - 'url': 'http://statigr.am/p/522207370455279102_24101272', - 'md5': '6eb93b882a3ded7c378ee1d6884b1814', - 'info_dict': { - 'id': '522207370455279102_24101272', - 'ext': 'mp4', - 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)', - 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d', - 'timestamp': 1376471991, - 'upload_date': '20130814', - 'uploader': 'aguynamedpatrick', - 'uploader_id': '24101272', - 'comment_count': int, - 'like_count': int, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - media = self._parse_json( - get_element_by_id('mediaJson', webpage), - video_id) - - formats = [{ - 'url': f['url'], - 'format_id': format_id, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')) - } for format_id, f in media['videos'].items()] - self._sort_formats(formats) - - title = 
remove_end(self._og_search_title(webpage), ' - via Iconosquare') - - timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time')) - description = media.get('caption', {}).get('text') - - uploader = media.get('user', {}).get('username') - uploader_id = media.get('user', {}).get('id') - - comment_count = int_or_none(media.get('comments', {}).get('count')) - like_count = int_or_none(media.get('likes', {}).get('count')) - - thumbnails = [{ - 'url': t['url'], - 'id': thumbnail_id, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')) - } for thumbnail_id, t in media.get('images', {}).items()] - - comments = [{ - 'id': comment.get('id'), - 'text': comment['text'], - 'timestamp': int_or_none(comment.get('created_time')), - 'author': comment.get('from', {}).get('full_name'), - 'author_id': comment.get('from', {}).get('username'), - } for comment in media.get('comments', {}).get('data', []) if 'text' in comment] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'comment_count': comment_count, - 'like_count': like_count, - 'formats': formats, - 'comments': comments, - } From e54924c46fac6a9745868424dc14011da2572178 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 2 Nov 2019 18:13:31 +0100 Subject: [PATCH 0149/1705] [stv] fix extraction(closes #22928) --- youtube_dl/extractor/stv.py | 89 +++++++++++++------------------------ 1 file changed, 31 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py index ccb074cd4..bae8b71f4 100644 --- a/youtube_dl/extractor/stv.py +++ b/youtube_dl/extractor/stv.py @@ -4,15 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse -) from ..utils import ( - extract_attributes, + 
compat_str, float_or_none, int_or_none, - str_or_none, ) @@ -20,20 +15,20 @@ class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' _TEST = { - 'url': 'https://player.stv.tv/video/7srz/victoria/interview-with-the-cast-ahead-of-new-victoria/', - 'md5': '2ad867d4afd641fa14187596e0fbc91b', + 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', + 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { - 'id': '6016487034001', + 'id': '5333973339001', 'ext': 'mp4', - 'upload_date': '20190321', - 'title': 'Interview with the cast ahead of new Victoria', - 'description': 'Nell Hudson and Lily Travers tell us what to expect in the new season of Victoria.', - 'timestamp': 1553179628, + 'upload_date': '20170301', + 'title': '60 seconds on set with Laura Norton', + 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!", + 'timestamp': 1488388054, 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', } - _PUBLISHER_ID = '1486976045' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', 'video': 'shortform', @@ -41,54 +36,32 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) + resp = self._download_json( + 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), + video_id) - qs = compat_parse_qs(compat_urllib_parse_urlparse(self._search_regex( - r'itemprop="embedURL"[^>]+href="([^"]+)', - webpage, 'embed URL', default=None)).query) - publisher_id = qs.get('publisherID', [None])[0] or self._PUBLISHER_ID + result = resp['results'] + video = result['video'] + video_id = compat_str(video['id']) - player_attr = extract_attributes(self._search_regex( - 
r'(<[^>]+class="bcplayer"[^>]+>)', webpage, 'player', default=None)) or {} + subtitles = {} + _subtitles = result.get('_subtitles') or {} + for ext, sub_url in _subtitles.items(): + subtitles.setdefault('en', []).append({ + 'ext': 'vtt' if ext == 'webvtt' else ext, + 'url': sub_url, + }) - info = {} - duration = ref_id = series = video_id = None - api_ref_id = player_attr.get('data-player-api-refid') - if api_ref_id: - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], api_ref_id), - api_ref_id, fatal=False) - if resp: - result = resp.get('results') or {} - video = result.get('video') or {} - video_id = str_or_none(video.get('id')) - ref_id = video.get('guid') - duration = video.get('length') - programme = result.get('programme') or {} - series = programme.get('name') or programme.get('shortName') - subtitles = {} - _subtitles = result.get('_subtitles') or {} - for ext, sub_url in _subtitles.items(): - subtitles.setdefault('en', []).append({ - 'ext': 'vtt' if ext == 'webvtt' else ext, - 'url': sub_url, - }) - info.update({ - 'description': result.get('summary'), - 'subtitles': subtitles, - 'view_count': int_or_none(result.get('views')), - }) - if not video_id: - video_id = qs.get('videoId', [None])[0] or self._search_regex( - r' Date: Sat, 2 Nov 2019 22:33:51 +0100 Subject: [PATCH 0150/1705] [bellmedia] add support for marilyn.ca videos(#22193) --- youtube_dl/extractor/bellmedia.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index f36a2452d..485173774 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -22,7 +22,8 @@ class BellMediaIE(InfoExtractor): bravo| mtv| space| - etalk + etalk| + marilyn )\.ca| much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' @@ -70,6 +71,7 @@ class BellMediaIE(InfoExtractor): 'animalplanet': 'aniplan', 'etalk': 'ctv', 'bnnbloomberg': 'bnn', 
+ 'marilyn': 'ctv_marilyn', } def _real_extract(self, url): From 564275e26fc963fb920236e37c6c19e8e2b046f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Nov 2019 22:04:03 +0100 Subject: [PATCH 0151/1705] [telegraaf] fix extraction --- youtube_dl/extractor/telegraaf.py | 75 ++++++++++++++++++------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 0f576c1ab..2dc020537 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -4,21 +4,25 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, - remove_end, + int_or_none, + parse_iso8601, + try_get, ) class TelegraafIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P\d+)/[^/]+\.html' + _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/video/(?P\d+)' _TEST = { - 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', + 'url': 'https://www.telegraaf.nl/video/734366489/historisch-scheepswrak-slaat-na-100-jaar-los', 'info_dict': { - 'id': '24353229', + 'id': 'gaMItuoSeUg2', 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, + 'title': 'Historisch scheepswrak slaat na 100 jaar los', + 'description': 'md5:6f53b7c4f55596722ac24d6c0ec00cfb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 55, + 'timestamp': 1572805527, + 'upload_date': '20191103', }, 'params': { # m3u8 download @@ -27,23 +31,30 @@ class TelegraafIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + article_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_id = self._download_json( + 'https://www.telegraaf.nl/graphql', article_id, query={ + 'query': '''{ + article(uid: %s) { + videos { + videoId + } 
+ } +}''' % article_id, + })['data']['article']['videos'][0]['videoId'] - player_url = self._html_search_regex( - r']+src="([^"]+")', webpage, 'player URL') - player_page = self._download_webpage( - player_url, video_id, note='Download player webpage') - playlist_url = self._search_regex( - r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') - playlist_data = self._download_json(playlist_url, video_id) + item = self._download_json( + 'https://content.tmgvideo.nl/playlist/item=%s/playlist.json' % video_id, + video_id)['items'][0] + title = item['title'] - item = playlist_data['items'][0] formats = [] - locations = item['locations'] + locations = item.get('locations') or {} for location in locations.get('adaptive', []): - manifest_url = location['src'] + manifest_url = location.get('src') + if not manifest_url: + continue ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -54,25 +65,25 @@ class TelegraafIE(InfoExtractor): else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): + src = try_get(location, lambda x: x['sources'][0]['src']) + if not src: + continue + label = location.get('label') formats.append({ - 'url': location['sources'][0]['src'], - 'width': location.get('width'), - 'height': location.get('height'), - 'format_id': 'http-%s' % location['label'], + 'url': src, + 'width': int_or_none(location.get('width')), + 'height': int_or_none(location.get('height')), + 'format_id': 'http' + ('-%s' % label if label else ''), }) self._sort_formats(formats) - title = remove_end(self._og_search_title(webpage), ' - VIDEO') - description = self._og_search_description(webpage) - duration = item.get('duration') - thumbnail = item.get('poster') - return { 'id': video_id, 'title': title, - 'description': description, + 'description': item.get('description'), 'formats': formats, - 'duration': duration, - 'thumbnail': thumbnail, + 'duration': 
int_or_none(item.get('duration')), + 'thumbnail': item.get('poster'), + 'timestamp': parse_iso8601(item.get('datecreated'), ' '), } From a6e6673e825f6225c3a316b164ddca03fd20b5d2 Mon Sep 17 00:00:00 2001 From: Manu Cornet Date: Sun, 3 Nov 2019 21:23:27 +0000 Subject: [PATCH 0152/1705] [README.md] Also read permission to the binary in how to update section (#22903) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c39b13616..01f975958 100644 --- a/README.md +++ b/README.md @@ -752,8 +752,8 @@ As a last resort, you can also uninstall the version installed by your package m Afterwards, simply follow [our manual installation instructions](https://ytdl-org.github.io/youtube-dl/download.html): ``` -sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl -sudo chmod a+x /usr/local/bin/youtube-dl +sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl +sudo chmod a+rx /usr/local/bin/youtube-dl hash -r ``` From ef382405c6dc79d2b7e3f81a527232941e2c0b2d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 02:01:01 +0100 Subject: [PATCH 0153/1705] [mediaset] extract unprotected M3U and MPD manifests(closes #17204) --- youtube_dl/extractor/mediaset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index df3748798..fcbc064ff 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -62,7 +62,6 @@ class MediasetIE(ThePlatformBaseIE): 'uploader': 'Canale 5', 'uploader_id': 'C5', }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', @@ -109,6 +108,11 @@ class MediasetIE(ThePlatformBaseIE): entries.append(embed_url) return entries + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, 
f4m_params=None, transform_rtmp_url=None): + for video in smil.findall(self._xpath_ns('.//video', namespace)): + video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) + return super()._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + def _real_extract(self, url): guid = self._match_id(url) tp_path = 'PR1GhC/media/guid/2702976343/' + guid @@ -118,14 +122,15 @@ class MediasetIE(ThePlatformBaseIE): subtitles = {} first_e = None for asset_type in ('SD', 'HD'): - for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + # TODO: fixup ISM+none manifest URLs + for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): try: tp_formats, tp_subtitles = self._extract_theplatform_smil( update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { 'mbr': 'true', 'formats': f, 'assetTypes': asset_type, - }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type)) except ExtractorError as e: if not first_e: first_e = e From bf45295c5387d0d90b97ca34d377cdaa07c71bcb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 11:13:14 +0100 Subject: [PATCH 0154/1705] [mediaset] relax URL guid matching(closes #18352) --- youtube_dl/extractor/mediaset.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index fcbc064ff..f976506f4 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -27,7 +27,7 @@ class MediasetIE(ThePlatformBaseIE): (?:video|on-demand)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) - )(?P[0-9A-Z]{16}) + )(?P[0-9A-Z]{16,}) ''' _TESTS = [{ # full episode @@ -77,6 +77,18 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, + }, { + 'url': 
'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01', + 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', + 'only_matching': True, }] @staticmethod From e452345fc5cee5e79d2cad6be575da563987a4ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Nov 2019 15:43:52 +0100 Subject: [PATCH 0155/1705] [jamendo] improve extraction - fix album extraction(closes #18564) - improve metadata extraction(closes #18565)(closes #21379) --- youtube_dl/extractor/jamendo.py | 162 +++++++++++++++++++------------- 1 file changed, 99 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index c21827618..12e21eb6f 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -1,38 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import hashlib +import random -from ..compat import compat_urlparse +from ..compat import compat_str from .common import InfoExtractor -from ..utils import parse_duration +from ..utils import ( + clean_html, + int_or_none, + try_get, +) -class JamendoBaseIE(InfoExtractor): - def _extract_meta(self, webpage, fatal=True): - title = self._og_search_title( - webpage, default=None) or self._search_regex( - r'([^<]+)', webpage, - 'title', default=None) - if title: - title = self._search_regex( - r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None) - if not title: - title = self._html_search_meta( - 'name', webpage, 
'title', fatal=fatal) - mobj = re.search(r'(.+) - (.+)', title or '') - artist, second = mobj.groups() if mobj else [None] * 2 - return title, artist, second - - -class JamendoIE(JamendoBaseIE): +class JamendoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: licensing\.jamendo\.com/[^/]+| (?:www\.)?jamendo\.com ) - /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) + /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))? ''' _TESTS = [{ 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', @@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE): 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, - 'thumbnail': r're:^https?://.*\.jpg' + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1217438117, + 'upload_date': '20080730', } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', @@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE): }] def _real_extract(self, url): - mobj = self._VALID_URL_RE.match(url) - track_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage( - 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), - display_id) - - title, artist, track = self._extract_meta(webpage) + track_id, display_id = self._VALID_URL_RE.match(url).groups() + webpage = self._download_webpage(url, track_id) + models = self._parse_json(self._html_search_regex( + r"data-bundled-models='([^']+)", + webpage, 'bundled models'), track_id) + track = models['track']['models'][0] + title = track_name = track['name'] + get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + artist = get_model('artist') + artist_name = artist.get('name') + if artist_name: + title = '%s - %s' % (artist_name, title) + album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE): ))] self._sort_formats(formats) - thumbnail = self._html_search_meta( - 'image', webpage, 
'thumbnail', fatal=False) - duration = parse_duration(self._search_regex( - r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', - webpage, 'duration', fatal=False)) + urls = [] + thumbnails = [] + for _, covers in track.get('cover', {}).items(): + for cover_id, cover_url in covers.items(): + if not cover_url or cover_url in urls: + continue + urls.append(cover_url) + size = int_or_none(cover_id.lstrip('size')) + thumbnails.append({ + 'id': cover_id, + 'url': cover_url, + 'width': size, + 'height': size, + }) + + tags = [] + for tag in track.get('tags', []): + tag_name = tag.get('name') + if not tag_name: + continue + tags.append(tag_name) + + stats = track.get('stats') or {} return { 'id': track_id, 'display_id': display_id, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'title': title, - 'duration': duration, - 'artist': artist, - 'track': track, - 'formats': formats + 'description': track.get('description'), + 'duration': int_or_none(track.get('duration')), + 'artist': artist_name, + 'track': track_name, + 'album': album.get('name'), + 'formats': formats, + 'license': '-'.join(track.get('licenseCC', [])) or None, + 'timestamp': int_or_none(track.get('dateCreated')), + 'view_count': int_or_none(stats.get('listenedAll')), + 'like_count': int_or_none(stats.get('favorited')), + 'average_rating': int_or_none(stats.get('averageNote')), + 'tags': tags, } -class JamendoAlbumIE(JamendoBaseIE): - _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' +class JamendoAlbumIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)' _TEST = { 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', - 'title': 'Shearer - Duck On Cover' + 'title': 'Duck On Cover', + 'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239', }, 'playlist': [{ 'md5': 'e1a2fcb42bda30dfac990212924149a8', @@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE): 'title': 'Shearer - 
Warmachine', 'artist': 'Shearer', 'track': 'Warmachine', + 'timestamp': 1368089771, + 'upload_date': '20130509', } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', @@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE): 'title': 'Shearer - Without Your Ghost', 'artist': 'Shearer', 'track': 'Without Your Ghost', + 'timestamp': 1368089771, + 'upload_date': '20130509', } }], 'params': { @@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE): } } + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): - mobj = self._VALID_URL_RE.match(url) - album_id = mobj.group('id') + album_id = self._match_id(url) + album = self._call_api('album', album_id) + album_name = album.get('name') - webpage = self._download_webpage(url, mobj.group('display_id')) + entries = [] + for track in album.get('tracks', []): + track_id = track.get('id') + if not track_id: + continue + track_id = compat_str(track_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'https://www.jamendo.com/track/' + track_id, + 'ie_key': JamendoIE.ie_key(), + 'id': track_id, + 'album': album_name, + }) - title, artist, album = self._extract_meta(webpage, fatal=False) - - entries = [{ - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, m.group('path')), - 'ie_key': JamendoIE.ie_key(), - 'id': self._search_regex( - r'/track/(\d+)', m.group('path'), 'track id', default=None), - 'artist': artist, - 'album': album, - } for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', - webpage)] - - return self.playlist_result(entries, album_id, title) + return self.playlist_result( + entries, album_id, album_name, + 
clean_html(try_get(album, lambda x: x['description']['en'], compat_str))) From 2349255abdf822e0bb9508d510db926cae777f8c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 15:51:44 +0100 Subject: [PATCH 0156/1705] [jamendo] restore track url modification --- youtube_dl/extractor/jamendo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 12e21eb6f..490efa8fb 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -44,7 +44,8 @@ class JamendoIE(InfoExtractor): def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage(url, track_id) + webpage = self._download_webpage( + 'https://www.jamendo.com/track/' + track_id, track_id) models = self._parse_json(self._html_search_regex( r"data-bundled-models='([^']+)", webpage, 'bundled models'), track_id) From 3e4908360417bc29e1446bfa85145193fa2c8462 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 20:05:27 +0100 Subject: [PATCH 0157/1705] [myspass] fix video URL extraction and improve metadata extraction(closes #22448) --- youtube_dl/extractor/myspass.py | 75 +++++++++++++-------------------- 1 file changed, 29 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 2afe535b5..db7ebc94c 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,73 +1,56 @@ +# coding: utf-8 from __future__ import unicode_literals -import os.path + +import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( - ExtractorError, + int_or_none, + parse_duration, + xpath_text, ) class MySpassIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?myspass\.de/.*' + _VALID_URL = 
r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', 'info_dict': { 'id': '11741', 'ext': 'mp4', - 'description': 'Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?', - 'title': 'Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2', + 'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?', + 'title': '17.02.2013 - Die Highlights, Teil 2', }, } def _real_extract(self, url): - META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' + video_id = self._match_id(url) - # video id is the last path element of the URL - # usually there is a trailing slash, so also try the second but last - url_path = compat_urllib_parse_urlparse(url).path - url_parent_path, video_id = os.path.split(url_path) - if not video_id: - _, video_id = os.path.split(url_parent_path) - - # get metadata - metadata_url = META_DATA_URL_TEMPLATE % video_id metadata = self._download_xml( - metadata_url, video_id, transform_source=lambda s: s.strip()) + 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id, + video_id) - # extract values from metadata - url_flv_el = metadata.find('url_flv') - if url_flv_el is None: - raise ExtractorError('Unable to extract download url') - video_url = url_flv_el.text - title_el = metadata.find('title') - if title_el is None: - raise ExtractorError('Unable to extract title') - title = title_el.text - format_id_el = metadata.find('format_id') - 
if format_id_el is None: - format = 'mp4' - else: - format = format_id_el.text - description_el = metadata.find('description') - if description_el is not None: - description = description_el.text - else: - description = None - imagePreview_el = metadata.find('imagePreview') - if imagePreview_el is not None: - thumbnail = imagePreview_el.text - else: - thumbnail = None + title = xpath_text(metadata, 'title', fatal=True) + video_url = xpath_text(metadata, 'url_flv', 'download url', True) + video_id_int = int(video_id) + for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + group_int = int(group) + if group_int > video_id_int: + video_url = video_url.replace( + group, compat_str(group_int // video_id_int)) return { 'id': video_id, 'url': video_url, 'title': title, - 'format': format, - 'thumbnail': thumbnail, - 'description': description, + 'thumbnail': xpath_text(metadata, 'imagePreview'), + 'description': xpath_text(metadata, 'description'), + 'duration': parse_duration(xpath_text(metadata, 'duration')), + 'series': xpath_text(metadata, 'format'), + 'season_number': int_or_none(xpath_text(metadata, 'season')), + 'season_id': xpath_text(metadata, 'season_id'), + 'episode': title, + 'episode_number': int_or_none(xpath_text(metadata, 'episode')), } From c69e71733d9619cb1a2bee769b9a381b52901de3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 4 Nov 2019 22:21:00 +0100 Subject: [PATCH 0158/1705] [msn] add support for Vidible and AOL embeds(closes #22195)(closes #22227) --- youtube_dl/extractor/msn.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 0460cf4d5..0c3813dda 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -41,6 +41,14 @@ class MSNIE(InfoExtractor): }, { 'url': 
'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', 'only_matching': True, + }, { + # Vidible(AOL) Embed + 'url': 'https://www.msn.com/en-us/video/animals/yellowstone-park-staffers-catch-deer-engaged-in-behavior-they-cant-explain/vi-AAGfdg1', + 'only_matching': True, + }, { + # Dailymotion Embed + 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L', + 'only_matching': True, }] def _real_extract(self, url): @@ -61,6 +69,18 @@ class MSNIE(InfoExtractor): webpage, 'error', group='error')) raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + player_name = video.get('playerName') + if player_name: + provider_id = video.get('providerId') + if provider_id: + if player_name == 'AOL': + return self.url_result( + 'aol-video:' + provider_id, 'Aol', provider_id) + elif player_name == 'Dailymotion': + return self.url_result( + 'https://www.dailymotion.com/video/' + provider_id, + 'Dailymotion', provider_id) + title = video['title'] formats = [] From 20218040db2b1e063191cc470ce403d35d394e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:21:16 +0700 Subject: [PATCH 0159/1705] [scte] Add extractor (closes #22975) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/scte.py | 144 +++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 youtube_dl/extractor/scte.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dd5f68ca3..9f43b284d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -980,6 +980,10 @@ from .sbs import SBSIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .scrippsnetworks import ScrippsNetworksWatchIE +from .scte import ( + SCTEIE, 
+ SCTECourseIE, +) from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE diff --git a/youtube_dl/extractor/scte.py b/youtube_dl/extractor/scte.py new file mode 100644 index 000000000..ca1de63b6 --- /dev/null +++ b/youtube_dl/extractor/scte.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + urlencode_postdata, +) + + +class SCTEBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx' + _NETRC_MACHINE = 'scte' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_popup = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']welcome\b', r'>Sign Out<')) + + # already logged in + if is_logged(login_popup): + return + + login_form = self._hidden_inputs(login_popup) + + login_form.update({ + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on', + }) + + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + if '|pageRedirect|' not in response and not is_logged(response): + error = self._html_search_regex( + r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class SCTEIE(SCTEBaseIE): + _VALID_URL = 
r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484', + 'info_dict': { + 'title': 'Introduction to DOCSIS Engineering Professional', + 'id': '31484', + }, + 'playlist_count': 5, + 'skip': 'Requires account credentials', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title') + + context_id = self._search_regex(r'context-(\d+)', webpage, video_id) + content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id + context = decode_packed_codes(self._download_webpage( + '%smobile/data.js' % content_base, video_id)) + + data = self._parse_xml( + self._search_regex( + r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"), + video_id) + + entries = [] + for asset in data.findall('.//asset'): + asset_url = asset.get('url') + if not asset_url or not asset_url.endswith('.mp4'): + continue + asset_id = self._search_regex( + r'video_([^_]+)_', asset_url, 'asset id', default=None) + if not asset_id: + continue + entries.append({ + 'id': asset_id, + 'title': title, + 'url': content_base + asset_url, + }) + + return self.playlist_result(entries, video_id, title) + + +class SCTECourseIE(SCTEBaseIE): + _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3639', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3073', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + title = self._search_regex( + r'<h1>(.+?)</h1>', webpage, 'title', default=None) + + entries = [] 
+ for mobj in re.finditer( + r'''(?x) + <a[^>]+ + href=(["\']) + (?P<url> + https?://learning\.scte\.org/mod/ + (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*? + \bid=\d+ + ) + ''', + webpage): + item_url = mobj.group('url') + if item_url == url: + continue + ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm' + else SCTECourseIE.ie_key()) + entries.append(self.url_result(item_url, ie=ie)) + + return self.playlist_result(entries, course_id, title) From 1a4e4b0bfeb83b24755f80630d1e7f3427a5bf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:31:40 +0700 Subject: [PATCH 0160/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/ChangeLog b/ChangeLog index fcab1102c..338dd456b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,47 @@ +version <unreleased> + +Extractors ++ [scte] Add support for learning.scte.org (#22975) ++ [msn] Add support for Vidible and AOL embeds (#22195, #22227) +* [myspass] Fix video URL extraction and improve metadata extraction (#22448) +* [jamendo] Improve extraction + * Fix album extraction (#18564) + * Improve metadata extraction (#18565, #21379) +* [mediaset] Relax URL guid matching (#18352) ++ [mediaset] Extract unprotected M3U and MPD manifests (#17204) +* [telegraaf] Fix extraction ++ [bellmedia] Add support for marilyn.ca videos (#22193) +* [stv] Fix extraction (#22928) +- [iconosquare] Remove extractor +- [keek] Remove extractor +- [gameone] Remove extractor (#21778) +- [flipagram] Remove extractor +- [bambuser] Remove extractor +* [wistia] Reduce embed extraction false positives ++ [wistia] Add support for inline embeds (#22931) +- [go90] Remove extractor +* [kakao] Remove raw request ++ [kakao] Extract format total bitrate +* [daum] Fix VOD and Clip extracton (#15015) +* [kakao] Improve extraction + + Add support for embed URLs + + Add support for Kakao Legacy vid based embed 
URLs + * Only extract fields used for extraction + * Strip description and extract tags +* [mixcloud] Fix cloudcast data extraction (#22821) +* [yahoo] Improve extraction + + Add support for live streams (#3597, #3779, #22178) + * Bypass cookie consent page for european domains (#16948, #22576) + + Add generic support for embeds (#20332) +* [tv2] Fix and improve extraction (#22787) ++ [tv2dk] Add support for TV2 DK sites +* [onet] Improve extraction … + + Add support for onet100.vod.pl + + Extract m3u8 formats + * Correct audio only format info +* [fox9] Fix extraction + + version 2019.10.29 Core From ea07412ebf6fff7c17bcac9960cfe4e92ed62f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Nov 2019 05:32:56 +0700 Subject: [PATCH 0161/1705] release 2019.11.05 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 12 ++++-------- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 22 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f82502bd1..12de9add2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.29 + [debug] youtube-dl version 2019.11.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5ef983d43..8a6202cf6 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version 
of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 8f05aa79f..83f91d5fe 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. 
Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index e90900d8d..be8e70f1e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. 
- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.10.29 + [debug] youtube-dl version 2019.11.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 7021d7397..7544d171c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.29. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.05. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.10.29** +- [ ] I've verified that I'm running youtube-dl version **2019.11.05** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 338dd456b..d46d20082 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.11.05 Extractors + [scte] Add support for learning.scte.org (#22975) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index af905db5a..536b87479 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -76,8 +76,6 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 - - **bambuser** - - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -284,12 +282,12 @@ - **FiveThirtyEight** - **FiveTV** - **Flickr** - - **Flipagram** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Formula1** - **FOX** - **FOX9** + - **FOX9News** - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** @@ -315,8 +313,6 @@ - **FXNetworks** - **Gaia** - **GameInformer** - - **GameOne** - - **gameone:playlist** - **GameSpot** - **GameStar** - **Gaskrank** @@ -331,7 +327,6 @@ - **Globo** - **GloboArticle** - **Go** - - **Go90** - **GodTube** - **Golem** - **GoogleDrive** @@ -366,7 +361,6 @@ - **Hungama** - **HungamaSong** - **Hypem** - - **Iconosquare** - **ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists @@ -406,7 +400,6 @@ - **Kankan** - **Karaoketv** - **KarriereVideos** - - **keek** - **KeezMovies** - **Ketnet** - 
**KhanAcademy** @@ -777,6 +770,8 @@ - **Screencast** - **ScreencastOMatic** - **scrippsnetworks:watch** + - **SCTE** + - **SCTECourse** - **Seeker** - **SenateISVP** - **SendtoNews** @@ -926,6 +921,7 @@ - **TV2** - **tv2.hu** - **TV2Article** + - **TV2DK** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 924f26ca8..8012a66db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.10.29' +__version__ = '2019.11.05' From e9b95167af3f9cacd16e379a40bacb27999840b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 10:03:38 +0100 Subject: [PATCH 0162/1705] [roosterteeth] fix login request(closes #16094)(closes #22689) --- youtube_dl/extractor/roosterteeth.py | 55 +++++++++++----------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8d88ee499..8883639b2 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -18,7 +16,6 @@ from ..utils import ( class RoosterTeethIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' - _LOGIN_URL = 'https://roosterteeth.com/login' _NETRC_MACHINE = 'roosterteeth' _TESTS = [{ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', @@ -53,48 +50,40 @@ class RoosterTeethIE(InfoExtractor): 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'only_matching': True, }] + _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/' def _login(self): username, 
password = self._get_login_info() if username is None: return - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='Unable to download login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, - }) - - login_request = self._download_webpage( - self._LOGIN_URL, None, - note='Logging in', - data=urlencode_postdata(login_form), - headers={ - 'Referer': self._LOGIN_URL, - }) - - if not any(re.search(p, login_request) for p in ( - r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', - r'>Sign Out<')): - error = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', - login_request, 'alert', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) def _real_initialize(self): + if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): + return self._login() def _real_extract(self, url): display_id = self._match_id(url) - api_episode_url = 'https://svod-be.roosterteeth.com/api/v1/episodes/%s' % display_id + api_episode_url = self._EPISODE_BASE_URL + display_id try: m3u8_url = self._download_json( From 
b77c3949e899902de78b140f6e444dc55bac824f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 14:04:17 +0100 Subject: [PATCH 0163/1705] [patreon] minimize reponse size and extract uploader_id and filesize --- youtube_dl/extractor/patreon.py | 52 +++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 426dd8121..761a4b1de 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -6,7 +6,11 @@ from ..utils import ( clean_html, determine_ext, int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, parse_iso8601, + str_or_none, + try_get, ) @@ -24,6 +28,7 @@ class PatreonIE(InfoExtractor): 'thumbnail': 're:^https?://.*$', 'timestamp': 1406473987, 'upload_date': '20140727', + 'uploader_id': '87145', }, }, { 'url': 'http://www.patreon.com/creation?hid=754133', @@ -90,7 +95,13 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) post = self._download_json( - 'https://www.patreon.com/api/posts/' + video_id, video_id) + 'https://www.patreon.com/api/posts/' + video_id, video_id, query={ + 'fields[media]': 'download_url,mimetype,size_bytes', + 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title', + 'fields[user]': 'full_name,url', + 'json-api-use-default-includes': 'false', + 'include': 'media,user', + }) attributes = post['data']['attributes'] title = attributes['title'].strip() image = attributes.get('image') or {} @@ -104,33 +115,42 @@ class PatreonIE(InfoExtractor): 'comment_count': int_or_none(attributes.get('comment_count')), } - def add_file(file_data): - file_url = file_data.get('url') - if file_url: - info.update({ - 'url': file_url, - 'ext': determine_ext(file_data.get('name'), 'mp3'), - }) - for i in post.get('included', []): i_type = i.get('type') - if i_type == 'attachment': - add_file(i.get('attributes') or {}) + 
if i_type == 'media': + media_attributes = i.get('attributes') or {} + download_url = media_attributes.get('download_url') + ext = mimetype2ext(media_attributes.get('mimetype')) + if download_url and ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'filesize': int_or_none(media_attributes.get('size_bytes')), + 'url': download_url, + }) elif i_type == 'user': user_attributes = i.get('attributes') if user_attributes: info.update({ 'uploader': user_attributes.get('full_name'), + 'uploader_id': str_or_none(i.get('id')), 'uploader_url': user_attributes.get('url'), }) if not info.get('url'): - add_file(attributes.get('post_file') or {}) + embed_url = try_get(attributes, lambda x: x['embed']['url']) + if embed_url: + info.update({ + '_type': 'url', + 'url': embed_url, + }) if not info.get('url'): - info.update({ - '_type': 'url', - 'url': attributes['embed']['url'], - }) + post_file = attributes['post_file'] + ext = determine_ext(post_file.get('name')) + if ext in KNOWN_EXTENSIONS: + info.update({ + 'ext': ext, + 'url': post_file['url'], + }) return info From 2318629b2b79cad5fcab743bce86233a7592ed46 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 14:04:50 +0100 Subject: [PATCH 0164/1705] [dplay] minimize response size --- youtube_dl/extractor/dplay.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index d9c3d59cd..a7b9db568 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -146,6 +146,11 @@ class DPlayIE(InfoExtractor): video = self._download_json( disco_base + 'content/videos/' + display_id, display_id, headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', 'include': 'images,primaryChannel,show,tags' }) video_id = 
video['data']['id'] @@ -226,7 +231,6 @@ class DPlayIE(InfoExtractor): 'series': series, 'season_number': int_or_none(info.get('seasonNumber')), 'episode_number': int_or_none(info.get('episodeNumber')), - 'age_limit': int_or_none(info.get('minimum_age')), 'creator': creator, 'tags': tags, 'thumbnails': thumbnails, From b6139cb0c3635eb96e39973ab288c17a9f104067 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 22:56:25 +0100 Subject: [PATCH 0165/1705] [common] pass headers to _extract_(m3u8|mpd)_formats methods --- youtube_dl/extractor/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 50d48c40d..2688b19e4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1586,12 +1586,12 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): + fatal=True, live=False, headers=None): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal) + fatal=fatal, headers=headers) if res is False: return [] @@ -2009,12 +2009,12 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers=None): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal) + fatal=fatal, headers=None) if res is False: return [] mpd_doc, urlh = res From d7def23d0539430f5d816f1cfd733e436f62c257 Mon Sep 17 00:00:00 2001 From: Remita Amine 
<remitamine@gmail.com> Date: Tue, 5 Nov 2019 23:08:42 +0100 Subject: [PATCH 0166/1705] [hotstar] pass Referer header to format requests(closes #22836) --- youtube_dl/extractor/hotstar.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index f9f7c5a64..f97eefa3d 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -118,6 +118,7 @@ class HotStarIE(HotStarBaseIE): if video_data.get('drmProtected'): raise ExtractorError('This video is DRM protected.', expected=True) + headers = {'Referer': url} formats = [] geo_restricted = False playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets'] @@ -137,10 +138,11 @@ class HotStarIE(HotStarBaseIE): if 'package:hls' in tags or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls')) + entry_protocol='m3u8_native', + m3u8_id='hls', headers=headers)) elif 'package:dash' in tags or ext == 'mpd': formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash')) + format_url, video_id, mpd_id='dash', headers=headers)) elif ext == 'f4m': # produce broken files pass @@ -158,6 +160,9 @@ class HotStarIE(HotStarBaseIE): self.raise_geo_restricted(countries=['IN']) self._sort_formats(formats) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + return { 'id': video_id, 'title': title, From 57033e35e58e1d57ab3be5ffe5df5a80a5dbcf83 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 5 Nov 2019 23:41:57 +0100 Subject: [PATCH 0167/1705] [common] fix typo --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2688b19e4..1e6b66d25 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2014,7 +2014,7 @@ class 
InfoExtractor(object): mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal, headers=None) + fatal=fatal, headers=headers) if res is False: return [] mpd_doc, urlh = res From 3ec86619e33a3d1e29c14ec053d7e420ac8b62ae Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 07:18:29 +0100 Subject: [PATCH 0168/1705] [common] initialize headers param with empty dict --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1e6b66d25..4a683f6d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1586,7 +1586,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, headers=None): + fatal=True, live=False, headers={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -2009,7 +2009,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers=None): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', From d64ec1242e9dec03ea2aa86b6e913db78c8619e0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 10:44:19 +0100 Subject: [PATCH 0169/1705] [onionstudios] fix extraction --- youtube_dl/extractor/onionstudios.py | 78 ++++++++++++++++------------ 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index c6e3d5640..7f8c6f0d3 
100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -5,10 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, + compat_str, int_or_none, - float_or_none, - mimetype2ext, + js_to_json, + parse_iso8601, + try_get, ) @@ -17,14 +18,16 @@ class OnionStudiosIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': '719d1f8c32094b8c33902c17bcae5e34', + 'md5': '5a118d466d62b5cd03647cf2c593977f', 'info_dict': { 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', + 'description': 'md5:545299bda6abf87e5ec666548c6a9448', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'The A.V. Club', - 'uploader_id': 'the-av-club', + 'uploader': 'a.v. club', + 'upload_date': '20150619', + 'timestamp': 1434728546, }, }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -44,38 +47,49 @@ class OnionStudiosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://onionstudios.com/embed/dc94dc2899fe644c0e7241fa04c1b732.js', + video_id) + mcp_id = compat_str(self._parse_json(self._search_regex( + r'window\.mcpMapping\s*=\s*({.+?});', webpage, + 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) video_data = self._download_json( - 'http://www.onionstudios.com/video/%s.json' % video_id, video_id) - - title = video_data['title'] - + 'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, + mcp_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + mcp_id, query={'mcpids': mcp_id})['data'][0] formats = [] - for source in video_data.get('sources', 
[]): - source_url = source.get('url') - if not source_url: - continue - ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - tbr = int_or_none(source.get('bitrate')) - formats.append({ - 'format_id': ext + ('-%d' % tbr if tbr else ''), - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'tbr': tbr, - 'ext': ext, - }) + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, mcp_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': video_data.get('poster_url'), - 'uploader': video_data.get('channel_name'), - 'uploader_id': video_data.get('channel_slug'), - 'duration': float_or_none(video_data.get('duration', 1000)), - 'tags': video_data.get('tags'), + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), } From 55adb63e5412fa5556be22e97d61b8d27c7a5e67 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 19:56:10 +0100 Subject: [PATCH 0170/1705] [kinja] add support for Kinja embeds closes #5756 closes #11282 closes #22237 closes #22384 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 17 ++- youtube_dl/extractor/kinja.py | 221 
+++++++++++++++++++++++++++ youtube_dl/extractor/onionstudios.py | 54 +------ 4 files changed, 241 insertions(+), 52 deletions(-) create mode 100644 youtube_dl/extractor/kinja.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9f43b284d..9e3b554fa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -513,6 +513,7 @@ from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import KonserthusetPlayIE from .kontrtube import KontrTubeIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c0780e98..3d919f656 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE class GenericIE(InfoExtractor): @@ -1487,16 +1488,18 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # OnionStudios embed + # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'info_dict': { - 'id': '2855', + 'id': '106351', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? 
This Man Will Mumble An Explanation At You', + 'description': 'Migrated from OnionStudios', 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', + 'uploader': 'clickhole', + 'upload_date': '20150527', + 'timestamp': 1432744860, } }, # SnagFilms embed @@ -2894,6 +2897,12 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') + # Look for Kinja embeds + kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) + if kinja_embed_urls: + return self.playlist_from_matches( + kinja_embed_urls, video_id, video_title) + # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py new file mode 100644 index 000000000..79e3026d2 --- /dev/null +++ b/youtube_dl/extractor/kinja.py @@ -0,0 +1,221 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + unescapeHTML, + urljoin, +) + + +class KinjaEmbedIE(InfoExtractor): + IENAME = 'kinja:embed' + _DOMAIN_REGEX = r'''(?:[^.]+\.)? 
+ (?: + avclub| + clickhole| + deadspin| + gizmodo| + jalopnik| + jezebel| + kinja| + kotaku| + lifehacker| + splinternews| + the(?:inventory|onion|root|takeout) + )\.com''' + _COMMON_REGEX = r'''/ + (?: + ajax/inset| + embed/video + )/iframe\?.*?\bid=''' + _VALID_URL = r'''(?x)https?://%s%s + (?P<type> + fb| + imgur| + instagram| + jwp(?:layer)?-video| + kinjavideo| + mcp| + megaphone| + ooyala| + soundcloud(?:-playlist)?| + tumblr-post| + twitch-stream| + twitter| + ustream-channel| + vimeo| + vine| + youtube-(?:list|video) + )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _TESTS = [{ + 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD', + 'only_matching': True, + }, { + 'url': 
'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E', + 'only_matching': True, + }, { + 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE', + 'only_matching': True, + }] + _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform') + _PROVIDER_MAP = { + 'fb': ('facebook.com/video.php?v=', 'Facebook'), + 'imgur': ('imgur.com/', 'Imgur'), + 'instagram': ('instagram.com/p/', 'Instagram'), + 'jwplayer-video': _JWPLATFORM_PROVIDER, + 'jwp-video': _JWPLATFORM_PROVIDER, + 'megaphone': ('player.megaphone.fm/', 'Generic'), + 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), + 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), + 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), + 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), + 'twitch-stream': ('twitch.tv/', 'TwitchStream'), + 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'), + 'ustream-channel': ('ustream.tv/embed/', 'Ustream'), + 'vimeo': ('vimeo.com/', 'Vimeo'), + 'vine': ('vine.co/v/', 'Vine'), + 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'), + 'youtube-video': ('youtube.com/embed/', 'Youtube'), + } + + @staticmethod + def _extract_urls(webpage, url): + return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( + r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), + webpage)] + + def _real_extract(self, url): + video_type, video_id = re.match(self._VALID_URL, url).groups() + + provider = self._PROVIDER_MAP.get(video_type) + if provider: + video_id = compat_urllib_parse_unquote(video_id) + if video_type == 'tumblr-post': + video_id, blog = video_id.split('-', 1) + result_url = provider[0] % (blog, video_id) + elif video_type == 'youtube-list': + video_id, playlist_id = video_id.split('/') + result_url = provider[0] % (video_id, playlist_id) + else: + if 
video_type == 'ooyala': + video_id = video_id.split('/')[0] + result_url = provider[0] + video_id + return self.url_result('http://' + result_url, provider[1]) + + if video_type == 'kinjavideo': + data = self._download_json( + 'https://kinja.com/api/core/video/views/videoById', + video_id, query={'videoId': video_id})['data'] + title = data['title'] + + formats = [] + for k in ('signedPlaylist', 'streaming'): + m3u8_url = data.get(k + 'Url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + thumbnail = None + poster = data.get('poster') or {} + poster_id = poster.get('id') + if poster_id: + thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'formats': formats, + 'tags': data.get('tags'), + 'timestamp': int_or_none(try_get( + data, lambda x: x['postInfo']['publishTimeMillis']), 1000), + 'thumbnail': thumbnail, + 'uploader': data.get('network'), + } + else: + video_data = self._download_json( + 'https://api.vmh.univision.com/metadata/v1/content/' + video_id, + video_id)['videoMetadata'] + iptc = video_data['photoVideoMetadataIPTC'] + title = iptc['title']['en'] + fmg = video_data.get('photoVideoMetadata_fmg') or {} + tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' + data = self._download_json( + tvss_domain + '/api/v3/video-auth/url-signature-tokens', + video_id, query={'mcpids': video_id})['data'][0] + formats = [] + + rendition_url = data.get('renditionUrl') + if rendition_url: + formats = self._extract_m3u8_formats( + rendition_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + + fallback_rendition_url = data.get('fallbackRenditionUrl') + if fallback_rendition_url: + formats.append({ + 'format_id': 'fallback', + 'tbr': 
int_or_none(self._search_regex( + r'_(\d+)\.mp4', fallback_rendition_url, + 'bitrate', default=None)), + 'url': fallback_rendition_url, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'uploader': fmg.get('network'), + 'duration': int_or_none(iptc.get('fileDuration')), + 'formats': formats, + 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'timestamp': parse_iso8601(iptc.get('dateReleased')), + } diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 7f8c6f0d3..cf5c39e66 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,13 +4,8 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - compat_str, - int_or_none, - js_to_json, - parse_iso8601, - try_get, -) +from ..compat import compat_str +from ..utils import js_to_json class OnionStudiosIE(InfoExtractor): @@ -20,7 +15,7 @@ class OnionStudiosIE(InfoExtractor): 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', 'md5': '5a118d466d62b5cd03647cf2c593977f', 'info_dict': { - 'id': '2937', + 'id': '3459881', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', 'description': 'md5:545299bda6abf87e5ec666548c6a9448', @@ -53,43 +48,6 @@ class OnionStudiosIE(InfoExtractor): mcp_id = compat_str(self._parse_json(self._search_regex( r'window\.mcpMapping\s*=\s*({.+?});', webpage, 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) - video_data = self._download_json( - 'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, - mcp_id)['videoMetadata'] - iptc = video_data['photoVideoMetadataIPTC'] - title = iptc['title']['en'] - fmg = video_data.get('photoVideoMetadata_fmg') or {} - tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com' - data = 
self._download_json( - tvss_domain + '/api/v3/video-auth/url-signature-tokens', - mcp_id, query={'mcpids': mcp_id})['data'][0] - formats = [] - - rendition_url = data.get('renditionUrl') - if rendition_url: - formats = self._extract_m3u8_formats( - rendition_url, mcp_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) - - fallback_rendition_url = data.get('fallbackRenditionUrl') - if fallback_rendition_url: - formats.append({ - 'format_id': 'fallback', - 'tbr': int_or_none(self._search_regex( - r'_(\d+)\.mp4', fallback_rendition_url, - 'bitrate', default=None)), - 'url': fallback_rendition_url, - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), - 'uploader': fmg.get('network'), - 'duration': int_or_none(iptc.get('fileDuration')), - 'formats': formats, - 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), - 'timestamp': parse_iso8601(iptc.get('dateReleased')), - } + return self.url_result( + 'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id, + 'KinjaEmbed', mcp_id) From 5d92b407e0ea856e3dbadfef35e5258e94e0bb23 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 20:41:49 +0100 Subject: [PATCH 0171/1705] [mixcloud] improve extraction - improve metadata extraction(closes #11721) - fix playlist extraction(closes #22378) - fix user mixes extraction(closes #15197)(closes #17865) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/mixcloud.py | 498 +++++++++++++---------------- 2 files changed, 225 insertions(+), 274 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9e3b554fa..2f9ba6893 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -639,7 +639,6 @@ from .mixcloud import ( MixcloudIE, MixcloudUserIE, MixcloudPlaylistIE, - MixcloudStreamIE, ) from .mlb import MLBIE from .mnet 
import MnetIE diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index e5f631506..9759560f1 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import functools import itertools import re @@ -11,28 +10,37 @@ from ..compat import ( compat_ord, compat_str, compat_urllib_parse_unquote, - compat_urlparse, compat_zip ) from ..utils import ( - clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, - str_to_int, + parse_iso8601, + strip_or_none, try_get, - urljoin, ) -class MixcloudIE(InfoExtractor): +class MixcloudBaseIE(InfoExtractor): + def _call_api(self, object_type, object_fields, display_id, username, slug=None): + lookup_key = object_type + 'Lookup' + return self._download_json( + 'https://www.mixcloud.com/graphql', display_id, query={ + 'query': '''{ + %s(lookup: {username: "%s"%s}) { + %s + } +}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields) + })['data'][lookup_key] + + +class MixcloudIE(MixcloudBaseIE): _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { - 'id': 'dholbach-cryptkeeper', + 'id': 'dholbach_cryptkeeper', 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', @@ -40,11 +48,13 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, + 'timestamp': 1321359578, + 'upload_date': '20111115', }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { - 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', + 'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat', 'ext': 'mp3', 'title': 'Caribou 7 inch 
Vinyl Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', @@ -52,11 +62,14 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*', 'view_count': int, + 'timestamp': 1422987057, + 'upload_date': '20150203', }, }, { 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', 'only_matching': True, }] + _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD' @staticmethod def _decrypt_xor_cipher(key, ciphertext): @@ -66,177 +79,193 @@ class MixcloudIE(InfoExtractor): for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader = mobj.group(1) - cloudcast_name = mobj.group(2) - track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name))) + username, slug = re.match(self._VALID_URL, url).groups() + username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug) + track_id = '%s_%s' % (username, slug) - webpage = self._download_webpage(url, track_id) + cloudcast = self._call_api('cloudcast', '''audioLength + comments(first: 100) { + edges { + node { + comment + created + user { + displayName + username + } + } + } + totalCount + } + description + favorites { + totalCount + } + featuringArtistList + isExclusive + name + owner { + displayName + url + username + } + picture(width: 1024, height: 1024) { + url + } + plays + publishDate + reposts { + totalCount + } + streamInfo { + dashUrl + hlsUrl + url + } + tags { + tag { + name + } + }''', track_id, username, slug) - # Legacy path - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + title = cloudcast['name'] - if encrypted_play_info is not None: - # Decode - encrypted_play_info = compat_b64decode(encrypted_play_info) - else: - # New path - full_info_json = self._parse_json(self._html_search_regex( - r'<script id="relay-data" 
type="text/x-mixcloud">([^<]+)</script>', - webpage, 'play info'), 'play info') - for item in full_info_json: - item_data = try_get(item, [ - lambda x: x['cloudcast']['data']['cloudcastLookup'], - lambda x: x['cloudcastLookup']['data']['cloudcastLookup'], - ], dict) - if try_get(item_data, lambda x: x['streamInfo']['url']): - info_json = item_data - break - else: - raise ExtractorError('Failed to extract matching stream info') + stream_info = cloudcast['streamInfo'] + formats = [] - message = self._html_search_regex( - r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', - webpage, 'error message', default=None) - - js_url = self._search_regex( - r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', - webpage, 'js url') - js = self._download_webpage(js_url, track_id, 'Downloading JS') - # Known plaintext attack - if encrypted_play_info: - kps = ['{"stream_url":'] - kpa_target = encrypted_play_info - else: - kps = ['https://', 'http://'] - kpa_target = compat_b64decode(info_json['streamInfo']['url']) - for kp in kps: - partial_key = self._decrypt_xor_cipher(kpa_target, kp) - for quote in ["'", '"']: - key = self._search_regex( - r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), - js, 'encryption key', default=None) - if key is not None: - break - else: + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: continue - break - else: - raise ExtractorError('Failed to extract encryption key') + decrypted = self._decrypt_xor_cipher( + self._DECRYPTION_KEY, compat_b64decode(format_url)) + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 
'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, + }) - if encrypted_play_info is not None: - play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - song_url = play_info['stream_url'] - formats = [{ - 'format_id': 'normal', - 'url': song_url - }] + if not formats and cloudcast.get('isExclusive'): + self.raise_login_required() - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', - r'/listeners/?">([0-9,.]+)</a>', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + self._sort_formats(formats) - else: - title = info_json['name'] - thumbnail = urljoin( - 'https://thumbnailer.mixcloud.com/unsafe/600x600/', - try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) - uploader = try_get(info_json, lambda x: x['owner']['displayName']) - uploader_id = try_get(info_json, lambda x: x['owner']['username']) - description = try_get(info_json, lambda x: x['description']) - view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) + comments = [] + for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): + node = edge.get('node') or {} + text = strip_or_none(node.get('comment')) + if not text: + continue + user = node.get('user') or {} + comments.append({ + 
'author': user.get('displayName'), + 'author_id': user.get('username'), + 'text': text, + 'timestamp': parse_iso8601(node.get('created')), + }) - stream_info = info_json['streamInfo'] - formats = [] + tags = [] + for t in cloudcast.get('tags'): + tag = try_get(t, lambda x: x['tag']['name'], compat_str) + if not tag: + tags.append(tag) - def decrypt_url(f_url): - for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): - decrypted_url = self._decrypt_xor_cipher(k, f_url) - if re.search(r'^https?://[0-9A-Za-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): - return decrypted_url + get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - for url_key in ('url', 'hlsUrl', 'dashUrl'): - format_url = stream_info.get(url_key) - if not format_url: - continue - decrypted = decrypt_url(compat_b64decode(format_url)) - if not decrypted: - continue - if url_key == 'hlsUrl': - formats.extend(self._extract_m3u8_formats( - decrypted, track_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif url_key == 'dashUrl': - formats.extend(self._extract_mpd_formats( - decrypted, track_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'format_id': 'http', - 'url': decrypted, - 'downloader_options': { - # Mixcloud starts throttling at >~5M - 'http_chunk_size': 5242880, - }, - }) - self._sort_formats(formats) + owner = cloudcast.get('owner') or {} return { 'id': track_id, 'title': title, 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, + 'description': cloudcast.get('description'), + 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str), + 'uploader': owner.get('displayName'), + 'timestamp': parse_iso8601(cloudcast.get('publishDate')), + 'uploader_id': owner.get('username'), + 'uploader_url': owner.get('url'), + 'duration': int_or_none(cloudcast.get('audioLength')), + 'view_count': 
int_or_none(cloudcast.get('plays')), + 'like_count': get_count('favorites'), + 'repost_count': get_count('reposts'), + 'comment_count': get_count('comments'), + 'comments': comments, + 'tags': tags, + 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, } -class MixcloudPlaylistBaseIE(InfoExtractor): - _PAGE_SIZE = 24 +class MixcloudPlaylistBaseIE(MixcloudBaseIE): + def _get_cloudcast(self, node): + return node - def _find_urls_in_page(self, page): - for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page): - yield self.url_result( - compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)), - MixcloudIE.ie_key()) + def _get_playlist_title(self, title, slug): + return title - def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None): - real_page_number = real_page_number or current_page + 1 - return self._download_webpage( - 'https://www.mixcloud.com/%s/' % path, video_id, - note='Download %s (page %d)' % (page_name, current_page + 1), - errnote='Unable to download %s' % page_name, - query={'page': real_page_number, 'list': 'main', '_ajax': '1'}, - headers={'X-Requested-With': 'XMLHttpRequest'}) + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + username = compat_urllib_parse_unquote(username) + if not slug: + slug = 'uploads' + else: + slug = compat_urllib_parse_unquote(slug) + playlist_id = '%s_%s' % (username, slug) - def _tracks_page_func(self, page, video_id, page_name, current_page): - resp = self._fetch_tracks_page(page, video_id, page_name, current_page) + is_playlist_type = self._ROOT_TYPE == 'playlist' + playlist_type = 'items' if is_playlist_type else slug + list_filter = '' - for item in self._find_urls_in_page(resp): - yield item + has_next_page = True + entries = [] + while has_next_page: + playlist = self._call_api( + self._ROOT_TYPE, '''%s + %s + %s(first: 100%s) { + edges { + node { + %s + } + } + pageInfo { + endCursor + 
hasNextPage + } + }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE), + playlist_id, username, slug if is_playlist_type else None) - def _get_user_description(self, page_content): - return self._html_search_regex( - r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>', - page_content, 'user description', fatal=False) + items = playlist.get(playlist_type) or {} + for edge in items.get('edges', []): + cloudcast = self._get_cloudcast(edge.get('node') or {}) + cloudcast_url = cloudcast.get('url') + if not cloudcast_url: + continue + entries.append(self.url_result( + cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) + + page_info = items['pageInfo'] + has_next_page = page_info['hasNextPage'] + list_filter = ', after: "%s"' % page_info['endCursor'] + + return self.playlist_result( + entries, playlist_id, + self._get_playlist_title(playlist[self._TITLE_KEY], slug), + playlist.get(self._DESCRIPTION_KEY)) class MixcloudUserIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$' + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$' IE_NAME = 'mixcloud:user' _TESTS = [{ @@ -244,68 +273,58 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'playlist_mincount': 11, + 'playlist_mincount': 36, }, { 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'playlist_mincount': 11, + 'playlist_mincount': 36, }, { 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', 'title': 
'Daniel Holbach (favorites)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'params': { - 'playlist_items': '1-100', - }, - 'playlist_mincount': 100, + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 396, }, { 'url': 'http://www.mixcloud.com/dholbach/listens/', 'info_dict': { 'id': 'dholbach_listens', 'title': 'Daniel Holbach (listens)', - 'description': 'md5:def36060ac8747b3aabca54924897e47', + 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', }, - 'params': { - 'playlist_items': '1-100', + # 'params': { + # 'playlist_items': '1-100', + # }, + 'playlist_mincount': 1623, + 'skip': 'Large list', + }, { + 'url': 'https://www.mixcloud.com/FirstEar/stream/', + 'info_dict': { + 'id': 'FirstEar_stream', + 'title': 'First Ear (stream)', + 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', }, - 'playlist_mincount': 100, + 'playlist_mincount': 271, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user') - list_type = mobj.group('type') + _TITLE_KEY = 'displayName' + _DESCRIPTION_KEY = 'biog' + _ROOT_TYPE = 'user' + _NODE_TEMPLATE = '''slug + url''' - # if only a profile URL was supplied, default to download all uploads - if list_type is None: - list_type = 'uploads' - - video_id = '%s_%s' % (user_id, list_type) - - profile = self._download_webpage( - 'https://www.mixcloud.com/%s/' % user_id, video_id, - note='Downloading user profile', - errnote='Unable to download user profile') - - username = self._og_search_title(profile) - description = self._get_user_description(profile) - - entries = OnDemandPagedList( - functools.partial( - self._tracks_page_func, - '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type), - self._PAGE_SIZE) - - return self.playlist_result( - entries, video_id, '%s (%s)' % (username, list_type), description) + def _get_playlist_title(self, title, slug): + return '%s (%s)' 
% (title, slug) class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): @@ -313,87 +332,20 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): IE_NAME = 'mixcloud:playlist' _TESTS = [{ - 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/', - 'info_dict': { - 'id': 'RedBullThre3style_tokyo-finalists-2015', - 'title': 'National Champions 2015', - 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3', - }, - 'playlist_mincount': 16, - }, { 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user') - playlist_id = mobj.group('playlist') - video_id = '%s_%s' % (user_id, playlist_id) - - webpage = self._download_webpage( - url, user_id, - note='Downloading playlist page', - errnote='Unable to download playlist page') - - title = self._html_search_regex( - r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)', - webpage, 'playlist title', - default=None) or self._og_search_title(webpage, fatal=False) - description = self._get_user_description(webpage) - - entries = OnDemandPagedList( - functools.partial( - self._tracks_page_func, - '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'), - self._PAGE_SIZE) - - return self.playlist_result(entries, video_id, title, description) - - -class MixcloudStreamIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$' - IE_NAME = 'mixcloud:stream' - - _TEST = { - 'url': 'https://www.mixcloud.com/FirstEar/stream/', 'info_dict': { - 'id': 'FirstEar', - 'title': 'First Ear', - 'description': 'Curators of good music\nfirstearmusic.com', + 'id': 'maxvibes_jazzcat-on-ness-radio', + 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 192, - } + 'playlist_mincount': 59, + }] + _TITLE_KEY = 'name' + _DESCRIPTION_KEY = 'description' + _ROOT_TYPE = 'playlist' + _NODE_TEMPLATE = '''cloudcast { + 
slug + url + }''' - def _real_extract(self, url): - user_id = self._match_id(url) - - webpage = self._download_webpage(url, user_id) - - entries = [] - prev_page_url = None - - def _handle_page(page): - entries.extend(self._find_urls_in_page(page)) - return self._search_regex( - r'm-next-page-url="([^"]+)"', page, - 'next page URL', default=None) - - next_page_url = _handle_page(webpage) - - for idx in itertools.count(0): - if not next_page_url or prev_page_url == next_page_url: - break - - prev_page_url = next_page_url - current_page = int(self._search_regex( - r'\?page=(\d+)', next_page_url, 'next page number')) - - next_page_url = _handle_page(self._fetch_tracks_page( - '%s/stream' % user_id, user_id, 'stream', idx, - real_page_number=current_page)) - - username = self._og_search_title(webpage) - description = self._get_user_description(webpage) - - return self.playlist_result(entries, user_id, username, description) + def _get_cloudcast(self, node): + return node.get('cloudcast') or {} From d4f53af482cc47b0473a3576da7ad902bea4ac39 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 6 Nov 2019 23:14:26 +0100 Subject: [PATCH 0172/1705] [lnkgo] fix extraction(closes #16834) --- youtube_dl/extractor/lnkgo.py | 100 ++++++++++++---------------------- 1 file changed, 36 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py index cfec0d3d0..3e71852aa 100644 --- a/youtube_dl/extractor/lnkgo.py +++ b/youtube_dl/extractor/lnkgo.py @@ -5,24 +5,27 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + compat_str, int_or_none, - unified_strdate, + parse_iso8601, ) class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnkgo\.(?:alfa\.)?lt/visi-video/(?P<show>[^/]+)/ziurek-(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?' 
_TESTS = [{ - 'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162', + 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', 'info_dict': { - 'id': '46712', + 'id': '10809', 'ext': 'mp4', - 'title': 'Yra kaip yra', - 'upload_date': '20150107', - 'description': 'md5:d82a5e36b775b7048617f263a0e3475e', - 'age_limit': 7, - 'duration': 3019, - 'thumbnail': r're:^https?://.*\.jpg$' + 'title': "Put'ka: Trys Klausimai", + 'upload_date': '20161216', + 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', + 'age_limit': 18, + 'duration': 117, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1481904000, }, 'params': { 'skip_download': True, # HLS download @@ -30,20 +33,21 @@ class LnkGoIE(InfoExtractor): }, { 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', 'info_dict': { - 'id': '47289', + 'id': '10467', 'ext': 'mp4', 'title': 'Nėrdas: Kompiuterio Valymas', 'upload_date': '20150113', 'description': 'md5:7352d113a242a808676ff17e69db6a69', 'age_limit': 18, 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1421164800, }, 'params': { 'skip_download': True, # HLS download }, }, { - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', + 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', 'only_matching': True, }] _AGE_LIMITS = { @@ -51,66 +55,34 @@ class LnkGoIE(InfoExtractor): 'N-14': 14, 'S': 18, } + _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' def _real_extract(self, url): - display_id = self._match_id(url) + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - url, display_id, 'Downloading player webpage') - - video_id = self._search_regex( - r'data-ep="([^"]+)"', webpage, 'video ID') - title = self._og_search_title(webpage) - description = 
self._og_search_description(webpage) - upload_date = unified_strdate(self._search_regex( - r'class="[^"]*meta-item[^"]*air-time[^"]*">.*?<strong>([^<]+)</strong>', webpage, 'upload date', fatal=False)) - - thumbnail_w = int_or_none( - self._og_search_property('image:width', webpage, 'thumbnail width', fatal=False)) - thumbnail_h = int_or_none( - self._og_search_property('image:height', webpage, 'thumbnail height', fatal=False)) - thumbnail = { - 'url': self._og_search_thumbnail(webpage), - } - if thumbnail_w and thumbnail_h: - thumbnail.update({ - 'width': thumbnail_w, - 'height': thumbnail_h, - }) - - config = self._parse_json(self._search_regex( - r'episodePlayer\((\{.*?\}),\s*\{', webpage, 'sources'), video_id) - - if config.get('pGeo'): - self.report_warning( - 'This content might not be available in your country due to copyright reasons') - - formats = [{ - 'format_id': 'hls', - 'ext': 'mp4', - 'url': config['EpisodeVideoLink_HLS'], - }] - - m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$', config['EpisodeVideoLink']) - if m: - formats.append({ - 'format_id': 'rtmp', - 'ext': 'flv', - 'url': m.group('url'), - 'play_path': m.group('play_path'), - 'page_url': url, - }) + video_info = self._download_json( + 'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'), + display_id)['videoConfig']['videoInfo'] + video_id = compat_str(video_info['id']) + title = video_info['title'] + prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' + formats = self._extract_m3u8_formats( + self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), + video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) + poster_image = video_info.get('posterImage') + return { 'id': video_id, 'display_id': display_id, 'title': title, 'formats': formats, - 'thumbnails': [thumbnail], - 'duration': int_or_none(config.get('VideoTime')), - 'description': description, - 'age_limit': 
self._AGE_LIMITS.get(config.get('PGRating'), 0), - 'upload_date': upload_date, + 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None, + 'duration': int_or_none(video_info.get('duration')), + 'description': clean_html(video_info.get('htmlDescription')), + 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), + 'timestamp': parse_iso8601(video_info.get('airDate')), + 'view_count': int_or_none(video_info.get('viewsCount')), } From 0b16b3c2d35d1706ec5c55e5b06352c753127368 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 9 Nov 2019 09:22:24 +0100 Subject: [PATCH 0173/1705] [twitch] add support for Clip embed URLs --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ca7676fe2..a5681409c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -644,7 +644,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -667,6 +667,9 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', 'only_matching': True, + }, { + 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', + 'only_matching': True, }] def _real_extract(self, url): From 18ca61c5e153d1c1cb8b9a2de3c8b9dfdaa69b0e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 9 Nov 2019 09:23:20 +0100 Subject: [PATCH 0174/1705] [twitter] improve extraction - add support for generic embeds(closes #22168) - always extract http formats for native 
videos(closes #14934) - add support for Twitter Broadcasts(closes #21369) - extract more metadata - improve VMap format extraction - unify extraction code for both twitter statuses and cards --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/periscope.py | 80 ++-- youtube_dl/extractor/twitter.py | 570 +++++++++++++++-------------- 3 files changed, 344 insertions(+), 307 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f9ba6893..598006061 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1241,6 +1241,7 @@ from .twitter import ( TwitterCardIE, TwitterIE, TwitterAmplifyIE, + TwitterBroadcastIE, ) from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b337a56c0..c02e34aba 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -17,12 +17,54 @@ class PeriscopeBaseIE(InfoExtractor): 'https://api.periscope.tv/api/v2/%s' % method, item_id, query=query) + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast['status'] + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + is_live = broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), 
int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return m3u8_formats + class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' - # Alive example URLs can be found here http://onperiscope.com/ + # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', @@ -61,21 +103,9 @@ class PeriscopeIE(PeriscopeBaseIE): 'accessVideoPublic', {'broadcast_id': token}, token) broadcast = stream['broadcast'] - title = broadcast['status'] + info = self._parse_broadcast_data(broadcast, token) - uploader = broadcast.get('user_display_name') or broadcast.get('username') - uploader_id = (broadcast.get('user_id') or broadcast.get('username')) - - title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() - if state == 'running': - title = self._live_title(title) - timestamp = parse_iso8601(broadcast.get('created_at')) - - thumbnails = [{ - 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - width = int_or_none(broadcast.get('width')) height = int_or_none(broadcast.get('height')) @@ -92,32 +122,20 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) 
if format_id != 'rtmp': - m3u8_formats = self._extract_m3u8_formats( - video_url, token, 'mp4', - entry_protocol='m3u8_native' - if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False) - if len(m3u8_formats) == 1: - add_width_and_height(m3u8_formats[0]) + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) formats.extend(m3u8_formats) continue rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } - add_width_and_height(rtmp_format) + self._add_width_and_height(rtmp_format) formats.append(rtmp_format) self._sort_formats(formats) - return { - 'id': broadcast.get('id') or token, - 'title': title, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnails': thumbnails, - 'formats': formats, - } + info['formats'] = formats + return info class PeriscopeUserIE(PeriscopeBaseIE): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index cebb6238c..5f8d90fb4 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,32 +4,67 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( - determine_ext, dict_get, ExtractorError, float_or_none, int_or_none, - remove_end, try_get, + strip_or_none, + unified_timestamp, + update_url_query, xpath_text, ) -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeBaseIE, + PeriscopeIE, +) class TwitterBaseIE(InfoExtractor): + _API_BASE = 'https://api.twitter.com/1.1/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _GUEST_TOKEN = None + + def _extract_variant_formats(self, variant, video_id): + variant_url = variant.get('url') + if not variant_url: + return [] + elif '.m3u8' in 
variant_url: + return self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + else: + tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None + f = { + 'url': variant_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + } + self._search_dimensions_in_video_url(f, variant_url) + return [f] + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() - if determine_ext(video_url) == 'm3u8': - return self._extract_m3u8_formats( - video_url, video_id, ext='mp4', m3u8_id='hls', - entry_protocol='m3u8_native') - return [{ - 'url': video_url, - }] + formats = [] + urls = [] + for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): + video_variant.attrib['url'] = compat_urllib_parse_unquote( + video_variant.attrib['url']) + urls.append(video_variant.attrib['url']) + formats.extend(self._extract_variant_formats( + video_variant.attrib, video_id)) + video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) + if video_url not in urls: + formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) + return formats @staticmethod def _search_dimensions_in_video_url(a_format, video_url): @@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor): 'height': int(m.group('height')), }) + def _call_api(self, path, video_id, query={}): + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + } + if not self._GUEST_TOKEN: + self._GUEST_TOKEN = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = self._GUEST_TOKEN + try: + return self._download_json( + self._API_BASE + path, video_id, 
headers=headers, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), + video_id)['errors'][0]['message'], expected=True) + raise -class TwitterCardIE(TwitterBaseIE): + +class TwitterCardIE(InfoExtractor): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", + 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.033, + 'timestamp': 1422366112, + 'upload_date': '20150127', }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', - 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'md5': '7137eca597f72b9abbe61e5ae0161399', 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*$', + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. 
https://t.co/BJYgOjSeGA", + 'uploader': 'NASA', + 'uploader_id': 'NASA', + 'timestamp': 1437408129, + 'upload_date': '20150720', }, }, { @@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Ubuntu 11.10 Overview', 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', - 'uploader': 'OMG! Ubuntu!', + 'uploader': 'OMG! UBUNTU!', 'uploader_id': 'omgubuntu', }, 'add_ie': ['Youtube'], @@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", + 'uploader': 'Brent Yarina', + 'uploader_id': 'BTNBrentYarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, + 'skip': 'This content is no longer available.', }, { 'url': 'https://twitter.com/i/videos/752274308186120192', 'only_matching': True, }, ] - _API_BASE = 'https://api.twitter.com/1.1' - - def _parse_media_info(self, media_info, video_id): - formats = [] - for media_variant in media_info.get('variants', []): - media_url = media_variant['url'] - if media_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) - elif media_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) - else: - tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) - a_format = { - 'url': media_url, - 'format_id': 'http-%d' % tbr if tbr else 'http', - 'tbr': tbr, - } - # Reported bitRate may be zero - if not a_format['tbr']: - del a_format['tbr'] - - self._search_dimensions_in_video_url(a_format, media_url) - - formats.append(a_format) - return formats - - def _extract_mobile_formats(self, username, video_id): - 
webpage = self._download_webpage( - 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), - video_id, 'Downloading mobile webpage', - headers={ - # A recent mobile UA is necessary for `gt` cookie - 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', - }) - main_script_url = self._html_search_regex( - r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') - main_script = self._download_webpage( - main_script_url, video_id, 'Downloading main script') - bearer_token = self._search_regex( - r'BEARER_TOKEN\s*:\s*"([^"]+)"', - main_script, 'bearer token') - # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id - api_data = self._download_json( - '%s/statuses/show/%s.json' % (self._API_BASE, video_id), - video_id, 'Downloading API data', - headers={ - 'Authorization': 'Bearer ' + bearer_token, - }) - media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} - return self._parse_media_info(media_info, video_id) - def _real_extract(self, url): - path, video_id = re.search(self._VALID_URL, url).groups() - - config = None - formats = [] - duration = None - - urls = [url] - if path.startswith('cards/'): - urls.append('https://twitter.com/i/videos/' + video_id) - - for u in urls: - webpage = self._download_webpage( - u, video_id, headers={'Referer': 'https://twitter.com/'}) - - iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) - - config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, - 'data player config', default='{}'), - video_id) - - if config.get('source_type') == 'vine': - return self.url_result(config['player_url'], 'Vine') - - periscope_url = PeriscopeIE._extract_url(webpage) - if periscope_url: - 
return self.url_result(periscope_url, PeriscopeIE.ie_key()) - - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') - - if video_url: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) - else: - f = { - 'url': video_url, - } - - self._search_dimensions_in_video_url(f, video_url) - - formats.append(f) - - vmap_url = config.get('vmapUrl') or config.get('vmap_url') - if vmap_url: - formats.extend( - self._extract_formats_from_vmap_url(vmap_url, video_id)) - - media_info = None - - for entity in config.get('status', {}).get('entities', []): - if 'mediaInfo' in entity: - media_info = entity['mediaInfo'] - - if media_info: - formats.extend(self._parse_media_info(media_info, video_id)) - duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) - - username = config.get('user', {}).get('screen_name') - if username: - formats.extend(self._extract_mobile_formats(username, video_id)) - - if formats: - title = self._search_regex(r'<title>([^<]+)', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - break - - if not formats: - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - 'Referer': url, - } - ct0 = self._get_cookies(url).get('ct0') - if ct0: - headers['csrf_token'] = ct0.value - guest_token = self._download_json( - '%s/guest/activate.json' % self._API_BASE, video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = guest_token - self._set_cookie('api.twitter.com', 'gt', guest_token) - config = self._download_json( - '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), - video_id, headers=headers) - track = config['track'] - vmap_url = track.get('vmapUrl') - 
if vmap_url: - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) - else: - playback_url = track['playbackUrl'] - if determine_ext(playback_url) == 'm3u8': - formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - else: - formats = [{ - 'url': playback_url, - }] - title = 'Twitter web player' - thumbnail = config.get('posterImage') - duration = float_or_none(track.get('durationMs'), scale=1000) - - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + status_id = self._match_id(url) + return self.url_result( + 'https://twitter.com/statuses/' + status_id, + TwitterIE.ie_key(), status_id) -class TwitterIE(InfoExtractor): +class TwitterIE(TwitterBaseIE): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' - _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P\d+)' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -291,10 +195,13 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'description': 'FTN supporters on Hollywood Blvd today! 
http://t.co/c7jHH749xJ', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', 'duration': 12.922, + 'timestamp': 1442188653, + 'upload_date': '20150913', + 'age_limit': 18, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -316,19 +223,23 @@ class TwitterIE(InfoExtractor): 'id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', - 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', 'uploader': 'Star Wars', + 'timestamp': 1447395772, + 'upload_date': '20151113', }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', - 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", 'uploader_id': 'BTNBrentYarina', 'uploader': 'Brent Yarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 @@ -340,12 +251,14 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'JG', - 'uploader_id': 'jaydingeer', + 'uploader': 'Simon Vertugo', + 'uploader_id': 'simonvertugo', 'duration': 30.0, + 'timestamp': 1455777459, + 'upload_date': '20160218', }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -353,10 +266,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'Vince Mancini - Vine of the day', - 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'Vince Mancini', - 'uploader_id': 'Filmdrunk', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', }, @@ -367,21 +279,22 @@ class TwitterIE(InfoExtractor): 'id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', - 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', - 'uploader_id': 'captainamerica', + 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. 
https://t.co/GpgYi9xMJI', + 'uploader_id': 'CaptainAmerica', 'uploader': 'Captain America', 'duration': 3.17, + 'timestamp': 1460483005, + 'upload_date': '20160412', }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'info_dict': { 'id': '1zqKVVlkqLaKB', 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', - 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', 'upload_date': '20160923', - 'uploader_id': 'OPP_HSD', - 'uploader': 'Sgt Kerry Schmidt', + 'uploader_id': '1PmKqpJdOJQoY', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, }, 'add_ie': ['Periscope'], @@ -392,10 +305,12 @@ class TwitterIE(InfoExtractor): 'id': '852138619213144067', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', - 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', 'duration': 277.4, + 'timestamp': 1492000653, + 'upload_date': '20170412', }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -404,10 +319,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. 
Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo', 'uploader': 'Préfet de Guadeloupe', 'uploader_id': 'Prefet971', 'duration': 47.48, + 'timestamp': 1505803395, + 'upload_date': '20170919', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -420,10 +337,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', 'uploader': 'Lis Power', 'uploader_id': 'LisPower1', 'duration': 111.278, + 'timestamp': 1527623489, + 'upload_date': '20180529', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -435,88 +354,163 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:66d493500c013e3e2d434195746a7f78', + 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', 'uploader': 'Twitter', 'uploader_id': 'Twitter', 'duration': 61.567, + 'timestamp': 1548184644, + 'upload_date': '20190122', }, + }, { + # not available in Periscope + 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', + 'info_dict': { + 'id': '1vOGwqejwoWxB', + 'ext': 'mp4', + 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', + 'uploader': 'Vivi', + 'uploader_id': '1eVjYOLGkGrQL', + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Twitch Clip Embed + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - twid = mobj.group('id') - - webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_STATUSES_URL % twid, twid) - - if 'twitter.com/account/suspended' in urlh.geturl(): - raise ExtractorError('Account suspended by Twitter.', expected=True) - - user_id = None - - redirect_mobj = re.match(self._VALID_URL, urlh.geturl()) - if redirect_mobj: - user_id = redirect_mobj.group('user_id') - - if not user_id: - user_id = mobj.group('user_id') - - username = remove_end(self._og_search_title(webpage), ' on Twitter') - - title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') + twid = self._match_id(url) + status = self._call_api( + 'statuses/show/%s.json' % twid, twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }) + title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) + user = status.get('user') or {} + uploader = user.get('name') + if uploader: + 
title = '%s - %s' % (uploader, title) + uploader_id = user.get('screen_name') + + tags = [] + for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): + hashtag_text = hashtag.get('text') + if not hashtag_text: + continue + tags.append(hashtag_text) info = { - 'uploader_id': user_id, - 'uploader': username, - 'webpage_url': url, - 'description': '%s on Twitter: "%s"' % (username, description), - 'title': username + ' - ' + title, + 'id': twid, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'like_count': int_or_none(status.get('favorite_count')), + 'repost_count': int_or_none(status.get('retweet_count')), + 'comment_count': int_or_none(status.get('reply_count')), + 'age_limit': 18 if status.get('possibly_sensitive') else 0, + 'tags': tags, } - mobj = re.search(r'''(?x) - ]+class="animated-gif"(?P[^>]+)>\s* - ]+video-src="(?P[^"]+)" - ''', webpage) + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + video_info = media.get('video_info') or {} + + formats = [] + for variant in video_info.get('variants', []): + formats.extend(self._extract_variant_formats(variant, twid)) + self._sort_formats(formats) + + thumbnails = [] + media_url = media.get('media_url_https') or media.get('media_url') + if media_url: + def add_thumbnail(name, size): + thumbnails.append({ + 'id': name, + 'url': update_url_query(media_url, {'name': name}), + 'width': int_or_none(size.get('w') or size.get('width')), + 'height': int_or_none(size.get('h') or size.get('height')), + }) + for name, size in media.get('sizes', {}).items(): + add_thumbnail(name, size) + add_thumbnail('orig', media.get('original_info') or {}) - if mobj: - more_info = mobj.group('more_info') - height = int_or_none(self._search_regex( - 
r'data-height="(\d+)"', more_info, 'height', fatal=False)) - width = int_or_none(self._search_regex( - r'data-width="(\d+)"', more_info, 'width', fatal=False)) - thumbnail = self._search_regex( - r'poster="([^"]+)"', more_info, 'poster', fatal=False) info.update({ - 'id': twid, - 'url': mobj.group('url'), - 'height': height, - 'width': width, - 'thumbnail': thumbnail, + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) - return info - - twitter_card_url = None - if 'class="PlayableMedia' in webpage: - twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) else: - twitter_card_iframe_url = self._search_regex( - r'data-full-card-iframe-url=([\'"])(?P(?:(?!\1).)+)\1', - webpage, 'Twitter card iframe URL', default=None, group='url') - if twitter_card_iframe_url: - twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) + card = status.get('card') + if card: + binding_values = card['binding_values'] - if twitter_card_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': twitter_card_url, - }) - return info + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) - raise ExtractorError('There\'s no video in this tweet.') + card_name = card['name'].split(':')[-1] + if card_name == 'amplify': + formats = self._extract_formats_from_vmap_url( + get_binding_value('amplify_url_vmap'), + get_binding_value('amplify_content_id') or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': 
int_or_none(image.get('height')), + }) + + info.update({ + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + }) + elif card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + else: + raise ExtractorError('Unsupported Twitter Card.') + else: + expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) + if not expanded_url: + raise ExtractorError("There's no video in this tweet.") + info.update({ + '_type': 'url', + 'url': expanded_url, + }) + return info class TwitterAmplifyIE(TwitterBaseIE): @@ -573,3 +567,27 @@ class TwitterAmplifyIE(TwitterBaseIE): 'formats': formats, 'thumbnails': thumbnails, } + + +class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): + IE_NAME = 'twitter:broadcast' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P[0-9a-zA-Z]{13})' + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + broadcast = self._call_api( + 'broadcasts/show.json', broadcast_id, + {'ids': broadcast_id})['broadcasts'][broadcast_id] + info = self._parse_broadcast_data(broadcast, broadcast_id) + media_key = broadcast['media_key'] + source = self._call_api( + 'live_video_stream/status/' + media_key, media_key)['source'] + m3u8_url = source.get('noRedirectPlaybackUrl') or source['location'] + if '/live_video_stream/geoblocked/' in m3u8_url: + self.raise_geo_restricted() + m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( + m3u8_url).query).get('type', [None])[0] + state, width, height = self._extract_common_format_info(broadcast) + info['formats'] = 
self._extract_pscp_m3u8_formats( + m3u8_url, broadcast_id, m3u8_id, state, width, height) + return info From ce112a8c19ebcc9d401ff26a5cdcf58ba565901c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 11:01:07 +0100 Subject: [PATCH 0175/1705] [twitch] fix video comments URL(#18593)(closes #15828) --- youtube_dl/extractor/twitch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a5681409c..8c0d70010 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -344,9 +344,8 @@ class TwitchVodIE(TwitchItemBaseIE): info['subtitles'] = { 'rechat': [{ 'url': update_url_query( - 'https://rechat.twitch.tv/rechat-messages', { - 'video_id': 'v%s' % item_id, - 'start': info['timestamp'], + 'https://api.twitch.tv/v5/videos/%s/comments' % item_id, { + 'client_id': self._CLIENT_ID, }), 'ext': 'json', }], From f81dd65ba2c1e7be549e5c8cfe6cbf0f0829edfe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 13:11:59 +0100 Subject: [PATCH 0176/1705] [extractor/common] clean jwplayer description HTML tags --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4a683f6d6..4c2f9303e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2689,7 +2689,7 @@ class InfoExtractor(object): entry = { 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), + 'description': clean_html(video_data.get('description')), 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), From 8fbf5d2f87fbfe0441bc20cf69d506109b2810bc Mon Sep 17 00:00:00 2001 From: 
Remita Amine Date: Sat, 9 Nov 2019 13:14:23 +0100 Subject: [PATCH 0177/1705] [seeker] remove Revision3 extractors and fix extraction --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/revision3.py | 170 ----------------------------- youtube_dl/extractor/seeker.py | 45 ++++---- 3 files changed, 23 insertions(+), 196 deletions(-) delete mode 100644 youtube_dl/extractor/revision3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 598006061..8df9d95b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -932,10 +932,6 @@ from .rentv import ( from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE -from .revision3 import ( - Revision3EmbedIE, - Revision3IE, -) from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py deleted file mode 100644 index 833d8a2f0..000000000 --- a/youtube_dl/extractor/revision3.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_iso8601, - unescapeHTML, - qualities, -) - - -class Revision3EmbedIE(InfoExtractor): - IE_NAME = 'revision3:embed' - _VALID_URL = r'(?:revision3:(?:(?P[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P\d+)' - _TEST = { - 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', - 'info_dict': { - 'id': '67558', - 'ext': 'mp4', - 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader_id': 'dnews', - 'uploader': 'DNews', - } - } - _API_KEY = 
'ba9c741bce1b9d8e3defcc22193f3651b8867e62' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('playlist_id') - playlist_type = mobj.group('playlist_type') or 'video_id' - video_data = self._download_json( - 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ - 'api_key': self._API_KEY, - 'codecs': 'h264,vp8,theora', - playlist_type: playlist_id, - })['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], playlist_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) - - return { - 'id': playlist_id, - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, - } - - -class Revision3IE(InfoExtractor): - IE_NAME = 'revision' - _VALID_URL = r'https?://(?:www\.)?(?P(?:revision3|animalist)\.com)/(?P[^/]+(?:/[^/?#]+)?)' - _TESTS = [{ - 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', - 'md5': 'd94a72d85d0a829766de4deb8daaf7df', - 'info_dict': { - 'id': '71089', - 'display_id': 'technobuffalo/5-google-predictions-for-2016', - 'ext': 'webm', - 'title': '5 Google Predictions for 2016', - 'description': 'Google had a great 2015, but it\'s already time to look ahead. 
Here are our five predictions for 2016.', - 'upload_date': '20151228', - 'timestamp': 1451325600, - 'duration': 187, - 'uploader': 'TechnoBuffalo', - 'uploader_id': 'technobuffalo', - } - }, { - # Show - 'url': 'http://revision3.com/variant', - 'only_matching': True, - }, { - # Tag - 'url': 'http://revision3.com/vr', - 'only_matching': True, - }] - _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - - def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - site = domain.split('.')[0] - page_info = self._download_json( - self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) - - page_data = page_info['data'] - page_type = page_data['type'] - if page_type in ('episode', 'embed'): - show_data = page_data['show']['data'] - page_id = compat_str(page_data['id']) - video_id = compat_str(page_data['video']['data']['id']) - - preference = qualities(['mini', 'small', 'medium', 'large']) - thumbnails = [{ - 'url': image_url, - 'id': image_id, - 'preference': preference(image_id) - } for image_id, image_url in page_data.get('images', {}).items()] - - info = { - 'id': page_id, - 'display_id': display_id, - 'title': unescapeHTML(page_data['name']), - 'description': unescapeHTML(page_data.get('summary')), - 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), - 'author': page_data.get('author'), - 'uploader': show_data.get('name'), - 'uploader_id': show_data.get('slug'), - 'thumbnails': thumbnails, - 'extractor_key': site, - } - - if page_type == 'embed': - info.update({ - '_type': 'url_transparent', - 'url': page_data['video']['data']['embed'], - }) - return info - - info.update({ - '_type': 'url_transparent', - 'url': 'revision3:%s' % video_id, - }) - return info - else: - list_data = page_info[page_type]['data'] - episodes_data = page_info['episodes']['data'] - num_episodes = page_info['meta']['totalEpisodes'] - processed_episodes = 0 - entries = [] - page_num = 1 - while True: - entries.extend([{ - 
'_type': 'url', - 'url': 'http://%s%s' % (domain, episode['path']), - 'id': compat_str(episode['id']), - 'ie_key': 'Revision3', - 'extractor_key': site, - } for episode in episodes_data]) - processed_episodes += len(episodes_data) - if processed_episodes == num_episodes: - break - page_num += 1 - episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % ( - domain, display_id + '/' + compat_str(page_num), domain), - display_id)['episodes']['data'] - - return self.playlist_result( - entries, compat_str(list_data['id']), - list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py index 3b9c65e7e..7872dc80d 100644 --- a/youtube_dl/extractor/seeker.py +++ b/youtube_dl/extractor/seeker.py @@ -4,34 +4,37 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) class SeekerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' _TESTS = [{ - # player.loadRevision3Item 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', - 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'md5': '897d44bbe0d8986a2ead96de565a92db', 'info_dict': { - 'id': '76243', - 'ext': 'webm', + 'id': 'Elrn3gnY', + 'ext': 'mp4', 'title': 'Should Trump Be Required To Release His Tax Returns?', - 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. 
So what can we learn if he decides to release them?', - 'uploader': 'Seeker Daily', - 'uploader_id': 'seekerdaily', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', } }, { 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', 'playlist': [ { - 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'md5': '0497b9f20495174be73ae136949707d2', 'info_dict': { - 'id': '67558', + 'id': 'FihYQ8AE', 'ext': 'mp4', 'title': 'The Pros & Cons Of Zoos', - 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', - 'uploader': 'DNews', - 'uploader_id': 'dnews', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', }, } ], @@ -45,13 +48,11 @@ class SeekerIE(InfoExtractor): def _real_extract(self, url): display_id, article_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) - if mobj: - playlist_type, playlist_id = mobj.groups() - return self.url_result( - 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) - else: - entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( - r']+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] - return self.playlist_result( - entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage)) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or 
self._og_search_description(webpage)) From 20baa17c0180c7254644abea968792abcf0743cb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 16:00:12 +0100 Subject: [PATCH 0178/1705] [daisuki] remove extractor --- youtube_dl/extractor/daisuki.py | 154 ----------------------------- youtube_dl/extractor/extractors.py | 4 - 2 files changed, 158 deletions(-) delete mode 100644 youtube_dl/extractor/daisuki.py diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py deleted file mode 100644 index dbc1aa5d4..000000000 --- a/youtube_dl/extractor/daisuki.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import unicode_literals - -import base64 -import json -import random -import re - -from .common import InfoExtractor -from ..aes import ( - aes_cbc_decrypt, - aes_cbc_encrypt, -) -from ..compat import compat_b64decode -from ..utils import ( - bytes_to_intlist, - bytes_to_long, - extract_attributes, - ExtractorError, - intlist_to_bytes, - js_to_json, - int_or_none, - long_to_bytes, - pkcs1pad, -) - - -class DaisukiMottoIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P[0-9a-zA-Z]{3})' - - _TEST = { - 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428', - 'info_dict': { - 'id': 'V2e', - 'ext': 'mp4', - 'title': '#117 SHOWDOWN OF LOVE! 
ANDROIDS VS UNIVERSE 2!!', - 'subtitles': { - 'mul': [{ - 'ext': 'ttml', - }], - }, - }, - 'params': { - 'skip_download': True, # AES-encrypted HLS stream - }, - } - - # The public key in PEM format can be found in clientlibs_anime_watch.min.js - _RSA_KEY = (0xc5524c25e8e14b366b3754940beeb6f96cb7e2feef0b932c7659a0c5c3bf173d602464c2df73d693b513ae06ff1be8f367529ab30bf969c5640522181f2a0c51ea546ae120d3d8d908595e4eff765b389cde080a1ef7f1bbfb07411cc568db73b7f521cedf270cbfbe0ddbc29b1ac9d0f2d8f4359098caffee6d07915020077d, 65537) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - flashvars = self._parse_json(self._search_regex( - r'(?s)var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), - video_id, transform_source=js_to_json) - - iv = [0] * 16 - - data = {} - for key in ('device_cd', 'mv_id', 'ss1_prm', 'ss2_prm', 'ss3_prm', 'ss_id'): - data[key] = flashvars.get(key, '') - - encrypted_rtn = None - - # Some AES keys are rejected. 
Try it with different AES keys - for idx in range(5): - aes_key = [random.randint(0, 254) for _ in range(32)] - padded_aeskey = intlist_to_bytes(pkcs1pad(aes_key, 128)) - - n, e = self._RSA_KEY - encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) - init_data = self._download_json( - 'http://motto.daisuki.net/fastAPI/bgn/init/', - video_id, query={ - 's': flashvars.get('s', ''), - 'c': flashvars.get('ss3_prm', ''), - 'e': url, - 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( - bytes_to_intlist(json.dumps(data)), - aes_key, iv))).decode('ascii'), - 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), - }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) - - if 'rtn' in init_data: - encrypted_rtn = init_data['rtn'] - break - - self._sleep(5, video_id) - - if encrypted_rtn is None: - raise ExtractorError('Failed to fetch init data') - - rtn = self._parse_json( - intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( - compat_b64decode(encrypted_rtn)), - aes_key, iv)).decode('utf-8').rstrip('\0'), - video_id) - - title = rtn['title_str'] - - formats = self._extract_m3u8_formats( - rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') - - subtitles = {} - caption_url = rtn.get('caption_url') - if caption_url: - # mul: multiple languages - subtitles['mul'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - } - - -class DaisukiMottoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://motto\.daisuki\.net/(?Pinformation)/' - - _TEST = { - 'url': 'http://motto.daisuki.net/information/', - 'info_dict': { - 'title': 'DRAGON BALL SUPER', - }, - 'playlist_mincount': 117, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [] - for li in re.findall(r'(]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage): - attr = 
extract_attributes(li) - ad_id = attr.get('data-ad_id') - product_id = attr.get('data-product_id') - if ad_id and product_id: - episode_id = attr.get('data-chapter') - entries.append({ - '_type': 'url_transparent', - 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id), - 'episode_id': episode_id, - 'episode_number': int_or_none(episode_id), - 'ie_key': 'DaisukiMotto', - }) - - return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8df9d95b1..e2ebe8f95 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -254,10 +254,6 @@ from .dailymotion import ( DailymotionPlaylistIE, DailymotionUserIE, ) -from .daisuki import ( - DaisukiMottoIE, - DaisukiMottoPlaylistIE, -) from .daum import ( DaumIE, DaumClipIE, From 88b87b08b1ed06940053ee018547de051bf8d986 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:01:21 +0100 Subject: [PATCH 0179/1705] [minhateca] remove extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/minhateca.py | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/minhateca.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e2ebe8f95..dfd0ef198 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -625,7 +625,6 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) -from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py deleted file mode 100644 index dccc54249..000000000 --- a/youtube_dl/extractor/minhateca.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - 
-from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - parse_filesize, - sanitized_Request, - urlencode_postdata, -) - - -class MinhatecaIE(InfoExtractor): - _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P[0-9]+)\.' - _TEST = { - 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)', - 'info_dict': { - 'id': '125848331', - 'ext': 'mp4', - 'title': 'youtube-dl test video', - 'thumbnail': r're:^https?://.*\.jpg$', - 'filesize_approx': 1530000, - 'duration': 9, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - token = self._html_search_regex( - r'(.*?)', webpage, 'title') - title, _, ext = title_str.rpartition('.') - filesize_approx = parse_filesize(self._html_search_regex( - r'

    (.*?)

    ', - webpage, 'file size approximation', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)

    .*?class="bold">(.*?)<', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'

    ([0-9]+)

    ', - webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'ext': ext, - 'filesize_approx': filesize_approx, - 'duration': duration, - 'view_count': view_count, - 'thumbnail': self._og_search_thumbnail(webpage), - } From 9e46d1f8aadd38f6de7c2b921b294e67ed2267eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 9 Nov 2019 17:15:15 +0100 Subject: [PATCH 0180/1705] [addanime] remove extractor --- youtube_dl/extractor/addanime.py | 95 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 96 deletions(-) delete mode 100644 youtube_dl/extractor/addanime.py diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py deleted file mode 100644 index 5e7c0724e..000000000 --- a/youtube_dl/extractor/addanime.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, -) -from ..utils import ( - ExtractorError, - qualities, -) - - -class AddAnimeIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' - _TESTS = [{ - 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - 'md5': '72954ea10bc979ab5e2eb288b21425a0', - 'info_dict': { - 'id': '24MR3YO5SAS9', - 'ext': 'mp4', - 'description': 'One Piece 606', - 'title': 'One Piece 606', - }, - 'skip': 'Video is gone', - }, { - 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage(url, video_id) - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) or \ - ee.cause.code != 503: - raise - - redir_webpage = ee.cause.read().decode('utf-8') - action = self._search_regex( - r'
    ', - redir_webpage, 'redirect vc value') - av = re.search( - r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', - redir_webpage) - if av is None: - raise ExtractorError('Cannot find redirect math task') - av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) - - parsed_url = compat_urllib_parse_urlparse(url) - av_val = av_res + len(parsed_url.netloc) - confirm_url = ( - parsed_url.scheme + '://' + parsed_url.netloc - + action + '?' - + compat_urllib_parse_urlencode({ - 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) - self._download_webpage( - confirm_url, video_id, - note='Confirming after redirect') - webpage = self._download_webpage(url, video_id) - - FORMATS = ('normal', 'hq') - quality = qualities(FORMATS) - formats = [] - for format_id in FORMATS: - rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) - video_url = self._search_regex(rex, webpage, 'video file URLx', - fatal=False) - if not video_url: - continue - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'quality': quality(format_id), - }) - self._sort_formats(formats) - video_title = self._og_search_title(webpage) - video_description = self._og_search_description(webpage) - - return { - '_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': video_title, - 'description': video_description - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dfd0ef198..d96f0d284 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -18,7 +18,6 @@ from .acast import ( ACastIE, ACastChannelIE, ) -from .addanime import AddAnimeIE from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( From 433e0710585e2414697cff6d444204e1db950bd7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 10 Nov 2019 17:02:47 +0100 Subject: [PATCH 0181/1705] [facebook] fix posts video data extraction(closes #22473) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index c723726b7..ce64e2683 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -334,7 +334,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json( self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) From 2e9ad59a4d6dfd82b34a965cfc5b8c5a647d1598 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Nov 2019 09:53:04 +0100 Subject: [PATCH 0182/1705] [soundcloud] check if the soundtrack has downloads left(closes #23045) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 875b9d887..e8ffb2cbe 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -276,7 +276,7 @@ class SoundcloudIE(InfoExtractor): if secret_token: query['secret_token'] = secret_token - if info.get('downloadable'): + if info.get('downloadable') and info.get('has_downloads_left'): format_url = update_url_query( info.get('download_url') or track_base_url + '/download', query) format_urls.add(format_url) From 48970d5cc8838ac404a64462d175b248401e2bd2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 12 Nov 2019 10:51:54 +0100 Subject: [PATCH 0183/1705] [teamcoco] add support for new videos(closes #23054) --- youtube_dl/extractor/teamcoco.py | 68 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git 
a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7640cf00a..5793b711f 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -84,6 +84,19 @@ class TeamcocoIE(TurnerBaseIE): 'only_matching': True, } ] + _RECORD_TEMPL = '''id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + turnerMediaId + turnerMediaAuthToken''' def _graphql_call(self, query_template, object_type, object_id): find_object = 'find' + object_type @@ -98,36 +111,36 @@ class TeamcocoIE(TurnerBaseIE): display_id = self._match_id(url) response = self._graphql_call('''{ - %s(slug: "%s") { + %%s(slug: "%%s") { ... on RecordSlug { record { + %s + } + } + ... on PageSlug { + child { id - title - teaser - publishOn - thumb { - preview - } - file { - url - } - tags { - name - } - duration - turnerMediaId - turnerMediaAuthToken } } ... on NotFoundSlug { status } } -}''', 'Slug', display_id) +}''' % self._RECORD_TEMPL, 'Slug', display_id) if response.get('status'): raise ExtractorError('This video is no longer available.', expected=True) - record = response['record'] + child = response.get('child') + if child: + record = self._graphql_call('''{ + %%s(id: "%%s") { + ... 
on Video { + %s + } + } +}''' % self._RECORD_TEMPL, 'Record', child['id']) + else: + record = response['record'] video_id = record['id'] info = { @@ -150,25 +163,21 @@ class TeamcocoIE(TurnerBaseIE): 'accessTokenType': 'jws', })) else: - d = self._download_json( + video_sources = self._download_json( 'https://teamcoco.com/_truman/d/' + video_id, - video_id, fatal=False) or {} - video_sources = d.get('meta') or {} - if not video_sources: - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} + video_id)['meta']['src'] + if isinstance(video_sources, dict): + video_sources = video_sources.values() formats = [] get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): + for src in video_sources: if not isinstance(src, dict): continue src_url = src.get('src') if not src_url: continue + format_id = src.get('label') ext = determine_ext(src_url, mimetype2ext(src.get('type'))) if format_id == 'hls' or ext == 'm3u8': # compat_urllib_parse.urljoin does not work here @@ -190,9 +199,6 @@ class TeamcocoIE(TurnerBaseIE): 'format_id': format_id, 'quality': get_quality(format_id), }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) self._sort_formats(formats) info['formats'] = formats From eb22d1b55744b69d5ec3556529868acfba6c217f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 13 Nov 2019 19:09:32 +0100 Subject: [PATCH 0184/1705] [nexx] Add support for Multi Player JS Setup(closes #23052) --- youtube_dl/extractor/nexx.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index f9aad83c4..586c1b7eb 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -108,7 +108,7 @@ class NexxIE(InfoExtractor): @staticmethod def _extract_domain_id(webpage): mobj = re.search( - 
r']+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P\d+)', + r']+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P\d+)', webpage) return mobj.group('id') if mobj else None @@ -123,7 +123,7 @@ class NexxIE(InfoExtractor): domain_id = NexxIE._extract_domain_id(webpage) if domain_id: for video_id in re.findall( - r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', + r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', webpage): entries.append( 'https://api.nexx.cloud/v3/%s/videos/byid/%s' @@ -410,8 +410,8 @@ class NexxIE(InfoExtractor): class NexxEmbedIE(InfoExtractor): - _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P[^/?#&]+)' + _TESTS = [{ 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'md5': '16746bfc28c42049492385c989b26c4a', 'info_dict': { @@ -420,7 +420,6 @@ class NexxEmbedIE(InfoExtractor): 'title': 'Nervenkitzel Achterbahn', 'alt_title': 'Karussellbauer in Deutschland', 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, 'creator': 'SPIEGEL TV', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2761, @@ -431,7 +430,10 @@ class NexxEmbedIE(InfoExtractor): 'format': 'bestvideo', 'skip_download': True, }, - } + }, { + 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): From 5709d661a2509fab0c9f3412239ecbe7a621f45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Nov 2019 01:45:04 +0700 Subject: [PATCH 0185/1705] [drtv] Add support for new URL schema (closes #23059) --- youtube_dl/extractor/drtv.py | 57 ++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 218f10209..390e79f8c 100644 --- 
a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -17,6 +17,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_get, unified_timestamp, update_url_query, url_or_none, @@ -24,7 +25,14 @@ from ..utils import ( class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + ) + (?P[\da-z_-]+) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['DK'] IE_NAME = 'drtv' @@ -83,6 +91,26 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', + 'info_dict': { + 'id': '00951930010', + 'ext': 'mp4', + 'title': 'Bonderøven (1:8)', + 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', + 'timestamp': 1546542000, + 'upload_date': '20190103', + 'duration': 2576.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', + 'only_matching': True, + }, { + 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,13 +128,32 @@ class DRTVIE(InfoExtractor): webpage, 'video id', default=None) if not video_id: - video_id = compat_urllib_parse_unquote(self._search_regex( + video_id = self._search_regex( r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', - webpage, 'urn')) + webpage, 'urn', default=None) + if video_id: + video_id = compat_urllib_parse_unquote(video_id) + + _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' + query = {'expanded': 'true'} + + if video_id: + programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) + else: + programcard_url = _PROGRAMCARD_BASE + page = 
self._parse_json( + self._search_regex( + r'data\s*=\s*({.+?})\s*(?:;| Date: Thu, 14 Nov 2019 06:38:55 +0100 Subject: [PATCH 0186/1705] [comcarcoff] remove extractor --- youtube_dl/extractor/comcarcoff.py | 74 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 75 deletions(-) delete mode 100644 youtube_dl/extractor/comcarcoff.py diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py deleted file mode 100644 index 588aad0d9..000000000 --- a/youtube_dl/extractor/comcarcoff.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, -) - - -class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' - _TESTS = [{ - 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', - 'info_dict': { - 'id': '2494164', - 'ext': 'mp4', - 'upload_date': '20141127', - 'timestamp': 1417107600, - 'duration': 1232, - 'title': 'Happy Thanksgiving Miranda', - 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', - }, - 'params': { - 'skip_download': 'requires ffmpeg', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - if not display_id: - display_id = 'comediansincarsgettingcoffee.com' - webpage = self._download_webpage(url, display_id) - - full_data = self._parse_json( - self._search_regex( - r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), - display_id)['videoData'] - - display_id = full_data['activeVideo']['video'] - video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] - - video_id = compat_str(video_data['mediaId']) - title = video_data['title'] - formats = 
self._extract_m3u8_formats( - video_data['mediaUrl'], video_id, 'mp4') - self._sort_formats(formats) - - thumbnails = [{ - 'url': video_data['images']['thumb'], - }, { - 'url': video_data['images']['poster'], - }] - - timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( - video_data.get('pubDate')) - duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( - video_data.get('duration')) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': timestamp, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episode')), - 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d96f0d284..cf4bb8f20 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -222,7 +222,6 @@ from .comedycentral import ( ComedyCentralTVIE, ToshIE, ) -from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( MmsIE, From 656c20010f53851c1b01e839744f7fe48497c03f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Nov 2019 21:17:47 +0100 Subject: [PATCH 0187/1705] [ivi] fix format extraction(closes #21991) --- youtube_dl/extractor/ivi.py | 56 ++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 86c014b07..efdc3cc98 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -18,6 +18,8 @@ class IviIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] + 
_LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' + _LIGHT_URL = 'https://api.ivi.ru/light/' _TESTS = [ # Single movie @@ -78,25 +80,41 @@ class IviIE(InfoExtractor): 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + except ImportError: + raise ExtractorError('pycrypto not found. Please install it.', expected=True) + video_id = self._match_id(url) - data = { + timestamp = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode())['result'] + + data = json.dumps({ 'method': 'da.content.get', 'params': [ video_id, { - 'site': 's183', + 'site': 's353', 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, 'contentid': video_id } ] - } + }).encode() video_json = self._download_json( - 'http://api.digitalaccess.ru/api/json/', video_id, - 'Downloading video JSON', data=json.dumps(data)) + self._LIGHT_URL, video_id, + 'Downloading video JSON', data=data, query={ + 'ts': timestamp, + 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), + }) - if 'error' in video_json: - error = video_json['error'] + error = video_json.get('error') + if error: origin = error['origin'] if origin == 'NotAllowedForLocation': self.raise_geo_restricted( @@ -108,20 +126,24 @@ class IviIE(InfoExtractor): expected=True) result = video_json['result'] + title = result['title'] quality = qualities(self._KNOWN_FORMATS) - formats = [{ - 'url': x['url'], - 'format_id': x.get('content_format'), - 'quality': quality(x.get('content_format')), - } for x in result['files'] if x.get('url')] - + formats = [] + for f in result.get('files', []): + f_url = f.get('url') + content_format = f.get('content_format') + if not f_url or '-MDRM-' in content_format or '-FPS-' in content_format: + continue + formats.append({ + 'url': f_url, + 'format_id': content_format, 
+ 'quality': quality(content_format), + 'filesize': int_or_none(f.get('size_in_bytes')), + }) self._sort_formats(formats) - title = result['title'] - - duration = int_or_none(result.get('duration')) compilation = result.get('compilation') episode = title if compilation else None @@ -158,7 +180,7 @@ class IviIE(InfoExtractor): 'episode_number': episode_number, 'thumbnails': thumbnails, 'description': description, - 'duration': duration, + 'duration': int_or_none(result.get('duration')), 'formats': formats, } From 1bba88efc7e1f82095f7ae38348e56026db4bf3c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Nov 2019 23:46:31 +0100 Subject: [PATCH 0188/1705] [ivi] sign content request only when pycryptodome is available --- youtube_dl/extractor/ivi.py | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index efdc3cc98..1dcb17c9b 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -80,38 +80,42 @@ class IviIE(InfoExtractor): 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): - try: - from Crypto.Cipher import Blowfish - from Crypto.Hash import CMAC - except ImportError: - raise ExtractorError('pycrypto not found. 
Please install it.', expected=True) - video_id = self._match_id(url) - timestamp = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading timestamp JSON', data=json.dumps({ - 'method': 'da.timestamp.get', - 'params': [] - }).encode())['result'] - data = json.dumps({ 'method': 'da.content.get', 'params': [ video_id, { - 'site': 's353', + 'site': 's%d', 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, 'contentid': video_id } ] }).encode() - video_json = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading video JSON', data=data, query={ + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + + timestamp = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode())['result'] + + data = data % 353 + query = { 'ts': timestamp, 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), - }) + } + except ImportError: + data = data % 183 + query = {} + + video_json = self._download_json( + self._LIGHT_URL, video_id, + 'Downloading video JSON', data=data, query=query) error = video_json.get('error') if error: @@ -121,6 +125,8 @@ class IviIE(InfoExtractor): msg=error['message'], countries=self._GEO_COUNTRIES) elif origin == 'NoRedisValidData': raise ExtractorError('Video %s does not exist' % video_id, expected=True) + elif origin == 'NotAllowedError': + raise ExtractorError('pycryptodome not found. 
Please install it.', expected=True) raise ExtractorError( 'Unable to download video %s: %s' % (video_id, error['message']), expected=True) From 7360c06facfd96ee603ad4fc27f5903d3f8f6694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Nov 2019 05:44:14 +0700 Subject: [PATCH 0189/1705] [extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use --- youtube_dl/extractor/common.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4c2f9303e..04d676378 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1455,14 +1455,14 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): + fatal=True, m3u8_id=None, data=None, headers={}, query={}): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if manifest is False: return [] @@ -1586,12 +1586,13 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, headers={}): + fatal=True, live=False, data=None, headers={}, + query={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal, headers=headers) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] @@ -2009,12 +2010,12 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, headers={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal, headers=headers) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] mpd_doc, urlh = res @@ -2317,12 +2318,12 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', - 
fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] ism_doc, urlh = res From 6c79785bb0c96d6fc22d942946196f0842d70a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Nov 2019 07:47:23 +0700 Subject: [PATCH 0190/1705] [travis] Add python 3.8 build --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index 6d16c2955..14d95fa84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,12 @@ matrix: - python: 3.7 dist: xenial env: YTDL_TEST_SET=download + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8 + dist: xenial + env: YTDL_TEST_SET=download - python: 3.8-dev dist: xenial env: YTDL_TEST_SET=core From 9e4e864639bf606a1931a684f130e219e869adfd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 16 Nov 2019 01:51:31 +0100 Subject: [PATCH 0191/1705] [ivi] improve error detection --- youtube_dl/extractor/ivi.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 1dcb17c9b..7f1146d95 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -119,17 +119,20 @@ class IviIE(InfoExtractor): error = video_json.get('error') if error: - origin = error['origin'] + origin = error.get('origin') + message = error.get('message') or error.get('user_message') + extractor_msg = 'Unable to download video %s' if origin == 'NotAllowedForLocation': - self.raise_geo_restricted( - msg=error['message'], countries=self._GEO_COUNTRIES) + self.raise_geo_restricted(message, self._GEO_COUNTRIES) elif origin == 'NoRedisValidData': - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - elif origin == 'NotAllowedError': - raise ExtractorError('pycryptodome not found. 
Please install it.', expected=True) - raise ExtractorError( - 'Unable to download video %s: %s' % (video_id, error['message']), - expected=True) + extractor_msg = 'Video %s does not exist' + elif message: + if 'недоступен для просмотра на площадке s183' in message: + raise ExtractorError( + 'pycryptodome not found. Please install it.', + expected=True) + extractor_msg += ': ' + message + raise ExtractorError(extractor_msg % video_id, expected=True) result = video_json['result'] title = result['title'] From 7e70620a342c57746812d4a8fae6f436bd90cf57 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 18 Nov 2019 12:51:25 +0100 Subject: [PATCH 0192/1705] [vk] fix wall audio thumbnails extraction(closes #23135) --- youtube_dl/extractor/vk.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 4c8ca4f41..195875938 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -634,14 +634,15 @@ class VKWallPostIE(VKBaseIE): if not a.url: continue title = unescapeHTML(a.title) + performer = unescapeHTML(a.performer) entries.append({ 'id': '%s_%s' % (a.owner_id, a.id), 'url': self._unmask_url(a.url, a.ads['vk_id']), - 'title': '%s - %s' % (a.performer, title) if a.performer else title, - 'thumbnail': a.cover_url.split(',') if a.cover_url else None, - 'duration': a.duration, + 'title': '%s - %s' % (performer, title) if performer else title, + 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, + 'duration': int_or_none(a.duration), 'uploader': uploader, - 'artist': a.performer, + 'artist': performer, 'track': title, 'ext': 'mp4', 'protocol': 'm3u8', From f9c4a4521068a02c583803ea422c6fedfa7598e3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 18 Nov 2019 21:40:53 +0100 Subject: [PATCH 0193/1705] [ntvru] add support for non relative file URLs(closes #23140) --- youtube_dl/extractor/ntvru.py | 49 
+++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 4f9cedb84..c47d1dfa4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - clean_html, - xpath_text, int_or_none, + strip_or_none, + unescapeHTML, + xpath_text, ) @@ -47,10 +48,10 @@ class NTVRuIE(InfoExtractor): 'duration': 1496, }, }, { - 'url': 'http://www.ntv.ru/kino/Koma_film', - 'md5': 'f825770930937aa7e5aca0dc0d29319a', + 'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/', + 'md5': 'e9c7cde24d9d3eaed545911a04e6d4f4', 'info_dict': { - 'id': '1007609', + 'id': '1126480', 'ext': 'mp4', 'title': 'Остросюжетный фильм «Кома»', 'description': 'Остросюжетный фильм «Кома»', @@ -68,6 +69,10 @@ class NTVRuIE(InfoExtractor): 'thumbnail': r're:^http://.*\.jpg', 'duration': 2590, }, + }, { + # Schemeless file URL + 'url': 'https://www.ntv.ru/video/1797442', + 'only_matching': True, }] _VIDEO_ID_REGEXES = [ @@ -96,37 +101,31 @@ class NTVRuIE(InfoExtractor): 'http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML') - title = clean_html(xpath_text(player, './data/title', 'title', fatal=True)) - description = clean_html(xpath_text(player, './data/description', 'description')) + title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True))) video = player.find('./data/video') - video_id = xpath_text(video, './id', 'video id') - thumbnail = xpath_text(video, './splash', 'thumbnail') - duration = int_or_none(xpath_text(video, './totaltime', 'duration')) - view_count = int_or_none(xpath_text(video, './views', 'view count')) - - token = self._download_webpage( - 'http://stat.ntv.ru/services/access/token', - video_id, 'Downloading access token') formats = [] for format_id in ['', 'hi', 'webm']: - file_ = 
video.find('./%sfile' % format_id) - if file_ is None: + file_ = xpath_text(video, './%sfile' % format_id) + if not file_: continue - size = video.find('./%ssize' % format_id) + if file_.startswith('//'): + file_ = self._proto_relative_url(file_) + elif not file_.startswith('http'): + file_ = 'http://media.ntv.ru/vod/' + file_ formats.append({ - 'url': 'http://media2.ntv.ru/vod/%s&tok=%s' % (file_.text, token), - 'filesize': int_or_none(size.text if size is not None else None), + 'url': file_, + 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)), }) self._sort_formats(formats) return { - 'id': video_id, + 'id': xpath_text(video, './id'), 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, + 'description': strip_or_none(unescapeHTML(xpath_text(player, './data/description'))), + 'thumbnail': xpath_text(video, './splash'), + 'duration': int_or_none(xpath_text(video, './totaltime')), + 'view_count': int_or_none(xpath_text(video, './views')), 'formats': formats, } From 76d9eca43dd4fd7698d138b90ab6b2dd159559e0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Nov 2019 20:16:31 +0100 Subject: [PATCH 0194/1705] [ivi] fallback to old extraction method for unknown error codes --- youtube_dl/extractor/ivi.py | 79 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 7f1146d95..0db023622 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -93,46 +93,57 @@ class IviIE(InfoExtractor): ] }).encode() - try: - from Crypto.Cipher import Blowfish - from Crypto.Hash import CMAC + for site in (353, 183): + content_data = data % site + if site == 353: + try: + from Cryptodome.Cipher import Blowfish + from Cryptodome.Hash import CMAC + pycryptodomex_found = True + except ImportError: + pycryptodomex_found = False + continue - timestamp = self._download_json( 
+ timestamp = (self._download_json( + self._LIGHT_URL, video_id, + 'Downloading timestamp JSON', data=json.dumps({ + 'method': 'da.timestamp.get', + 'params': [] + }).encode(), fatal=False) or {}).get('result') + if not timestamp: + continue + + query = { + 'ts': timestamp, + 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(), + } + else: + query = {} + + video_json = self._download_json( self._LIGHT_URL, video_id, - 'Downloading timestamp JSON', data=json.dumps({ - 'method': 'da.timestamp.get', - 'params': [] - }).encode())['result'] + 'Downloading video JSON', data=content_data, query=query) - data = data % 353 - query = { - 'ts': timestamp, - 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + data, Blowfish).hexdigest(), - } - except ImportError: - data = data % 183 - query = {} - - video_json = self._download_json( - self._LIGHT_URL, video_id, - 'Downloading video JSON', data=data, query=query) - - error = video_json.get('error') - if error: - origin = error.get('origin') - message = error.get('message') or error.get('user_message') - extractor_msg = 'Unable to download video %s' - if origin == 'NotAllowedForLocation': - self.raise_geo_restricted(message, self._GEO_COUNTRIES) - elif origin == 'NoRedisValidData': - extractor_msg = 'Video %s does not exist' - elif message: - if 'недоступен для просмотра на площадке s183' in message: + error = video_json.get('error') + if error: + origin = error.get('origin') + message = error.get('message') or error.get('user_message') + extractor_msg = 'Unable to download video %s' + if origin == 'NotAllowedForLocation': + self.raise_geo_restricted(message, self._GEO_COUNTRIES) + elif origin == 'NoRedisValidData': + extractor_msg = 'Video %s does not exist' + elif site == 353: + continue + elif not pycryptodomex_found: raise ExtractorError( 'pycryptodome not found. 
Please install it.', expected=True) - extractor_msg += ': ' + message - raise ExtractorError(extractor_msg % video_id, expected=True) + elif message: + extractor_msg += ': ' + message + raise ExtractorError(extractor_msg % video_id, expected=True) + else: + break result = video_json['result'] title = result['title'] From f0f6a7e73f55b6227c40af17c6fcab44b5a2df79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:21:03 +0700 Subject: [PATCH 0195/1705] [chaturbate] Fix extraction (closes #23010, closes #23012) --- youtube_dl/extractor/chaturbate.py | 42 +++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 656e715ae..a459dcb8d 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + lowercase_escape, + url_or_none, +) class ChaturbateIE(InfoExtractor): @@ -38,12 +42,31 @@ class ChaturbateIE(InfoExtractor): 'https://chaturbate.com/%s/' % video_id, video_id, headers=self.geo_verification_headers()) - m3u8_urls = [] + found_m3u8_urls = [] - for m in re.finditer( - r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): - m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group( - 'url').replace('_fast', '') + data = self._parse_json( + self._search_regex( + r'initialRoomDossier\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data', default='{}', group='value'), + video_id, transform_source=lowercase_escape, fatal=False) + if data: + m3u8_url = url_or_none(data.get('hls_source')) + if m3u8_url: + found_m3u8_urls.append(m3u8_url) + + if not found_m3u8_urls: + for m in re.finditer( + r'(\\u002[27])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(lowercase_escape(m.group('url'))) + + if not found_m3u8_urls: + for m 
in re.finditer( + r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(m.group('url')) + + m3u8_urls = [] + for found_m3u8_url in found_m3u8_urls: + m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '') for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): if m3u8_url not in m3u8_urls: m3u8_urls.append(m3u8_url) @@ -63,7 +86,12 @@ class ChaturbateIE(InfoExtractor): formats = [] for m3u8_url in m3u8_urls: - m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow' + for known_id in ('fast', 'slow'): + if '_%s' % known_id in m3u8_url: + m3u8_id = known_id + break + else: + m3u8_id = None formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', # ffmpeg skips segments for fast m3u8 From 25d3f770e6ef518a4230ad41bd4ea69dd2e851af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:22:59 +0700 Subject: [PATCH 0196/1705] [ivi] Ask for pycryptodomex instead of pycryptodome See discussion at https://github.com/ytdl-org/youtube-dl/commit/1bba88efc7e1f82095f7ae38348e56026db4bf3c#r35982110 --- youtube_dl/extractor/ivi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 0db023622..52b53bfeb 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -137,7 +137,7 @@ class IviIE(InfoExtractor): continue elif not pycryptodomex_found: raise ExtractorError( - 'pycryptodome not found. Please install it.', + 'pycryptodomex not found. 
Please install it.', expected=True) elif message: extractor_msg += ': ' + message From f8015c15746e83394ecc395c6a13823d20971772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Nov 2019 23:38:39 +0700 Subject: [PATCH 0197/1705] [ivi] Fix python 3.4 support --- youtube_dl/extractor/ivi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 52b53bfeb..315ea03fa 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -91,10 +91,10 @@ class IviIE(InfoExtractor): 'contentid': video_id } ] - }).encode() + }) for site in (353, 183): - content_data = data % site + content_data = (data % site).encode() if site == 353: try: from Cryptodome.Cipher import Blowfish From 80a51fc2ef3ebb7d3e3d5fd0b6e9942bb4be6f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:10:24 +0700 Subject: [PATCH 0198/1705] [ivi] Skip s353 for bundled exe See https://github.com/Legrandin/pycryptodome/issues/228 --- youtube_dl/extractor/ivi.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 315ea03fa..a502e8806 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json +import re +import sys from .common import InfoExtractor from ..utils import ( @@ -93,9 +94,13 @@ class IviIE(InfoExtractor): ] }) + bundled = hasattr(sys, 'frozen') + for site in (353, 183): content_data = (data % site).encode() if site == 353: + if bundled: + continue try: from Cryptodome.Cipher import Blowfish from Cryptodome.Hash import CMAC @@ -135,6 +140,10 @@ class IviIE(InfoExtractor): extractor_msg = 'Video %s does not exist' elif site == 353: continue + elif bundled: + raise ExtractorError( + 'This feature does not work from bundled exe. 
Run youtube-dl from sources.', + expected=True) elif not pycryptodomex_found: raise ExtractorError( 'pycryptodomex not found. Please install it.', From fb8dfc5a2772ca35dd65bad7b7565ad6ec1ad4dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:21:00 +0700 Subject: [PATCH 0199/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ChangeLog b/ChangeLog index d46d20082..acee2a75a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,49 @@ +version + +Core ++ [extractor/common] Clean jwplayer description HTML tags ++ [extractor/common] Add data, headers and query to all major extract formats + methods + +Extractors +* [chaturbate] Fix extraction (#23010, #23012) ++ [ntvru] Add support for non relative file URLs (#23140) +* [vk] Fix wall audio thumbnails extraction (#23135) +* [ivi] Fix format extraction (#21991) +- [comcarcoff] Remove extractor ++ [drtv] Add support for new URL schema (#23059) ++ [nexx] Add support for Multi Player JS Setup (#23052) ++ [teamcoco] Add support for new videos (#23054) +* [soundcloud] Check if the soundtrack has downloads left (#23045) +* [facebook] Fix posts video data extraction (#22473) +- [addanime] Remove extractor +- [minhateca] Remove extractor +- [daisuki] Remove extractor +* [seeker] Fix extraction +- [revision3] Remove extractors +* [twitch] Fix video comments URL (#18593, #15828) +* [twitter] Improve extraction + + Add support for generic embeds (#22168) + * Always extract http formats for native videos (#14934) + + Add support for Twitter Broadcasts (#21369) + + Extract more metadata + * Improve VMap format extraction + * Unify extraction code for both twitter statuses and cards ++ [twitch] Add support for Clip embed URLs +* [lnkgo] Fix extraction (#16834) +* [mixcloud] Improve extraction + * Improve metadata extraction (#11721) + * Fix playlist extraction (#22378) + * Fix user mixes extraction 
(#15197, #17865) ++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) +* [onionstudios] Fix extraction ++ [hotstar] Pass Referer header to format requests (#22836) +* [dplay] Minimize response size ++ [patreon] Extract uploader_id and filesize +* [patreon] Minimize response size +* [roosterteeth] Fix login request (#16094, #22689) + + version 2019.11.05 Extractors From 0de9fd24dc8723c78a90cb546e4a05818304521e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 Nov 2019 01:24:27 +0700 Subject: [PATCH 0200/1705] release 2019.11.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 10 ++-------- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 22 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 12de9add2..d3e11cdcf 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + 
[debug] youtube-dl version 2019.11.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8a6202cf6..51bf4db3b 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 83f91d5fe..19025ff25 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index be8e70f1e..a381b6979 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've checked that all provided URLs are alive 
and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.05 + [debug] youtube-dl version 2019.11.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 7544d171c..9c945d5ec 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.05** +- [ ] I've verified that I'm running youtube-dl version **2019.11.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index acee2a75a..daaff3eef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.11.22 Core + [extractor/common] Clean jwplayer description HTML tags diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 536b87479..3dcb026c5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -26,7 +26,6 @@ - **AcademicEarth:Course** - **acast** - **acast:channel** - - **AddAnime** - **ADN**: Anime Digital Network - **AdobeConnect** - **AdobeTV** @@ -175,7 +174,6 @@ - **CNN** - **CNNArticle** - **CNNBlogs** - - **ComCarCoff** - **ComedyCentral** - **ComedyCentralFullEpisodes** - **ComedyCentralShortname** @@ -203,8 +201,6 @@ - **dailymotion** - 
**dailymotion:playlist** - **dailymotion:user** - - **DaisukiMotto** - - **DaisukiMottoPlaylist** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -404,6 +400,7 @@ - **Ketnet** - **KhanAcademy** - **KickStarter** + - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** - **kontrtube**: KontrTube.ru - Труба зовёт @@ -485,14 +482,12 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** - - **Minhateca** - **MinistryGrid** - **Minoto** - **miomio.tv** - **MiTele**: mitele.es - **mixcloud** - **mixcloud:playlist** - - **mixcloud:stream** - **mixcloud:user** - **Mixer:live** - **Mixer:vod** @@ -723,8 +718,6 @@ - **Restudy** - **Reuters** - **ReverbNation** - - **revision** - - **revision3:embed** - **RICE** - **RMCDecouverte** - **RockstarGames** @@ -958,6 +951,7 @@ - **twitch:vod** - **twitter** - **twitter:amplify** + - **twitter:broadcast** - **twitter:card** - **udemy** - **udemy:course** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8012a66db..361809681 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.11.05' +__version__ = '2019.11.22' From cf3c9eafad5e6b83788e15a605aa6804b1ab307c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 00:03:51 +0700 Subject: [PATCH 0201/1705] [soundcloud] Update client id (closes #23214) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e8ffb2cbe..988dec4fa 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -245,7 +245,7 @@ class SoundcloudIE(InfoExtractor): _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' - _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' + _CLIENT_ID = 'UW9ajvMgVdMMW3cdeBi8lPfN6dvOVGji' _IMAGE_REPL_RE = 
r'-([0-9a-z]+)\.jpg' _ARTWORK_MAP = { From 9d30c2132acf2d12bfa8e559987c341c76d9cd24 Mon Sep 17 00:00:00 2001 From: InfernalUnderling <42065091+InfernalUnderling@users.noreply.github.com> Date: Tue, 26 Nov 2019 17:08:37 +0000 Subject: [PATCH 0202/1705] [utils] Handle rd-suffixed day parts in unified_strdate (#23199) --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 3920542bb..0db37d9d8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -340,6 +340,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') + self.assertEqual(unified_strdate('November 3rd, 2019'), '20191103') + self.assertEqual(unified_strdate('October 23rd, 2005'), '20051023') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index aed988b88..0d30075aa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1718,13 +1718,16 @@ DATE_FORMATS = ( '%B %d %Y', '%B %dst %Y', '%B %dnd %Y', + '%B %drd %Y', '%B %dth %Y', '%b %d %Y', '%b %dst %Y', '%b %dnd %Y', + '%b %drd %Y', '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', + '%b %drd %Y %I:%M', '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', From 6ddd4bf6ac04ae0b8ba39fb4124e844afc49b5a9 Mon Sep 17 00:00:00 2001 From: InfernalUnderling <42065091+InfernalUnderling@users.noreply.github.com> Date: Tue, 26 Nov 2019 17:20:39 +0000 Subject: [PATCH 0203/1705] [bitchute] Extract upload date (closes #22990) (#23193) --- youtube_dl/extractor/bitchute.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 430663fbf..0c773e66e 100644 --- a/youtube_dl/extractor/bitchute.py +++ 
b/youtube_dl/extractor/bitchute.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..utils import ( orderedSet, + unified_strdate, urlencode_postdata, ) @@ -23,6 +24,7 @@ class BitChuteIE(InfoExtractor): 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Victoria X Rave', + 'upload_date': '20170813', }, }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', @@ -74,12 +76,17 @@ class BitChuteIE(InfoExtractor): r'(?s)]+\bclass=["\']video-author[^>]+>(.+?)

    '), webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', + webpage, 'upload date', fatal=False)) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'upload_date': upload_date, 'formats': formats, } From 1ced222120c00854865c5b16e89838235ed549ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 02:26:42 +0700 Subject: [PATCH 0204/1705] [utils] Add generic caesar cipher and rot47 --- test/test_utils.py | 16 ++++++++++++++++ youtube_dl/utils.py | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 0db37d9d8..e83c8ea11 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,6 +19,7 @@ from youtube_dl.utils import ( age_restricted, args_to_str, encode_base_n, + caesar, clean_html, date_from_str, DateRange, @@ -69,6 +70,7 @@ from youtube_dl.utils import ( remove_start, remove_end, remove_quotes, + rot47, shell_quote, smuggle_url, str_to_int, @@ -1369,6 +1371,20 @@ Line 1 self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_caesar(self): + self.assertEqual(caesar('ace', 'abcdef', 2), 'cea') + self.assertEqual(caesar('cea', 'abcdef', -2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', -2), 'eac') + self.assertEqual(caesar('eac', 'abcdef', 2), 'ace') + self.assertEqual(caesar('ace', 'abcdef', 0), 'ace') + self.assertEqual(caesar('xyz', 'abcdef', 2), 'xyz') + self.assertEqual(caesar('abc', 'acegik', 2), 'ebg') + self.assertEqual(caesar('ebg', 'acegik', -2), 'abc') + + def test_rot47(self): + self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=') + self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{') + def test_urshift(self): self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) diff --git 
a/youtube_dl/utils.py b/youtube_dl/utils.py index 0d30075aa..b14603d8a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -5383,6 +5383,19 @@ def decode_packed_codes(code): obfucasted_code) +def caesar(s, alphabet, shift): + if shift == 0: + return s + l = len(alphabet) + return ''.join( + alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c + for c in s) + + +def rot47(s): + return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47) + + def parse_m3u8_attributes(attrib): info = {} for (key, val) in re.findall(r'(?P[A-Z0-9-]+)=(?P"[^"]+"|[^",]+)(?:,|$)', attrib): From edc2a1f68b267abc6b4c94991da4ad83fd8374bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 Nov 2019 02:28:06 +0700 Subject: [PATCH 0205/1705] [vivo] Fix extraction (closes #22328, closes #22279) --- youtube_dl/extractor/shared.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index ff575f592..02295d1a4 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,13 +1,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_b64decode +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote_plus, +) from ..utils import ( determine_ext, ExtractorError, int_or_none, + js_to_json, KNOWN_EXTENSIONS, parse_filesize, + rot47, url_or_none, urlencode_postdata, ) @@ -112,16 +117,22 @@ class VivoIE(SharedBaseIE): webpage, 'filesize', fatal=False)) def _extract_video_url(self, webpage, video_id, url): - def decode_url(encoded_url): + def decode_url_old(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') - stream_url = url_or_none(decode_url(self._search_regex( + stream_url = self._search_regex( r'data-stream\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'stream url', default=None, 
group='url'))) + 'stream url', default=None, group='url') + if stream_url: + stream_url = url_or_none(decode_url_old(stream_url)) if stream_url: return stream_url - return self._parse_json( + + def decode_url(encoded_url): + return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + + return decode_url(self._parse_json( self._search_regex( - r'InitializeStream\s*\(\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'stream', group='url'), - video_id, transform_source=decode_url)[0] + r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, + 'stream'), + video_id, transform_source=js_to_json)['source']) From df65a4a1ed3096b8210c097c77d00f0391f78503 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 21:53:51 +0100 Subject: [PATCH 0206/1705] [corus] improve extraction - add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com and disneylachaine.ca(closes #20861) - add support for self hosted videos(closes #22075) - detect DRM protection(closes #14910)(closes #9164) --- youtube_dl/extractor/corus.py | 169 ++++++++++++++++++++++------------ 1 file changed, 112 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py index a1b251804..e11aadf14 100644 --- a/youtube_dl/extractor/corus.py +++ b/youtube_dl/extractor/corus.py @@ -4,7 +4,12 @@ from __future__ import unicode_literals import re from .theplatform import ThePlatformFeedIE -from ..utils import int_or_none +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, +) class CorusIE(ThePlatformFeedIE): @@ -12,24 +17,49 @@ class CorusIE(ThePlatformFeedIE): https?:// (?:www\.)? 
(?P - (?:globaltv|etcanada)\.com| - (?:hgtv|foodnetwork|slice|history|showcase|bigbrothercanada)\.ca + (?: + globaltv| + etcanada| + seriesplus| + wnetwork| + ytv + )\.com| + (?: + hgtv| + foodnetwork| + slice| + history| + showcase| + bigbrothercanada| + abcspark| + disney(?:channel|lachaine) + )\.ca + ) + /(?:[^/]+/)* + (?: + video\.html\?.*?\bv=| + videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)? + ) + (?P + [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}| + (?:[A-Z]{4})?\d{12,20} ) - /(?:video/(?:[^/]+/)?|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=)) - (?P\d+) ''' _TESTS = [{ 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', - 'md5': '05dcbca777bf1e58c2acbb57168ad3a6', 'info_dict': { 'id': '870923331648', 'ext': 'mp4', 'title': 'Movie Night Popcorn with Bryan', 'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.', - 'uploader': 'SHWM-NEW', 'upload_date': '20170206', 'timestamp': 1486392197, }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], }, { 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', 'only_matching': True, @@ -48,58 +78,83 @@ class CorusIE(ThePlatformFeedIE): }, { 'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/', 'only_matching': True + }, { + 'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/', + 'only_matching': True + }, { + 'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/', + 'only_matching': True }] - - _TP_FEEDS = { - 'globaltv': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, - 'etcanada': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, - 'hgtv': { - 'feed_id': 'L0BMHXi2no43', - 'account_id': 2414428465, - 
}, - 'foodnetwork': { - 'feed_id': 'ukK8o58zbRmJ', - 'account_id': 2414429569, - }, - 'slice': { - 'feed_id': '5tUJLgV2YNJ5', - 'account_id': 2414427935, - }, - 'history': { - 'feed_id': 'tQFx_TyyEq4J', - 'account_id': 2369613659, - }, - 'showcase': { - 'feed_id': '9H6qyshBZU3E', - 'account_id': 2414426607, - }, - 'bigbrothercanada': { - 'feed_id': 'ChQqrem0lNUp', - 'account_id': 2269680845, - }, + _GEO_BYPASS = False + _SITE_MAP = { + 'globaltv': 'series', + 'etcanada': 'series', + 'foodnetwork': 'food', + 'bigbrothercanada': 'series', + 'disneychannel': 'disneyen', + 'disneylachaine': 'disneyfr', } def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() - feed_info = self._TP_FEEDS[domain.split('.')[0]] - return self._extract_feed_info('dtjsEC', feed_info['feed_id'], 'byId=' + video_id, video_id, lambda e: { - 'episode_number': int_or_none(e.get('pl1$episode')), - 'season_number': int_or_none(e.get('pl1$season')), - 'series': e.get('pl1$show'), - }, { - 'HLS': { - 'manifest': 'm3u', - }, - 'DesktopHLS Default': { - 'manifest': 'm3u', - }, - 'MP4 MBR': { - 'manifest': 'm3u', - }, - }, feed_info['account_id']) + site = domain.split('.')[0] + path = self._SITE_MAP.get(site, site) + if path != 'series': + path = 'migration/' + path + video = self._download_json( + 'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path, + video_id, query={'byId': video_id}, + headers={'Accept': 'application/json'})[0] + title = video['title'] + + formats = [] + for source in video.get('sources', []): + smil_url = source.get('file') + if not smil_url: + continue + source_type = source.get('type') + note = 'Downloading%s smil file' % (' ' + source_type if source_type else '') + resp = self._download_webpage( + smil_url, video_id, note, fatal=False, + headers=self.geo_verification_headers()) + if not resp: + continue + error = self._parse_json(resp, video_id, fatal=False) + if error: + if error.get('exception') == 'GeoLocationBlocked': 
+ self.raise_geo_restricted(countries=['CA']) + raise ExtractorError(error['description']) + smil = self._parse_xml(resp, video_id, fatal=False) + if smil is None: + continue + namespace = self._parse_smil_namespace(smil) + formats.extend(self._parse_smil_formats( + smil, smil_url, video_id, namespace)) + if not formats and video.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) + + subtitles = {} + for track in video.get('tracks', []): + track_url = track.get('file') + if not track_url: + continue + lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en' + subtitles.setdefault(lang, []).append({'url': track_url}) + + metadata = video.get('metadata') or {} + get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')), + 'description': video.get('description'), + 'timestamp': int_or_none(video.get('availableDate'), 1000), + 'subtitles': subtitles, + 'duration': float_or_none(metadata.get('duration')), + 'series': dict_get(video, ('show', 'pl1$show')), + 'season_number': get_number('season'), + 'episode_number': get_number('episode'), + } From 5ef62fc4ce1f255343d67b70f3cee2f2240cdfba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 22:01:34 +0100 Subject: [PATCH 0207/1705] [dailymotion] improve extraction - extract http formats included in m3u8 manifest - fix user extraction(closes #3553)(closes #21415) - add suport for User Authentication(closes #11491) - fix password protected videos extraction(closes #23176) - respect age limit option and family filter cookie value(closes #18437) - handle video url playlist query param - report alowed countries for geo-restricted videos --- youtube_dl/extractor/common.py | 13 + youtube_dl/extractor/dailymotion.py | 559 +++++++++++----------------- 
youtube_dl/extractor/vk.py | 3 +- 3 files changed, 234 insertions(+), 341 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 04d676378..eaae5e484 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1766,6 +1766,19 @@ class InfoExtractor(object): # the same GROUP-ID f['acodec'] = 'none' formats.append(f) + + # for DailyMotion + progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') + if progressive_uri: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': progressive_uri, + }) + formats.append(http_f) + last_stream_inf = {} return formats diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 745971900..327fdb04a 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,50 +1,93 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import functools -import hashlib -import itertools import json -import random import re -import string from .common import InfoExtractor -from ..compat import compat_struct_pack +from ..compat import compat_HTTPError from ..utils import ( - determine_ext, - error_to_compat_str, + age_restricted, + clean_html, ExtractorError, int_or_none, - mimetype2ext, OnDemandPagedList, - parse_iso8601, - sanitized_Request, - str_to_int, try_get, unescapeHTML, - update_url_query, - url_or_none, urlencode_postdata, ) class DailymotionBaseInfoExtractor(InfoExtractor): + _FAMILY_FILTER = None + _HEADERS = { + 'Content-Type': 'application/json', + 'Origin': 'https://www.dailymotion.com', + } + _NETRC_MACHINE = 'dailymotion' + + def _get_dailymotion_cookies(self): + return self._get_cookies('https://www.dailymotion.com/') + @staticmethod - def _build_request(url): - """Build a request with the family filter disabled""" - request = sanitized_Request(url) - request.add_header('Cookie', 
'family_filter=off; ff=off') - return request + def _get_cookie_value(cookies, name): + cookie = cookies.get('name') + if cookie: + return cookie.value - def _download_webpage_handle_no_ff(self, url, *args, **kwargs): - request = self._build_request(url) - return self._download_webpage_handle(request, *args, **kwargs) + def _set_dailymotion_cookie(self, name, value): + self._set_cookie('www.dailymotion.com', name, value) - def _download_webpage_no_ff(self, url, *args, **kwargs): - request = self._build_request(url) - return self._download_webpage(request, *args, **kwargs) + def _real_initialize(self): + cookies = self._get_dailymotion_cookies() + ff = self._get_cookie_value(cookies, 'ff') + self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit')) + self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') + + def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): + if not self._HEADERS.get('Authorization'): + cookies = self._get_dailymotion_cookies() + token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') + if not token: + data = { + 'client_id': 'f1a362d288c1b98099c7', + 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', + } + username, password = self._get_login_info() + if username: + data.update({ + 'grant_type': 'password', + 'password': password, + 'username': username, + }) + else: + data['grant_type'] = 'client_credentials' + try: + token = self._download_json( + 'https://graphql.api.dailymotion.com/oauth/token', + None, 'Downloading Access Token', + data=urlencode_postdata(data))['access_token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), xid)['error_description'], expected=True) + raise + self._set_dailymotion_cookie('access_token' if username else 'client_token', token) + 
self._HEADERS['Authorization'] = 'Bearer ' + token + + resp = self._download_json( + 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ + 'query': '''{ + %s(xid: "%s"%s) { + %s + } +}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), + }).encode(), headers=self._HEADERS) + obj = resp['data'][object_type] + if not obj: + raise ExtractorError(resp['errors'][0]['message'], expected=True) + return obj class DailymotionIE(DailymotionBaseInfoExtractor): @@ -54,18 +97,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P[^/?_]+) + /(?P[^/?_]+)(?:.+?\bplaylist=(?Px[0-9a-z]+))? ''' IE_NAME = 'dailymotion' - - _FORMATS = [ - ('stream_h264_ld_url', 'ld'), - ('stream_h264_url', 'standard'), - ('stream_h264_hq_url', 'hq'), - ('stream_h264_hd_url', 'hd'), - ('stream_h264_hd1080_url', 'hd180'), - ] - _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -74,7 +108,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'ext': 'mp4', 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. 
Miller', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', 'duration': 187, 'timestamp': 1493651285, 'upload_date': '20170501', @@ -146,7 +179,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2', 'only_matching': True, + }, { + 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', + 'only_matching': True, }] + _GEO_BYPASS = False + _COMMON_MEDIA_FIELDS = '''description + geoblockedCountries { + allowed + } + xid''' @staticmethod def _extract_urls(webpage): @@ -162,264 +204,140 @@ class DailymotionIE(DailymotionBaseInfoExtractor): return urls def _real_extract(self, url): - video_id = self._match_id(url) + video_id, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage_no_ff( - 'https://www.dailymotion.com/video/%s' % video_id, video_id) + if playlist_id: + if not self._downloader.params.get('noplaylist'): + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + return self.url_result( + 'http://www.dailymotion.com/playlist/' + playlist_id, + 'DailymotionPlaylist', playlist_id) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - age_limit = self._rta_search(webpage) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') - - view_count_str = self._search_regex( - (r']+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', - r'video_views_count[^>]+>\s+([\s\d\,.]+)'), - webpage, 'view count', default=None) - if view_count_str: - view_count_str = re.sub(r'\s', '', view_count_str) - view_count = str_to_int(view_count_str) - comment_count = int_or_none(self._search_regex( - r']+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', - webpage, 'comment count', default=None)) - - player_v5 = self._search_regex( - [r'buildPlayer\(({.+?})\);\n', # See 
https://github.com/ytdl-org/youtube-dl/issues/7826 - r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', - r'buildPlayer\(({.+?})\);', - r'var\s+config\s*=\s*({.+?});', - # New layout regex (see https://github.com/ytdl-org/youtube-dl/issues/13580) - r'__PLAYER_CONFIG__\s*=\s*({.+?});'], - webpage, 'player v5', default=None) - if player_v5: - player = self._parse_json(player_v5, video_id, fatal=False) or {} - metadata = try_get(player, lambda x: x['metadata'], dict) - if not metadata: - metadata_url = url_or_none(try_get( - player, lambda x: x['context']['metadata_template_url1'])) - if metadata_url: - metadata_url = metadata_url.replace(':videoId', video_id) - else: - metadata_url = update_url_query( - 'https://www.dailymotion.com/player/metadata/video/%s' - % video_id, { - 'embedder': url, - 'integration': 'inline', - 'GK_PV5_NEON': '1', - }) - metadata = self._download_json( - metadata_url, video_id, 'Downloading metadata JSON') - - if try_get(metadata, lambda x: x['error']['type']) == 'password_protected': - password = self._downloader.params.get('videopassword') - if password: - r = int(metadata['id'][1:], 36) - us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=') - t = ''.join(random.choice(string.ascii_letters) for i in range(10)) - n = us64e(compat_struct_pack('I', r)) - i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest()) - metadata = self._download_json( - 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id) - - self._check_error(metadata) - - formats = [] - for quality, media_list in metadata['qualities'].items(): - for media in media_list: - media_url = media.get('url') - if not media_url: - continue - type_ = media.get('type') - if type_ == 'application/vnd.lumberjack.manifest': - continue - ext = mimetype2ext(type_) or determine_ext(media_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - media_url, video_id, 'mp4', preference=-1, - m3u8_id='hls', fatal=False) - for f in 
m3u8_formats: - f['url'] = f['url'].split('#')[0] - formats.append(f) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) - else: - f = { - 'url': media_url, - 'format_id': 'http-%s' % quality, - 'ext': ext, - } - m = re.search(r'H264-(?P\d+)x(?P\d+)', media_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(f) - self._sort_formats(formats) - - title = metadata['title'] - duration = int_or_none(metadata.get('duration')) - timestamp = int_or_none(metadata.get('created_time')) - thumbnail = metadata.get('poster_url') - uploader = metadata.get('owner', {}).get('screenname') - uploader_id = metadata.get('owner', {}).get('id') - - subtitles = {} - subtitles_data = metadata.get('subtitles', {}).get('data', {}) - if subtitles_data and isinstance(subtitles_data, dict): - for subtitle_lang, subtitle in subtitles_data.items(): - subtitles[subtitle_lang] = [{ - 'ext': determine_ext(subtitle_url), - 'url': subtitle_url, - } for subtitle_url in subtitle.get('urls', [])] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - 'view_count': view_count, - 'comment_count': comment_count, - 'formats': formats, - 'subtitles': subtitles, - } - - # vevo embed - vevo_id = self._search_regex( - r'[\w]*)', - webpage, 'vevo embed', default=None) - if vevo_id: - return self.url_result('vevo:%s' % vevo_id, 'Vevo') - - # fallback old player - embed_page = self._download_webpage_no_ff( - 'https://www.dailymotion.com/embed/video/%s' % video_id, - video_id, 'Downloading embed page') - - timestamp = parse_iso8601(self._html_search_meta( - 'video:release_date', webpage, 'upload date')) - - info = self._parse_json( - self._search_regex( - r'var info = ({.*?}),$', embed_page, - 
'video info', flags=re.MULTILINE), - video_id) - - self._check_error(info) - - formats = [] - for (key, format_id) in self._FORMATS: - video_url = info.get(key) - if video_url is not None: - m_size = re.search(r'H264-(\d+)x(\d+)', video_url) - if m_size is not None: - width, height = map(int_or_none, (m_size.group(1), m_size.group(2))) - else: - width, height = None, None - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'width': width, - 'height': height, - }) - self._sort_formats(formats) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, webpage) - - title = self._og_search_title(webpage, default=None) - if title is None: - title = self._html_search_regex( - r'(?s)]*>(.*?)', webpage, - 'title') - - return { - 'id': video_id, - 'formats': formats, - 'uploader': info['owner.screenname'], - 'timestamp': timestamp, - 'title': title, - 'description': description, - 'subtitles': video_subtitles, - 'thumbnail': info['thumbnail_url'], - 'age_limit': age_limit, - 'view_count': view_count, - 'duration': info['duration'] + password = self._downloader.params.get('videopassword') + media = self._call_api( + 'media', video_id, '''... on Video { + %s + stats { + likes { + total } + views { + total + } + } + } + ... 
on Live { + %s + audienceCount + isOnAir + }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', + 'password: "%s"' % self._downloader.params.get('videopassword') if password else None) + xid = media['xid'] - def _check_error(self, info): - error = info.get('error') + metadata = self._download_json( + 'https://www.dailymotion.com/player/metadata/video/' + xid, + xid, 'Downloading metadata JSON', + query={'app': 'com.dailymotion.neon'}) + + error = metadata.get('error') if error: - title = error.get('title') or error['message'] + title = error.get('title') or error['raw_message'] # See https://developer.dailymotion.com/api#access-error if error.get('code') == 'DM007': - self.raise_geo_restricted(msg=title) + allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list) + self.raise_geo_restricted(msg=title, countries=allowed_countries) raise ExtractorError( '%s said: %s' % (self.IE_NAME, title), expected=True) - def _get_subtitles(self, video_id, webpage): - try: - sub_list = self._download_webpage( - 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - info = json.loads(sub_list) - if (info['total'] > 0): - sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) - return sub_lang_list - self._downloader.report_warning('video doesn\'t have subtitles') - return {} + title = metadata['title'] + is_live = media.get('isOnAir') + formats = [] + for quality, media_list in metadata['qualities'].items(): + for m in media_list: + media_url = m.get('url') + media_type = m.get('type') + if not media_url or media_type == 'application/vnd.lumberjack.manifest': + continue + if media_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + media_url, 
video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + f = { + 'url': media_url, + 'format_id': 'http-' + quality, + } + m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url) + if m: + width, height, fps = map(int_or_none, m.groups()) + f.update({ + 'fps': fps, + 'height': height, + 'width': width, + }) + formats.append(f) + for f in formats: + f['url'] = f['url'].split('#')[0] + if not f.get('fps') and f['format_id'].endswith('@60'): + f['fps'] = 60 + self._sort_formats(formats) + + subtitles = {} + subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} + for subtitle_lang, subtitle in subtitles_data.items(): + subtitles[subtitle_lang] = [{ + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] + + thumbnails = [] + for height, poster_url in metadata.get('posters', {}).items(): + thumbnails.append({ + 'height': int_or_none(height), + 'id': height, + 'url': poster_url, + }) + + owner = metadata.get('owner') or {} + stats = media.get('stats') or {} + get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total'])) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(media.get('description')), + 'thumbnails': thumbnails, + 'duration': int_or_none(metadata.get('duration')) or None, + 'timestamp': int_or_none(metadata.get('created_time')), + 'uploader': owner.get('screenname'), + 'uploader_id': owner.get('id') or metadata.get('screenname'), + 'age_limit': 18 if metadata.get('explicit') else 0, + 'tags': metadata.get('tags'), + 'view_count': get_count('view') or int_or_none(media.get('audienceCount')), + 'like_count': get_count('like'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } -class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): - IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?Px[0-9a-z]+)' - _TESTS = 
[{ - 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', - 'info_dict': { - 'title': 'SPORT', - 'id': 'xv4bw', - }, - 'playlist_mincount': 20, - }] +class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor): _PAGE_SIZE = 100 - def _fetch_page(self, playlist_id, authorizaion, page): + def _fetch_page(self, playlist_id, page): page += 1 - videos = self._download_json( - 'https://graphql.api.dailymotion.com', - playlist_id, 'Downloading page %d' % page, - data=json.dumps({ - 'query': '''{ - collection(xid: "%s") { - videos(first: %d, page: %d) { - pageInfo { - hasNextPage - nextPage - } + videos = self._call_api( + self._OBJECT_TYPE, playlist_id, + '''videos(allowExplicit: %s, first: %d, page: %d) { edges { node { xid url } } - } - } -}''' % (playlist_id, self._PAGE_SIZE, page) - }).encode(), headers={ - 'Authorization': authorizaion, - 'Origin': 'https://www.dailymotion.com', - })['data']['collection']['videos'] + }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page), + 'Downloading page %d' % page)['videos'] for edge in videos['edges']: node = edge['node'] yield self.url_result( @@ -427,86 +345,49 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - api = self._parse_json(self._search_regex( - r'__PLAYER_CONFIG__\s*=\s*({.+?});', - webpage, 'player config'), playlist_id)['context']['api'] - auth = self._download_json( - api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), - playlist_id, data=urlencode_postdata({ - 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), - 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), - 'grant_type': 'client_credentials', - })) - authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) entries = OnDemandPagedList(functools.partial( - self._fetch_page, playlist_id, 
authorizaion), self._PAGE_SIZE) + self._fetch_page, playlist_id), self._PAGE_SIZE) return self.playlist_result( - entries, playlist_id, - self._og_search_title(webpage)) + entries, playlist_id) -class DailymotionUserIE(DailymotionBaseInfoExtractor): +class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:playlist' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?Px[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', + 'info_dict': { + 'id': 'xv4bw', + }, + 'playlist_mincount': 20, + }] + _OBJECT_TYPE = 'collection' + + +class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P[^/]+)' - _MORE_PAGES_INDICATOR = r'(?s)
    .*?[^/]+)' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { 'id': 'nqtv', - 'title': 'Rémi Gaillard', }, - 'playlist_mincount': 100, + 'playlist_mincount': 152, }, { 'url': 'http://www.dailymotion.com/user/UnderProject', 'info_dict': { 'id': 'UnderProject', - 'title': 'UnderProject', }, - 'playlist_mincount': 1800, - 'expected_warnings': [ - 'Stopped at duplicated page', - ], + 'playlist_mincount': 1000, 'skip': 'Takes too long time', + }, { + 'url': 'https://www.dailymotion.com/user/nqtv', + 'info_dict': { + 'id': 'nqtv', + }, + 'playlist_mincount': 148, + 'params': { + 'age_limit': 0, + }, }] - - def _extract_entries(self, id): - video_ids = set() - processed_urls = set() - for pagenum in itertools.count(1): - page_url = self._PAGE_TEMPLATE % (id, pagenum) - webpage, urlh = self._download_webpage_handle_no_ff( - page_url, id, 'Downloading page %s' % pagenum) - if urlh.geturl() in processed_urls: - self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( - page_url, urlh.geturl()), id) - break - - processed_urls.add(urlh.geturl()) - - for video_id in re.findall(r'data-xid="(.+?)"', webpage): - if video_id not in video_ids: - yield self.url_result( - 'http://www.dailymotion.com/video/%s' % video_id, - DailymotionIE.ie_key(), video_id) - video_ids.add(video_id) - - if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: - break - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user = mobj.group('user') - webpage = self._download_webpage( - 'https://www.dailymotion.com/user/%s' % user, user) - full_user = unescapeHTML(self._html_search_regex( - r'' % re.escape(user), - webpage, 'user')) - - return { - '_type': 'playlist', - 'id': user, - 'title': full_user, - 'entries': self._extract_entries(user), - } + _OBJECT_TYPE = 'channel' diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 195875938..a5e4a3e67 100644 --- a/youtube_dl/extractor/vk.py +++ 
b/youtube_dl/extractor/vk.py @@ -216,8 +216,7 @@ class VKIE(VKBaseIE): 'id': 'k3lz2cmXyRuJQSjGHUv', 'ext': 'mp4', 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', - # TODO: fix test by fixing dailymotion description extraction - 'description': 'md5:c651358f03c56f1150b555c26d90a0fd', + 'description': 'md5:424b8e88cc873217f520e582ba28bb36', 'uploader': 'AniLibria.Tv', 'upload_date': '20160914', 'uploader_id': 'x1p5vl5', From 6471d0d3b8086b282622c84a9eea968d4edfcf9b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 26 Nov 2019 23:57:37 +0100 Subject: [PATCH 0208/1705] [openload] remove OpenLoad related extractors(closes #11999)(closes #15406) --- youtube_dl/extractor/extractors.py | 5 - youtube_dl/extractor/generic.py | 16 -- youtube_dl/extractor/openload.py | 263 ----------------------------- youtube_dl/extractor/streamango.py | 128 -------------- 4 files changed, 412 deletions(-) delete mode 100644 youtube_dl/extractor/streamango.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cf4bb8f20..0e349b778 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -796,10 +796,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openload import ( - OpenloadIE, - VerystreamIE, -) from .ora import OraTVIE from .orf import ( ORFTVthekIE, @@ -1060,7 +1056,6 @@ from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE -from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3d919f656..743ef47db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -88,10 +88,6 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE 
-from .openload import ( - OpenloadIE, - VerystreamIE, -) from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -3048,18 +3044,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - # Look for Openload embeds - openload_urls = OpenloadIE._extract_urls(webpage) - if openload_urls: - return self.playlist_from_matches( - openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) - - # Look for Verystream embeds - verystream_urls = VerystreamIE._extract_urls(webpage) - if verystream_urls: - return self.playlist_from_matches( - verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key()) - # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 66e38cdb4..0c20d0177 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -3,21 +3,17 @@ from __future__ import unicode_literals import json import os -import re import subprocess import tempfile -from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_kwargs, ) from ..utils import ( check_executable, - determine_ext, encodeArgument, ExtractorError, - get_element_by_id, get_exe_version, is_outdated_version, std_headers, @@ -240,262 +236,3 @@ class PhantomJSwrapper(object): self._load_cookies() return (html, encodeArgument(out)) - - -class OpenloadIE(InfoExtractor): - _DOMAINS = r''' - (?: - openload\.(?:co|io|link|pw)| - oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|monster|press|pw|life|live|space|services|website|vip)| - oladblock\.(?:services|xyz|me)|openloed\.co - ) - ''' - _VALID_URL = r'''(?x) - https?:// - (?P - (?:www\.)? 
- %s - )/ - (?:f|embed)/ - (?P[a-zA-Z0-9-_]+) - ''' % _DOMAINS - _EMBED_WORD = 'embed' - _STREAM_WORD = 'f' - _REDIR_WORD = 'stream' - _URL_IDS = ('streamurl', 'streamuri', 'streamurj') - _TESTS = [{ - 'url': 'https://openload.co/f/kUEfGclsU9o', - 'md5': 'bf1c059b004ebc7a256f89408e65c36e', - 'info_dict': { - 'id': 'kUEfGclsU9o', - 'ext': 'mp4', - 'title': 'skyrim_no-audio_1080.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://openload.co/embed/rjC09fkPLYs', - 'info_dict': { - 'id': 'rjC09fkPLYs', - 'ext': 'mp4', - 'title': 'movie.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': { - 'en': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, # test subtitles only - }, - }, { - 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', - 'only_matching': True, - }, { - 'url': 'https://openload.io/f/ZAn6oz-VZGE/', - 'only_matching': True, - }, { - 'url': 'https://openload.co/f/_-ztPaZtMhM/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout - # for title and ext - 'url': 'https://openload.co/embed/Sxz5sADo82g/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available - # via https://openload.co/f/e-Ixz9ZR5L0/ - 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', - 'only_matching': True, - }, { - 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', - 'only_matching': True, - }, { - 'url': 'http://www.openload.link/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.stream/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', - 'only_matching': True, - }, { - 'url': 'https://oload.win/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.download/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.cloud/f/4ZDnBXRWiB8', - 'only_matching': True, - }, { - # Its title has not got its extension but url has it - 'url': 
'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', - 'only_matching': True, - }, { - 'url': 'https://oload.cc/embed/5NEAbI2BDSk', - 'only_matching': True, - }, { - 'url': 'https://oload.icu/f/-_i4y_F_Hs8', - 'only_matching': True, - }, { - 'url': 'https://oload.fun/f/gb6G1H4sHXY', - 'only_matching': True, - }, { - 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', - 'only_matching': True, - }, { - 'url': 'https://oload.info/f/5NEAbI2BDSk', - 'only_matching': True, - }, { - 'url': 'https://openload.pw/f/WyKgK8s94N0', - 'only_matching': True, - }, { - 'url': 'https://oload.pw/f/WyKgK8s94N0', - 'only_matching': True, - }, { - 'url': 'https://oload.live/f/-Z58UZ-GR4M', - 'only_matching': True, - }, { - 'url': 'https://oload.space/f/IY4eZSst3u8/', - 'only_matching': True, - }, { - 'url': 'https://oload.services/embed/bs1NWj1dCag/', - 'only_matching': True, - }, { - 'url': 'https://oload.online/f/W8o2UfN1vNY/', - 'only_matching': True, - }, { - 'url': 'https://oload.monster/f/W8o2UfN1vNY/', - 'only_matching': True, - }, { - 'url': 'https://oload.press/embed/drTBl1aOTvk/', - 'only_matching': True, - }, { - 'url': 'https://oload.website/embed/drTBl1aOTvk/', - 'only_matching': True, - }, { - 'url': 'https://oload.life/embed/oOzZjNPw9Dc/', - 'only_matching': True, - }, { - 'url': 'https://oload.biz/f/bEk3Gp8ARr4/', - 'only_matching': True, - }, { - 'url': 'https://oload.best/embed/kkz9JgVZeWc/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.services/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.xyz/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oladblock.me/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://openloed.co/f/b8NWEgkqNLI/', - 'only_matching': True, - }, { - 'url': 'https://oload.vip/f/kUEfGclsU9o', - 'only_matching': True, - }] - - @classmethod - def _extract_urls(cls, webpage): - return re.findall( - r'(?x)]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' - % 
(cls._DOMAINS, cls._EMBED_WORD), webpage) - - def _extract_decrypted_page(self, page_url, webpage, video_id): - phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id) - return webpage - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - - url_pattern = 'https://%s/%%s/%s/' % (host, video_id) - - for path in (self._EMBED_WORD, self._STREAM_WORD): - page_url = url_pattern % path - last = path == self._STREAM_WORD - webpage = self._download_webpage( - page_url, video_id, 'Downloading %s webpage' % path, - fatal=last) - if not webpage: - continue - if 'File not found' in webpage or 'deleted by the owner' in webpage: - if not last: - continue - raise ExtractorError('File not found', expected=True, video_id=video_id) - break - - webpage = self._extract_decrypted_page(page_url, webpage, video_id) - for element_id in self._URL_IDS: - decoded_id = get_element_by_id(element_id, webpage) - if decoded_id: - break - if not decoded_id: - decoded_id = self._search_regex( - (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', - r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, - 'stream URL') - video_url = 'https://%s/%s/%s?mime=true' % (host, self._REDIR_WORD, decoded_id) - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r']+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - entries = self._parse_html5_media_entries(page_url, webpage, video_id) - entry = entries[0] if entries else {} - subtitles = entry.get('subtitles') - - return { - 'id': video_id, - 'title': title, - 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), - 
'url': video_url, - 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), - 'subtitles': subtitles, - } - - -class VerystreamIE(OpenloadIE): - IE_NAME = 'verystream' - - _DOMAINS = r'(?:verystream\.com|woof\.tube)' - _VALID_URL = r'''(?x) - https?:// - (?P - (?:www\.)? - %s - )/ - (?:stream|e)/ - (?P[a-zA-Z0-9-_]+) - ''' % _DOMAINS - _EMBED_WORD = 'e' - _STREAM_WORD = 'stream' - _REDIR_WORD = 'gettoken' - _URL_IDS = ('videolink', ) - _TESTS = [{ - 'url': 'https://verystream.com/stream/c1GWQ9ngBBx/', - 'md5': 'd3e8c5628ccb9970b65fd65269886795', - 'info_dict': { - 'id': 'c1GWQ9ngBBx', - 'ext': 'mp4', - 'title': 'Big Buck Bunny.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://verystream.com/e/c1GWQ9ngBBx/', - 'only_matching': True, - }] - - def _extract_decrypted_page(self, page_url, webpage, video_id): - return webpage # for Verystream, the webpage is already decrypted diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py deleted file mode 100644 index f1e17dd88..000000000 --- a/youtube_dl/extractor/streamango.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - determine_ext, - ExtractorError, - int_or_none, - js_to_json, -) - - -class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', - 'md5': 'e992787515a182f55e38fc97588d802a', - 'info_dict': { - 'id': 'clapasobsptpkdfe', - 'ext': 'mp4', - 'title': '20170315_150006.mp4', - } - }, { - # no og:title - 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', - 'info_dict': { - 'id': 'foqebrpftarclpob', - 'ext': 'mp4', - 'title': 'foqebrpftarclpob', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 
'gone', - }, { - 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', - 'only_matching': True, - }, { - 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', - 'only_matching': True, - }, { - 'url': 'https://streamcherry.com/f/clapasobsptpkdfe/', - 'only_matching': True, - }] - - def _real_extract(self, url): - def decrypt_src(encoded, val): - ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' - encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) - decoded = '' - sm = [None] * 4 - i = 0 - str_len = len(encoded) - while i < str_len: - for j in range(4): - sm[j % 4] = ALPHABET.index(encoded[i]) - i += 1 - char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val - decoded += compat_chr(char_code) - if sm[2] != 0x40: - char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) - decoded += compat_chr(char_code) - if sm[3] != 0x40: - char_code = ((sm[2] & 0x3) << 0x6) | sm[3] - decoded += compat_chr(char_code) - return decoded - - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage, default=video_id) - - formats = [] - for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): - mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) - if mobj is None: - continue - - format_ = format_.replace(mobj.group(0), '') - - video = self._parse_json( - format_, video_id, transform_source=js_to_json, - fatal=False) or {} - - mobj = re.search( - r'([\'"])(?P(?:(?!\1).)+)\1\s*,\s*(?P\d+)', - mobj.group(1)) - if mobj is None: - continue - - src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) - if not src: - continue - - ext = determine_ext(src, default_ext=None) - if video.get('type') == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': ext or 'mp4', - 'width': 
int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'tbr': int_or_none(video.get('bitrate')), - }) - - if not formats: - error = self._search_regex( - r']+\bclass=["\']lead[^>]+>(.+?)

    ', webpage, - 'error', default=None) - if not error and '>Sorry' in webpage: - error = 'Video %s is not available' % video_id - if error: - raise ExtractorError(error, expected=True) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'url': url, - 'title': title, - 'formats': formats, - } From 681ac7c92abbbd55be9796de86c2cc0d1d70a4c9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 27 Nov 2019 13:57:30 +0100 Subject: [PATCH 0209/1705] [vimeo] improve extraction - fix review extraction - fix ondemand extraction - make password protected player case as an expected error(closes #22896) - simplify channel based extractors code --- youtube_dl/extractor/vimeo.py | 177 +++++++++++++++++----------------- 1 file changed, 87 insertions(+), 90 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9abd59d98..baa46d5f3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -15,18 +15,20 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, determine_ext, + dict_get, ExtractorError, js_to_json, int_or_none, merge_dicts, - NO_DEFAULT, OnDemandPagedList, parse_filesize, RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, + str_or_none, try_get, unified_timestamp, unsmuggle_url, @@ -210,7 +212,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_uploader_url = owner.get('url') return { - 'id': video_id, + 'id': str_or_none(video_data.get('id')) or video_id, 'title': self._live_title(video_title) if is_live else video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, @@ -258,11 +260,11 @@ class VimeoIE(VimeoBaseInfoExtractor): (?: (?: www| - (?Pplayer) + player ) \. )? - vimeo(?Ppro)?\.com/ + vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? 
(?: @@ -284,7 +286,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '56015672', 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479', + 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', @@ -293,6 +295,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'license': 'by-sa', }, + 'params': { + 'format': 'best[protocol=https]', + }, }, { 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', @@ -305,8 +310,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + }, + 'params': { + 'format': 'best[protocol=https]', }, }, { @@ -323,6 +333,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 3610, 'description': None, }, + 'params': { + 'format': 'best[protocol=https]', + }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -341,6 +355,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -441,10 +456,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': '10Ft Films', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms', 'uploader_id': 'tenfootfilms', + 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384', + 'upload_date': '20130830', + 'timestamp': 1377853339, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 
'http://player.vimeo.com/video/68375962', @@ -459,6 +478,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, }, 'params': { + 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, }, @@ -523,7 +543,7 @@ class VimeoIE(VimeoBaseInfoExtractor): def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option') + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -552,28 +572,26 @@ class VimeoIE(VimeoBaseInfoExtractor): r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) orig_url = url - if mobj.group('pro'): + is_pro = 'vimeopro.com/' in url + is_player = '://player.vimeo.com/video/' in url + if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) - elif mobj.group('player'): + if not url: + url = 'https://vimeo.com/' + video_id + elif is_player: url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id - # Retrieve video webpage to extract further information - request = sanitized_Request(url, headers=headers) try: - webpage, urlh = self._download_webpage_handle(request, video_id) + # Retrieve video webpage to extract further information + webpage, urlh = self._download_webpage_handle( + url, video_id, headers=headers) redirect_url = compat_str(urlh.geturl()) - # Some URLs redirect to ondemand can't be extracted with - # this extractor right away thus should be passed 
through - # ondemand extractor (e.g. https://vimeo.com/73445910) - if VimeoOndemandIE.suitable(redirect_url): - return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -600,6 +618,7 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None + video_description = None # Extract the config JSON try: @@ -611,17 +630,17 @@ class VimeoIE(VimeoBaseInfoExtractor): # Sometimes new react-based page is served instead of old one that require # different config URL extraction approach (see # https://github.com/ytdl-org/youtube-dl/pull/7209) - vimeo_clip_page_config = self._search_regex( - r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, - 'vimeo clip page config') - page_config = self._parse_json(vimeo_clip_page_config, video_id) + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config'), video_id) config_url = page_config['player']['config_url'] cc_license = page_config.get('cc_license') timestamp = try_get( page_config, lambda x: x['clip']['uploaded_on'], compat_str) - config_json = self._download_webpage(config_url, video_id) - config = json.loads(config_json) + video_description = clean_html(dict_get( + page_config, ('description', 'description_html_escaped'))) + config = self._download_json(config_url, video_id) except RegexNotFoundError: # For pro videos or player.vimeo.com urls # We try to find out to which variable is assigned the config dic @@ -675,14 +694,14 @@ class VimeoIE(VimeoBaseInfoExtractor): {'force_feature_id': True}), 'Vimeo') # Extract video description - - video_description = self._html_search_regex( - r'(?s)]*>(.*?)
    ', - webpage, 'description', default=None) + if not video_description: + video_description = self._html_search_regex( + r'(?s)]*>(.*?)', + webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( 'description', webpage, default=None) - if not video_description and mobj.group('pro'): + if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, note='Downloading webpage for description', @@ -690,7 +709,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not mobj.group('player'): + if not video_description and not is_player: self._downloader.report_warning('Cannot find video description') # Extract upload date @@ -747,9 +766,9 @@ class VimeoIE(VimeoBaseInfoExtractor): return info_dict -class VimeoOndemandIE(VimeoBaseInfoExtractor): +class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -761,24 +780,32 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader': 'גם סרטים', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', + 'description': 'md5:4c027c965e439de4baab621e48b60791', + 'upload_date': '20140906', + 'timestamp': 1410032453, }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', 'info_dict': { - 'id': '126682985', + 'id': '126584684', 'ext': 'mp4', 'title': 'Rävlock, rätt läte på rätt plats', 'uploader': 'Lindroth & Norin', - 'uploader_url': 
r're:https?://(?:www\.)?vimeo\.com/user14430847', - 'uploader_id': 'user14430847', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin', + 'uploader_id': 'lindrothnorin', + 'description': 'md5:c3c46a90529612c8279fb6af803fc0df', + 'upload_date': '20150502', + 'timestamp': 1430586422, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -790,16 +817,6 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - return self.url_result( - # Some videos require Referer to be passed along with og:video:url - # similarly to generic vimeo embeds (e.g. - # https://vimeo.com/ondemand/36938/126682985). - VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), - VimeoIE.ie_key()) - class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' @@ -815,6 +832,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): }, 'playlist_mincount': 25, }] + _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s' def _page_url(self, base_url, pagenum): return '%s/videos/page:%d/' % (base_url, pagenum) @@ -886,14 +904,13 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self.playlist_result(title_and_entries, list_id, list_title) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id) + channel_id = self._match_id(url) + return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = 
r']+?class="user">([^<>]+?)' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -903,11 +920,7 @@ class VimeoUserIE(VimeoChannelIE): }, 'playlist_mincount': 66, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/%s' class VimeoAlbumIE(VimeoChannelIE): @@ -969,25 +982,18 @@ class VimeoAlbumIE(VimeoChannelIE): r'\s*(.+?)(?:\s+on Vimeo)?', webpage, 'title', fatal=False)) -class VimeoGroupsIE(VimeoAlbumIE): +class VimeoGroupsIE(VimeoChannelIE): IE_NAME = 'vimeo:group' - _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' + _VALID_URL = r'https://vimeo\.com/groups/(?P[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ - 'url': 'https://vimeo.com/groups/rolexawards', + 'url': 'https://vimeo.com/groups/kattykay', 'info_dict': { - 'id': 'rolexawards', - 'title': 'Rolex Awards for Enterprise', + 'id': 'kattykay', + 'title': 'Katty Kay', }, - 'playlist_mincount': 73, + 'playlist_mincount': 27, }] - - def _extract_list_title(self, webpage): - return self._og_search_title(webpage, fatal=False) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name) + _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s' class VimeoReviewIE(VimeoBaseInfoExtractor): @@ -1003,7 +1009,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'title': "DICK HARDWICK 'Comedian'", 'uploader': 'Richard Hardwick', 'uploader_id': 'user21297594', - } + 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", + }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1016,7 +1024,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 
'duration': 2773, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', - } + }, + 'skip': 'video gone', }, { 'note': 'Password protected', 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', @@ -1036,32 +1045,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() - def _get_config_url(self, webpage_url, video_id, video_password_verified=False): - webpage = self._download_webpage(webpage_url, video_id) - config_url = self._html_search_regex( - r'data-config-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'config URL', default=None, group='url') - if not config_url: - data = self._parse_json(self._search_regex( - r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', - default=NO_DEFAULT if video_password_verified else '{}'), video_id) - config = data.get('vimeo_esi', {}).get('config', {}) - config_url = config.get('configUrl') or try_get(config, lambda x: x['clipData']['configUrl']) - if config_url is None: - self._verify_video_password(webpage_url, video_id, webpage) - config_url = self._get_config_url( - webpage_url, video_id, video_password_verified=True) - return config_url - def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - config_url = self._get_config_url(url, video_id) + clip_data = self._download_json( + page_url.replace('/review/', '/review/data/'), + video_id)['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) - source_format = self._extract_original_format(page_url, video_id) + source_format = self._extract_original_format( + page_url + '/action', video_id) if source_format: info_dict['formats'].append(source_format) self._vimeo_sort_formats(info_dict['formats']) + info_dict['description'] = clean_html(clip_data.get('description')) return info_dict From e3f00f139fc227217325c8e84e0b340e12ee9bb8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Nov 2019 23:09:48 +0700 Subject: [PATCH 0210/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/ChangeLog b/ChangeLog index daaff3eef..d724d75ce 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,36 @@ +version + +Core ++ [utils] Add generic caesar cipher and rot47 +* [utils] Handle rd-suffixed day parts in unified_strdate (#23199) + +Extractors +* [vimeo] Improve extraction + * Fix review extraction + * Fix ondemand extraction + * Make password protected player case as an expected error (#22896) + * Simplify channel based extractors code +- [openload] Remove extractor (#11999) +- [verystream] Remove extractor +- [streamango] Remove extractor (#15406) +* [dailymotion] Improve extraction + * Extract http formats included in m3u8 manifest + * Fix user extraction (#3553, #21415) + + Add suport for User Authentication (#11491) + * Fix password protected videos extraction (#23176) + * Respect age limit option and family filter cookie value (#18437) + * Handle video url playlist query param + * Report allowed countries for geo-restricted videos +* [corus] Improve extraction + + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com + and disneylachaine.ca (#20861) + + Add support for self hosted videos (#22075) + * Detect DRM protection (#14910, #9164) +* [vivo] Fix extraction (#22328, #22279) ++ [bitchute] Extract upload date (#22990, #23193) +* [soundcloud] Update client id (#23214) + + version 2019.11.22 Core From b568561eba6f4aceb87419e21aba11567c5de7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 28 Nov 2019 23:25:25 +0700 Subject: [PATCH 0211/1705] release 2019.11.28 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- 
.github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 --- youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 17 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d3e11cdcf..3a94bd621 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.22 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 51bf4db3b..72bee12aa 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none 
of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 19025ff25..ddf67e951 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a381b6979..7122e2714 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.11.22 + [debug] youtube-dl version 2019.11.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md 
b/.github/ISSUE_TEMPLATE/5_feature_request.md index 9c945d5ec..a93882b39 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.11.22** +- [ ] I've verified that I'm running youtube-dl version **2019.11.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index d724d75ce..d4f809fc6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.11.28 Core + [utils] Add generic caesar cipher and rot47 diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3dcb026c5..2744dfca8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -618,7 +618,6 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** - - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories @@ -825,7 +824,6 @@ - **Steam** - **Stitcher** - **Streamable** - - **Streamango** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -976,7 +974,6 @@ - **Vbox7** - **VeeHD** - **Veoh** - - **verystream** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 361809681..1227abc0a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.11.22' +__version__ = '2019.11.28' From 348c6bf1c1a00eec323d6e21ff7b9b12699afe04 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:05:06 +0100 Subject: [PATCH 0212/1705] [utils] handle int values passed to str_to_int --- test/test_utils.py | 1 + youtube_dl/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e83c8ea11..fed94a906 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -499,6 +499,7 @@ class 
TestUtil(unittest.TestCase): def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) + self.assertEqual(str_to_int(523), 523) def test_url_basename(self): self.assertEqual(url_basename('http://foo.de/'), '') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b14603d8a..328f037a8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3519,8 +3519,8 @@ def str_or_none(v, default=None): def str_to_int(int_str): """ A more relaxed version of int_or_none """ - if int_str is None: - return None + if not isinstance(int_str, compat_str): + return int_str int_str = re.sub(r'[,\.\+]', '', int_str) return int(int_str) From 7f641d2c7a68b70d6c1e273af108741e5779bc28 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:06:34 +0100 Subject: [PATCH 0213/1705] [adobetv] improve extaction - use OnDemandPagedList for list extractors - reduce show extraction requests - extract original video format and subtitles - add support for adobe tv embeds --- youtube_dl/extractor/adobetv.py | 239 ++++++++++++++++++++--------- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 166 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 008c98e51..80060f037 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -1,25 +1,119 @@ from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - parse_duration, - unified_strdate, - str_to_int, - int_or_none, float_or_none, + int_or_none, ISO639Utils, - determine_ext, + OnDemandPagedList, + parse_duration, + str_or_none, + str_to_int, + unified_strdate, ) class AdobeTVBaseIE(InfoExtractor): - _API_BASE_URL = 'http://tv.adobe.com/api/v4/' + def _call_api(self, path, video_id, query, note=None): + return self._download_json( + 'http://tv.adobe.com/api/v4/' + path, + 
video_id, note, query=query)['data'] + + def _parse_subtitles(self, video_data, url_key): + subtitles = {} + for translation in video_data.get('translations', []): + vtt_path = translation.get(url_key) + if not vtt_path: + continue + lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + subtitles.setdefault(lang, []).append({ + 'ext': 'vtt', + 'url': vtt_path, + }) + return subtitles + + def _parse_video_data(self, video_data): + video_id = compat_str(video_data['id']) + title = video_data['title'] + + s3_extracted = False + formats = [] + for source in video_data.get('videos', []): + source_url = source.get('url') + if not source_url: + continue + f = { + 'format_id': source.get('quality_level'), + 'fps': int_or_none(source.get('frame_rate')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + 'width': int_or_none(source.get('width')), + 'url': source_url, + } + original_filename = source.get('original_filename') + if original_filename: + if not (f.get('height') and f.get('width')): + mobj = re.search(r'_(\d+)x(\d+)', original_filename) + if mobj: + f.update({ + 'height': int(mobj.group(2)), + 'width': int(mobj.group(1)), + }) + if original_filename.startswith('s3://') and not s3_extracted: + formats.append({ + 'format_id': 'original', + 'preference': 1, + 'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'), + }) + s3_extracted = True + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), + 'formats': formats, + 'subtitles': self._parse_subtitles(video_data, 'vtt'), + } + + +class AdobeTVEmbedIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:embed' + 
_VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P\d+)' + _TEST = { + 'url': 'https://tv.adobe.com/embed/22/4153', + 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a', + 'info_dict': { + 'id': '4153', + 'ext': 'flv', + 'title': 'Creating Graphics Optimized for BlackBerry', + 'description': 'md5:eac6e8dced38bdaae51cd94447927459', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20091109', + 'duration': 377, + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0] + return self._parse_video_data(video_data) class AdobeTVIE(AdobeTVBaseIE): + IE_NAME = 'adobetv' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?watch/(?P[^/]+)/(?P[^/]+)' _TEST = { @@ -42,45 +136,33 @@ class AdobeTVIE(AdobeTVBaseIE): if not language: language = 'en' - video_data = self._download_json( - self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), - urlname)['data'][0] - - formats = [{ - 'url': source['url'], - 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('video_data_rate')), - } for source in video_data['videos']] - self._sort_formats(formats) - - return { - 'id': compat_str(video_data['id']), - 'title': video_data['title'], - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'upload_date': unified_strdate(video_data.get('start_date')), - 'duration': parse_duration(video_data.get('duration')), - 'view_count': str_to_int(video_data.get('playcount')), - 'formats': formats, - } + video_data = self._call_api( + 'episode/get', urlname, { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + 'urlname': urlname, + })[0] + return 
self._parse_video_data(video_data) class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): - def _parse_page_data(self, page_data): - return [self.url_result(self._get_element_url(element_data)) for element_data in page_data] + _PAGE_SIZE = 25 - def _extract_playlist_entries(self, url, display_id): - page = self._download_json(url, display_id) - entries = self._parse_page_data(page['data']) - for page_num in range(2, page['paging']['pages'] + 1): - entries.extend(self._parse_page_data( - self._download_json(url + '&page=%d' % page_num, display_id)['data'])) - return entries + def _fetch_page(self, display_id, query, page): + page += 1 + query['page'] = page + for element_data in self._call_api( + self._RESOURCE, display_id, query, 'Download Page %d' % page): + yield self._process_data(element_data) + + def _extract_playlist_entries(self, display_id, query): + return OnDemandPagedList(functools.partial( + self._fetch_page, display_id, query), self._PAGE_SIZE) class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:show' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?show/(?P[^/]+)' _TEST = { @@ -92,26 +174,31 @@ class AdobeTVShowIE(AdobeTVPlaylistBaseIE): }, 'playlist_mincount': 136, } - - def _get_element_url(self, element_data): - return element_data['urls'][0] + _RESOURCE = 'episode' + _process_data = AdobeTVBaseIE._parse_video_data def _real_extract(self, url): language, show_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' - query = 'language=%s&show_urlname=%s' % (language, show_urlname) + query = { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + } - show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0] + show_data = self._call_api( + 'show/get', show_urlname, query)[0] return self.playlist_result( - self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname), - compat_str(show_data['id']), - 
show_data['show_name'], - show_data['show_description']) + self._extract_playlist_entries(show_urlname, query), + str_or_none(show_data.get('id')), + show_data.get('show_name'), + show_data.get('show_description')) class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:channel' _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?channel/(?P[^/]+)(?:/(?P[^/]+))?' _TEST = { @@ -121,24 +208,30 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): }, 'playlist_mincount': 96, } + _RESOURCE = 'show' - def _get_element_url(self, element_data): - return element_data['url'] + def _process_data(self, show_data): + return self.url_result( + show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id'))) def _real_extract(self, url): language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' - query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) + query = { + 'channel_urlname': channel_urlname, + 'language': language, + } if category_urlname: - query += '&category_urlname=%s' % category_urlname + query['category_urlname'] = category_urlname return self.playlist_result( - self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname), + self._extract_playlist_entries(channel_urlname, query), channel_urlname) -class AdobeTVVideoIE(InfoExtractor): +class AdobeTVVideoIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:video' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' _TEST = { @@ -160,38 +253,36 @@ class AdobeTVVideoIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) + title = video_data['title'] - formats = [{ - 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), - 'url': source['src'], - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'tbr': int_or_none(source.get('bitrate')), - } for source in 
video_data['sources']] + formats = [] + sources = video_data.get('sources') or [] + for source in sources: + source_src = source.get('src') + if not source_src: + continue + formats.append({ + 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), + 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'height': int_or_none(source.get('height') or None), + 'tbr': int_or_none(source.get('bitrate') or None), + 'width': int_or_none(source.get('width') or None), + 'url': source_src, + }) self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) - for source in video_data['sources']])) - - subtitles = {} - for translation in video_data.get('translations', []): - lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) - if lang_id not in subtitles: - subtitles[lang_id] = [] - subtitles[lang_id].append({ - 'url': translation['vttPath'], - 'ext': 'vtt', - }) + for source in sources])) return { 'id': video_id, 'formats': formats, - 'title': video_data['title'], + 'title': title, 'description': video_data.get('description'), - 'thumbnail': video_data['video'].get('poster'), + 'thumbnail': video_data.get('video', {}).get('poster'), 'duration': duration, - 'subtitles': subtitles, + 'subtitles': self._parse_subtitles(video_data, 'vttPath'), } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0e349b778..0f27c9678 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -21,6 +21,7 @@ from .acast import ( from .adn import ADNIE from .adobeconnect import AdobeConnectIE from .adobetv import ( + AdobeTVEmbedIE, AdobeTVIE, AdobeTVShowIE, AdobeTVChannelIE, From a15adbe461584e2e631d1be97805e81c17cfd3fe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 
17:12:55 +0100 Subject: [PATCH 0214/1705] [channel9] reduce response size and update tests --- youtube_dl/extractor/channel9.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 81108e704..09cacf6d3 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -32,7 +32,7 @@ class Channel9IE(InfoExtractor): 'upload_date': '20130828', 'session_code': 'KOS002', 'session_room': 'Arena 1A', - 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'], + 'session_speakers': 'count:5', }, }, { 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', @@ -64,15 +64,15 @@ class Channel9IE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', - 'info_dict': { - 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', - 'title': 'Channel 9', - }, - 'playlist_mincount': 100, }, { 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'info_dict': { + 'id': 'Events/DEVintersection/DEVintersection-2016', + 'title': 'DEVintersection 2016 Orlando Sessions', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', 'only_matching': True, }, { 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', @@ -112,11 +112,11 @@ class Channel9IE(InfoExtractor): episode_data), content_path) content_id = episode_data['contentId'] is_session = '/Sessions(' in episode_data['api'] - content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + 
'?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' if is_session: - content_url += '?$expand=Speakers' + content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' else: - content_url += '?$expand=Authors' + content_url += 'Authors,Body&$expand=Authors' content_data = self._download_json(content_url, content_id) title = content_data['Title'] @@ -210,7 +210,7 @@ class Channel9IE(InfoExtractor): 'id': content_id, 'title': title, 'description': clean_html(content_data.get('Description') or content_data.get('Body')), - 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), + 'thumbnail': content_data.get('VideoPlayerPreviewImage'), 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), 'timestamp': parse_iso8601(content_data.get('PublishedDate')), 'avg_rating': int_or_none(content_data.get('Rating')), From 88a7a9089a0f3ccdd5e0e6f10b529652a24cbc7e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Nov 2019 17:22:54 +0100 Subject: [PATCH 0215/1705] [abcotvs] relax _VALID_URL regex and improve metadata extraction(closes #18014) --- youtube_dl/extractor/abcotvs.py | 79 ++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 03b92a39c..0bc69a64f 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -4,29 +4,30 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + dict_get, int_or_none, - parse_iso8601, + try_get, ) class ABCOTVSIE(InfoExtractor): IE_NAME = 'abcotvs' IE_DESC = 'ABC Owned Television Stations' - _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _VALID_URL = 
r'https?://(?Pabc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', 'info_dict': { - 'id': '472581', + 'id': '472548', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates vintage synthesizers', + 'title': 'East Bay museum celebrates synthesized music', 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421123075, + 'timestamp': 1421118520, 'upload_date': '20150113', - 'uploader': 'Jonathan Bloom', }, 'params': { # m3u8 download @@ -37,39 +38,63 @@ class ABCOTVSIE(InfoExtractor): 'url': 'http://abc7news.com/472581', 'only_matching': True, }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] - webpage = self._download_webpage(url, display_id) + data = self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] - m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] - - formats = self._extract_m3u8_formats(m3u8, 
display_id, 'mp4') + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) self._sort_formats(formats) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'
    \s*
    ', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) - - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + domain, path, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + modules = self._call_api( + site, 'content/pages', display_id, { + 'includeContent': 'true', + 'moduleOffset': 1, + 'path': path, + 'site': site, + })['modules'] + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'ie_key': 'ViewLiftEmbed', + } From 51c7f40c83a12f9dc0fce0b9e5102a0c13467b6a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 27 Jan 2020 23:37:29 +0100 Subject: [PATCH 0313/1705] [vimeo] fix album extraction(closes #23864) --- youtube_dl/extractor/vimeo.py | 68 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index baa46d5f3..f378aa283 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -841,33 +841,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._TITLE or self._html_search_regex( self._TITLE_RE, webpage, 'list title', fatal=False) - def _login_list_password(self, page_url, list_id, webpage): - login_form = self._search_regex( - 
r'(?s)]+?id="pw_form"(.*?)', - webpage, 'login form', default=None) - if not login_form: - return webpage - - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) - fields = self._hidden_inputs(login_form) - token, vuid = self._extract_xsrft_and_vuid(webpage) - fields['token'] = token - fields['password'] = password - post = urlencode_postdata(fields) - password_path = self._search_regex( - r'action="([^"]+)"', login_form, 'password URL') - password_url = compat_urlparse.urljoin(page_url, password_path) - password_request = sanitized_Request(password_url, post) - password_request.add_header('Content-type', 'application/x-www-form-urlencoded') - self._set_vimeo_cookie('vuid', vuid) - self._set_vimeo_cookie('xsrft', token) - - return self._download_webpage( - password_request, list_id, - 'Verifying the password', 'Wrong password') - def _title_and_entries(self, list_id, base_url): for pagenum in itertools.count(1): page_url = self._page_url(base_url, pagenum) @@ -876,7 +849,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): 'Downloading page %s' % pagenum) if pagenum == 1: - webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) # Try extracting href first since not all videos are available via @@ -923,7 +895,7 @@ class VimeoUserIE(VimeoChannelIE): _BASE_URL_TEMPL = 'https://vimeo.com/%s' -class VimeoAlbumIE(VimeoChannelIE): +class VimeoAlbumIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:album' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'
  • ', webpage, 'video upload_date', - fatal=False)) - view_count_str = self._html_search_regex( - r'
  • Hits: ([0-9]+?)
  • ', webpage, 'video view_count', - fatal=False) - comment_count_str = self._html_search_regex( - r'

    ([0-9]+?) comments

    ', webpage, 'video comment_count', - fatal=False) - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'view_count': int_or_none(view_count_str), - 'comment_count': int_or_none(comment_count_str), - } From de1121d749089ea62d17aadb43493c00492bb37a Mon Sep 17 00:00:00 2001 From: jxu <7989982+jxu@users.noreply.github.com> Date: Tue, 28 Jan 2020 05:20:19 -0500 Subject: [PATCH 0329/1705] [YoutubeDL] Fix playlist entry indexing with --playlist-items (closes #10591, closes #10622) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f5cb46308..b09cb0a79 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -990,7 +990,7 @@ class YoutubeDL(object): 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': i + playliststart, + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), From e0abaab29332f22f438617a7c425b1b2ee1c14d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Feb 2020 03:30:06 +0700 Subject: [PATCH 0330/1705] [test_YoutubeDL] Fix get_ids --- test/test_YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ce9666171..0769e2ed2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -818,8 +818,9 @@ class TestYoutubeDL(unittest.TestCase): def get_ids(params): ydl = YDL(params) - # make a copy because the dictionary can be modified - ydl.process_ie_result(playlist.copy()) + # make a deep copy 
because the dictionary and nested entries + # can be modified + ydl.process_ie_result(copy.deepcopy(playlist)) return [int(v['id']) for v in ydl.downloaded_info_dicts] result = get_ids({}) From 4e9e1e240dea1c37c95ceef5e52c15534d56ca7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Feb 2020 03:31:31 +0700 Subject: [PATCH 0331/1705] [test_YoutubeDL] Add tests for #10591 (closes #23873) --- test/test_YoutubeDL.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 0769e2ed2..1e204e551 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -816,12 +816,15 @@ class TestYoutubeDL(unittest.TestCase): 'webpage_url': 'http://example.com', } - def get_ids(params): + def get_downloaded_info_dicts(params): ydl = YDL(params) # make a deep copy because the dictionary and nested entries # can be modified ydl.process_ie_result(copy.deepcopy(playlist)) - return [int(v['id']) for v in ydl.downloaded_info_dicts] + return ydl.downloaded_info_dicts + + def get_ids(params): + return [int(v['id']) for v in get_downloaded_info_dicts(params)] result = get_ids({}) self.assertEqual(result, [1, 2, 3, 4]) @@ -853,6 +856,22 @@ class TestYoutubeDL(unittest.TestCase): result = get_ids({'playlist_items': '2-4,3-4,3'}) self.assertEqual(result, [2, 3, 4]) + # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 + # @{ + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + + result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'}) + self.assertEqual(result[0]['playlist_index'], 2) + self.assertEqual(result[1]['playlist_index'], 3) + self.assertEqual(result[2]['playlist_index'], 4) + + result = get_downloaded_info_dicts({'playlist_items': '4,2'}) + self.assertEqual(result[0]['playlist_index'], 4) + 
self.assertEqual(result[1]['playlist_index'], 2) + # @} + def test_urlopen_no_file_protocol(self): # see https://github.com/ytdl-org/youtube-dl/issues/8227 ydl = YDL() From f6052ec923fbe4270a3733aeb8d4a9d5727ef80f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Feb 2020 03:49:29 +0700 Subject: [PATCH 0332/1705] [24video] Add support for porn.24video.net (closes #23779, closes #23784) --- youtube_dl/extractor/twentyfourvideo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 2830c212e..74d14049b 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -17,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?P - (?:(?:www|porno)\.)?24video\. + (?:(?:www|porno?)\.)?24video\. (?:net|me|xxx|sexy?|tube|adult|site|vip) )/ (?: @@ -62,6 +62,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://www.24video.vip/video/view/1044982', 'only_matching': True, + }, { + 'url': 'https://porn.24video.net/video/2640421-vsya-takay', + 'only_matching': True, }] def _real_extract(self, url): From 7bf27721d61dadb2e4f2f6215cbc9d66ba3fd708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Feb 2020 05:35:55 +0700 Subject: [PATCH 0333/1705] [npr] Add support for streams (closes #24042) --- youtube_dl/extractor/npr.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index a5e8baa7e..53acc6e57 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, + url_or_none, ) @@ -48,6 +49,10 @@ class NprIE(InfoExtractor): }, }], 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # multimedia, no formats, stream + 'url': 
'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', + 'only_matching': True, }] def _real_extract(self, url): @@ -95,6 +100,17 @@ class NprIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id), }) + for stream_id, stream_entry in media.get('stream', {}).items(): + if not isinstance(stream_entry, dict): + continue + if stream_id != 'hlsUrl': + continue + stream_url = url_or_none(stream_entry.get('$text')) + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + stream_url, stream_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) entries.append({ From 0d718db6233d1ac93df2fe6cf9c3fb908ad75b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 Feb 2020 22:40:44 +0700 Subject: [PATCH 0334/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/ChangeLog b/ChangeLog index 94aa9f327..42f04fa9b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,37 @@ +version + +Core +* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, + #10622) +* [update] Fix updating via symlinks (#23991) ++ [compat] Introduce compat_realpath (#23991) + +Extractors ++ [npr] Add support for streams (#24042) ++ [24video] Add support for porn.24video.net (#23779, #23784) +- [jpopsuki] Remove extractor (#23858) +* [nova] Improve extraction (#23690) +* [nova:embed] Improve (#23690) +* [nova:embed] Fix extraction (#23672) ++ [abc:iview] Add support for 720p (#22907, #22921) +* [nytimes] Improve format sorting (#24010) ++ [toggle] Add support for mewatch.sg (#23895, #23930) +* [thisoldhouse] Fix extraction (#23951) ++ [popcorntimes] Add support for popcorntimes.tv (#23949) +* [sportdeutschland] Update to new API +* [twitch:stream] Lowercase channel id for stream request (#23917) +* [tv5mondeplus] Fix extraction (#23907, #23911) +* [tva] Relax URL regular expression (#23903) +* [vimeo] 
Fix album extraction (#23864) +* [viewlift] Improve extraction + * Fix extraction (#23851) + + Add support for authentication + + Add support for more domains +* [svt] Fix series extraction (#22297) +* [svt] Fix article extraction (#22897, #22919) +* [soundcloud] Imporve private playlist/set tracks extraction (#3707) + + version 2020.01.24 Extractors From 117ba9e9df641655f00509b34591e0a6c44395c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 Feb 2020 22:43:42 +0700 Subject: [PATCH 0335/1705] release 2020.02.16 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 +++--- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 73f46ec04..716768242 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.02.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.24 + [debug] youtube-dl version 2020.02.16 
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7e3c9f669..3fd6a0bd6 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.02.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index b9bb3bd11..d160fcce9 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.02.16** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 265ea80c1..f97644f65 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.02.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've 
checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.24 + [debug] youtube-dl version 2020.02.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index e71778a3d..dedef6d53 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.24** +- [ ] I've verified that I'm running youtube-dl version **2020.02.16** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 42f04fa9b..a6e2c3c19 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.02.16 Core * [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e9a8cc27a..02bc088ab 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -389,7 +389,6 @@ - **JeuxVideo** - **Joj** - **Jove** - - **jpopsuki.tv** - **JWPlatform** - **Kakao** - **Kaltura** @@ -663,6 +662,7 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** + - **Popcorntimes** - **PopcornTV** - **PornCom** - **PornerBros** @@ -1004,8 +1004,8 @@ - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - - **ViewLift** - - 
**ViewLiftEmbed** + - **viewlift** + - **viewlift:embed** - **Viidea** - **viki** - **viki:channel** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fa6f7289a..0163333ac 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.01.24' +__version__ = '2020.02.16' From 97c822b3d5221d612748e1b589c87603bbae8a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Str=C3=B6m?= Date: Tue, 18 Feb 2020 19:02:05 +0100 Subject: [PATCH 0336/1705] [tv2dk:bornholm:play] Fix extraction (#24076) --- youtube_dl/extractor/tv2dk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 611fdc0c6..8bda9348d 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -106,7 +106,7 @@ class TV2DKBornholmPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, + 'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id, data=json.dumps({ 'playlist_id': video_id, 'serienavn': '', From 5d9f6cbc5afa033b6f1cfd2abe4327e366da2ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Feb 2020 04:33:29 +0700 Subject: [PATCH 0337/1705] [imdb] Fix extraction (closes #23443) --- youtube_dl/extractor/imdb.py | 58 ++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 436759da5..a31301985 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import base64 +import json import re from .common import InfoExtractor @@ -8,6 +10,7 @@ from ..utils import ( mimetype2ext, parse_duration, qualities, + try_get, url_or_none, ) @@ -15,15 +18,16 @@ from ..utils import ( class 
ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'No. 2 from Ice Age: Continental Drift (2012)', + 'title': 'No. 2', 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', + 'duration': 152, } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -47,21 +51,23 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.imdb.com/videoplayer/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage, - 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id] - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'(.+?)', webpage, 'title', fatal=False) or video_metadata['title'] + + data = self._download_json( + 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, + query={ + 'key': base64.b64encode(json.dumps({ + 'type': 'VIDEO_PLAYER', + 'subType': 'FORCE_LEGACY', + 'id': 'vi%s' % video_id, + }).encode()).decode(), + })[0] quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] - for encoding in video_metadata.get('encodings', []): + for encoding in data['videoLegacyEncodings']: if not encoding or not isinstance(encoding, dict): continue - video_url = url_or_none(encoding.get('videoUrl')) + video_url = url_or_none(encoding.get('url')) if not video_url: continue ext = mimetype2ext(encoding.get( @@ -69,7 +75,7 @@ class ImdbIE(InfoExtractor): if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', 
fatal=False)) + preference=1, m3u8_id='hls', fatal=False)) continue format_id = encoding.get('definition') formats.append({ @@ -80,13 +86,33 @@ class ImdbIE(InfoExtractor): }) self._sort_formats(formats) + webpage = self._download_webpage( + 'https://www.imdb.com/video/vi' + video_id, video_id) + video_metadata = self._parse_json(self._search_regex( + r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, + 'video metadata'), video_id) + + video_info = video_metadata.get('VIDEO_INFO') + if video_info and isinstance(video_info, dict): + info = try_get( + video_info, lambda x: x[list(video_info.keys())[0]][0], dict) + else: + info = {} + + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage) or self._html_search_regex( + r'(.+?)', webpage, 'title', + default=None) or info['videoTitle'] + return { 'id': video_id, 'title': title, + 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': video_metadata.get('description'), - 'thumbnail': video_metadata.get('slate', {}).get('url'), - 'duration': parse_duration(video_metadata.get('duration')), + 'description': info.get('videoDescription'), + 'thumbnail': url_or_none(try_get( + video_metadata, lambda x: x['videoSlate']['source'])), + 'duration': parse_duration(info.get('videoRuntime')), } From fda6d237a5b664cc8a9a45562d4113c51fd0280d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Feb 2020 06:47:11 +0700 Subject: [PATCH 0338/1705] [wistia] Add support for multiple generic embeds (closes #8347, closes #11385) --- youtube_dl/extractor/generic.py | 17 +++++++++-------- youtube_dl/extractor/wistia.py | 31 ++++++++++++++++--------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3c002472f..04c026984 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2537,14 +2537,15 @@ class GenericIE(InfoExtractor): dailymail_urls, video_id, video_title, 
ie=DailyMailIE.ie_key()) # Look for embedded Wistia player - wistia_url = WistiaIE._extract_url(webpage) - if wistia_url: - return { - '_type': 'url_transparent', - 'url': self._proto_relative_url(wistia_url), - 'ie_key': WistiaIE.ie_key(), - 'uploader': video_uploader, - } + wistia_urls = WistiaIE._extract_urls(webpage) + if wistia_urls: + playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) + for entry in playlist['entries']: + entry.update({ + '_type': 'url_transparent', + 'uploader': video_uploader, + }) + return playlist # Look for SVT player svt_url = SVTIE._extract_url(webpage) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 085514d47..168e5e901 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -45,22 +45,23 @@ class WistiaIE(InfoExtractor): # https://wistia.com/support/embed-and-share/video-on-your-website @staticmethod def _extract_url(webpage): - match = re.search( - r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage) - if match: - return unescapeHTML(match.group('url')) + urls = WistiaIE._extract_urls(webpage) + return urls[0] if urls else None - match = re.search( - r'''(?sx) - ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
- ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]{10})\b.*?\2 - ''', webpage) - if match: - return 'wistia:%s' % match.group('id') - - match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage) - if match: - return 'wistia:%s' % match.group('id') + @staticmethod + def _extract_urls(webpage): + urls = [] + for match in re.finditer( + r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): + urls.append(unescapeHTML(match.group('url'))) + for match in re.finditer( + r'''(?sx) + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]{10})\b.*?\2 + ''', webpage): + urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P[a-z0-9]{10})', webpage): + urls.append('wistia:%s' % match.group('id')) + return urls def _real_extract(self, url): video_id = self._match_id(url) From 00d798b7c25f0a03adb252c882df46abc8c23b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Feb 2020 06:49:45 +0700 Subject: [PATCH 0339/1705] [teachable] Add support for multiple videos per lecture (closes #24101) --- youtube_dl/extractor/teachable.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 6b7f13b43..cca89a4a8 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -160,8 +160,8 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_url = WistiaIE._extract_url(webpage) - if not wistia_url: + wistia_urls = WistiaIE._extract_urls(webpage) + if not wistia_urls: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', r'>\s*Lecture contents locked', @@ -174,12 +174,14 @@ class TeachableIE(TeachableBaseIE): title = self._og_search_title(webpage, default=None) - 
return { + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, - } + } for wistia_url in wistia_urls] + + return self.playlist_result(entries, video_id, title) class TeachableCourseIE(TeachableBaseIE): From bee6451fe88dbe8d983150cb17565d1c3e9024f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 24 Feb 2020 04:47:56 +0700 Subject: [PATCH 0340/1705] [pornhd] Fix extraction (closes #24128) --- youtube_dl/extractor/pornhd.py | 36 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 27d65d4b9..c6052ac9f 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, int_or_none, js_to_json, + merge_dicts, urljoin, ) @@ -27,23 +28,22 @@ class PornHdIE(InfoExtractor): 'view_count': int, 'like_count': int, 'age_limit': 18, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { - # removed video 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'md5': '956b8ca569f7f4d8ec563e2c41598441', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', - 'title': 'Sierra loves doing laundry', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 'age_limit': 18, }, - 'skip': 'Not available anymore', }] def _real_extract(self, url): @@ -61,7 +61,13 @@ class PornHdIE(InfoExtractor): r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) + info = {} if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: message = 
self._html_search_regex( r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class="description"[^>]*>(?P[^<]+)]+class=["\']video-description[^>]+>(?P.+?)', + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, - 'thumbnail', fatal=False, group='url') + 'thumbnail', default=None, group='url') like_count = int_or_none(self._search_regex( - (r'(\d+)\s*]+>(?: |\s)*\blikes', + (r'(\d+)\s*likes', + r'(\d+)\s*]+>(?: |\s)*\blikes', r'class=["\']save-count["\'][^>]*>\s*(\d+)'), webpage, 'like count', fatal=False)) - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -106,4 +118,4 @@ class PornHdIE(InfoExtractor): 'like_count': like_count, 'formats': formats, 'age_limit': 18, - } + }) From b76f0e58f750fd420ac3078c7183b2de66da562c Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Sat, 29 Feb 2020 18:33:09 +0900 Subject: [PATCH 0341/1705] [options] Remove duplicate short option -v for --version (#24162) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 1ffabc62b..8826b382c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -134,7 +134,7 @@ def parseOpts(overrideArguments=None): action='help', help='Print this help text and exit') general.add_option( - '-v', '--version', + '--version', action='version', help='Print program version and exit') general.add_option( From e2f8bf5888274b95513b430e0f20261120699b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 17:29:30 +0700 Subject: [PATCH 0342/1705] [extractor/common] Convert ISM manifest to unicode before processing on python 2 (#24152) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..ab7d473d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ 
-2341,7 +2341,7 @@ class InfoExtractor(object): return [] ism_doc, urlh = res - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats(ism_doc, compat_str(urlh.geturl()), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ From fca6dba8b80286ae6d3ca0a60c4799c220a52650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 19:08:44 +0700 Subject: [PATCH 0343/1705] [YoutubeDL] Force redirect URL to unicode on python 2 --- youtube_dl/YoutubeDL.py | 4 +++- youtube_dl/utils.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b09cb0a79..19370f62b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -92,6 +92,7 @@ from .utils import ( YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, + YoutubeDLRedirectHandler, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2343,6 +2344,7 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() data_handler = compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the @@ -2356,7 +2358,7 @@ class YoutubeDL(object): file_handler.file_open = file_open opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a..8ccf25489 100644 --- a/youtube_dl/utils.py 
+++ b/youtube_dl/utils.py @@ -2795,6 +2795,15 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): + if sys.version_info[0] < 3: + def redirect_request(self, req, fp, code, msg, headers, newurl): + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + + def extract_timezone(date_str): m = re.search( r'^.{8,}?(?PZ$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', From 7947a1f7dbc6ba47a6f22ab67fd330e57c0ef87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 19:17:27 +0700 Subject: [PATCH 0344/1705] Remove no longer needed compat_str around geturl --- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/eporner.py | 3 +-- youtube_dl/extractor/generic.py | 6 +++--- youtube_dl/extractor/lecturio.py | 3 +-- youtube_dl/extractor/linuxacademy.py | 5 ++--- youtube_dl/extractor/mediaset.py | 3 +-- youtube_dl/extractor/mediasite.py | 2 +- youtube_dl/extractor/platzi.py | 2 +- youtube_dl/extractor/safari.py | 5 ++--- youtube_dl/extractor/teachable.py | 3 +-- youtube_dl/extractor/tumblr.py | 3 +-- youtube_dl/extractor/vimeo.py | 2 +- 12 files changed, 16 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ab7d473d0..eaae5e484 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2341,7 +2341,7 @@ class InfoExtractor(object): return [] ism_doc, urlh = res - return self._parse_ism_formats(ism_doc, compat_str(urlh.geturl()), ism_id) + return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): """ diff --git 
a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index c050bf9df..fe42821c7 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( encode_base_n, ExtractorError, @@ -55,7 +54,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(compat_str(urlh.geturl())) + video_id = self._match_id(urlh.geturl()) hash = self._search_regex( r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 04c026984..d1ec56be9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2287,7 +2287,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = compat_str(head_response.geturl()) + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2387,12 +2387,12 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=compat_str(full_response.geturl())), + xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, - mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], + mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py index 6ed7da4ab..1b2dcef46 100644 --- a/youtube_dl/extractor/lecturio.py +++ b/youtube_dl/extractor/lecturio.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( 
clean_html, determine_ext, @@ -36,7 +35,7 @@ class LecturioBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in compat_str(url_handle.geturl()) + return self._LOGIN_URL not in url_handle.geturl() # Already logged in if is_logged(urlh): diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py index a78c6556e..23ca965d9 100644 --- a/youtube_dl/extractor/linuxacademy.py +++ b/youtube_dl/extractor/linuxacademy.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, - compat_str, ) from ..utils import ( ExtractorError, @@ -99,7 +98,7 @@ class LinuxAcademyIE(InfoExtractor): 'sso': 'true', }) - login_state_url = compat_str(urlh.geturl()) + login_state_url = urlh.geturl() try: login_page = self._download_webpage( @@ -129,7 +128,7 @@ class LinuxAcademyIE(InfoExtractor): }) access_token = self._search_regex( - r'access_token=([^=&]+)', compat_str(urlh.geturl()), + r'access_token=([^=&]+)', urlh.geturl(), 'access token') self._download_webpage( diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 027a790b8..933df1495 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -6,7 +6,6 @@ import re from .theplatform import ThePlatformBaseIE from ..compat import ( compat_parse_qs, - compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -114,7 +113,7 @@ class MediasetIE(ThePlatformBaseIE): continue urlh = ie._request_webpage( embed_url, video_id, note='Following embed URL redirect') - embed_url = compat_str(urlh.geturl()) + embed_url = urlh.geturl() program_guid = _program_guid(_qs(embed_url)) if program_guid: entries.append(embed_url) diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 694a264d6..d6eb15740 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -129,7 
+129,7 @@ class MediasiteIE(InfoExtractor): query = mobj.group('query') webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() # XXX: might have also extracted UrlReferrer and QueryString from the html service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py index 602207beb..23c8256b5 100644 --- a/youtube_dl/extractor/platzi.py +++ b/youtube_dl/extractor/platzi.py @@ -46,7 +46,7 @@ class PlatziBaseIE(InfoExtractor): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in compat_str(urlh.geturl()): + if 'platzi.com/login' not in urlh.geturl(): return login_error = self._webpage_read_content( diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 4942437c7..2cc665122 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_str, compat_urlparse, ) from ..utils import ( @@ -39,13 +38,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in compat_str(urlh.geturl()) + return 'learning.oreilly.com/home/' in urlh.geturl() if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index cca89a4a8..4316a6962 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from .wistia import WistiaIE -from ..compat import compat_str from ..utils import ( clean_html, 
ExtractorError, @@ -58,7 +57,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = compat_str(urlh.geturl()) + login_url = urlh.geturl() login_form = self._hidden_inputs(login_page) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index edbb0aa69..ae584ad69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -151,7 +150,7 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): raise ExtractorError( 'This Tumblr may contain sensitive media. ' diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f378aa283..1da4ced96 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -591,7 +591,7 @@ class VimeoIE(VimeoBaseInfoExtractor): # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = compat_str(urlh.geturl()) + redirect_url = urlh.geturl() except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() From 886d9859590c423b1d036d4a2e6f43ee639560d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 21:58:22 +0700 Subject: [PATCH 0345/1705] [youjizz] Fix extraction (closes #24181) --- youtube_dl/extractor/youjizz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index dff69fcb7..88aabd272 100644 --- 
a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -44,7 +44,7 @@ class YouJizzIE(InfoExtractor): encodings = self._parse_json( self._search_regex( - r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', default='[]'), video_id, fatal=False) for encoding in encodings: From ea17979d83e6dcd61de0901fc55e2672d577e5a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 22:08:43 +0700 Subject: [PATCH 0346/1705] [test_subtitles] Remove obsolete test --- test/test_subtitles.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7d57a628e..17aaaf20d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,7 +26,6 @@ from youtube_dl.extractor import ( ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, - FunnyOrDieIE, DemocracynowIE, ) @@ -322,18 +321,6 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') -class TestFunnyOrDieSubtitles(BaseTestSubtitles): - url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' - IE = FunnyOrDieIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') - - class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE From b4cbdbd4b37a496c6539694bebd0ec50ef02c864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 23:06:36 +0700 Subject: [PATCH 0347/1705] [zdf:channel] Fix tests --- youtube_dl/extractor/zdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py 
index 145c123a4..656864b2e 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -244,14 +244,14 @@ class ZDFChannelIE(ZDFBaseIE): 'id': 'das-aktuelle-sportstudio', 'title': 'das aktuelle sportstudio | ZDF', }, - 'playlist_count': 21, + 'playlist_mincount': 23, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { 'id': 'planet-e', 'title': 'planet e.', }, - 'playlist_count': 4, + 'playlist_mincount': 50, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, From 278355bae4b4aed9cd3bf09bdf251fcb7396e303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 23:09:13 +0700 Subject: [PATCH 0348/1705] [zapiks] Fix test --- youtube_dl/extractor/zapiks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index bacb82eee..f6496f516 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -29,7 +29,6 @@ class ZapiksIE(InfoExtractor): 'timestamp': 1359044972, 'upload_date': '20130124', 'view_count': int, - 'comment_count': int, }, }, { From e88b450771027e3a36e7b3721e0bc1f105331cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Feb 2020 23:51:34 +0700 Subject: [PATCH 0349/1705] [xtube] Fix metadata extraction (closes #21073, closes #22455) --- youtube_dl/extractor/xtube.py | 46 +++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index c6c0b3291..994cf1d0a 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -47,7 +47,7 @@ class XTubeIE(InfoExtractor): 'display_id': 'A-Super-Run-Part-1-YT', 'ext': 'flv', 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', 'uploader': 'tshirtguy59', 'duration': 579, 'view_count': int, @@ -87,10 
+87,24 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - sources = self._parse_json(self._search_regex( - r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', - webpage, 'sources', group='sources'), video_id, - transform_source=js_to_json) + title, thumbnail, duration = [None] * 3 + + config = self._parse_json(self._search_regex( + r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + default='{}'), video_id, transform_source=js_to_json, fatal=False) + if config: + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') + + if isinstance(sources, dict): + sources = self._parse_json(self._search_regex( + r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -102,20 +116,25 @@ class XTubeIE(InfoExtractor): self._remove_duplicate_formats(formats) self._sort_formats(formats) - title = self._search_regex( - (r'

    \s*(?P[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), - webpage, 'title', group='title') - description = self._search_regex( + if not title: + title = self._search_regex( + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, default=None) or self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) uploader = self._search_regex( (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', r'<span[^>]+class="nickname"[^>]*>([^<]+)'), webpage, 'uploader', fatal=False) - duration = parse_duration(self._search_regex( - r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', - webpage, 'duration', fatal=False)) + if not duration: + duration = parse_duration(self._search_regex( + r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( - r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>', + (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', + r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'), webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( r'>Comments? 
\(([\d,\.]+)\)<', @@ -126,6 +145,7 @@ class XTubeIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'uploader': uploader, 'duration': duration, 'view_count': view_count, From 838f051c4b9a0c974da32e79c011bbf85fa186fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 29 Feb 2020 23:51:56 +0700 Subject: [PATCH 0350/1705] [xtube:user] Fix test --- youtube_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 994cf1d0a..47caec1de 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -164,7 +164,7 @@ class XTubeUserIE(InfoExtractor): 'id': 'greenshowers-4056496', 'age_limit': 18, }, - 'playlist_mincount': 155, + 'playlist_mincount': 154, } def _real_extract(self, url): From f8cbd8c96367871db974621d5a92b324acb57938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Mar 2020 01:04:51 +0700 Subject: [PATCH 0351/1705] [telecinco] Fix extraction (refs #24195) --- youtube_dl/extractor/telecinco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d37e1b055..d79ab1e82 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -135,7 +135,7 @@ class TelecincoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})', + r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = article.get('title') description = clean_html(article.get('leadParagraph')) From 6d475d01d8f1fc9f5e92ac2caa986453bee95ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: 
Sun, 1 Mar 2020 03:09:19 +0700 Subject: [PATCH 0352/1705] [telecinco] Add support for article opening videos --- youtube_dl/extractor/telecinco.py | 44 ++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d79ab1e82..9ba3da341 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, int_or_none, str_or_none, + try_get, urljoin, ) @@ -24,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'info_dict': { 'id': '1876350223', 'title': 'Bacalao con kokotxas al pil-pil', - 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', + 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ 'md5': 'adb28c37238b675dad0f042292f209a7', @@ -55,6 +56,26 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, + }, { + # video in opening's content + 'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html', + 'info_dict': { + 'id': '2907195140', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'description': 'md5:73f340a7320143d37ab895375b2bf13a', + }, + 'playlist': [{ + 'md5': 'adb28c37238b675dad0f042292f209a7', + 'info_dict': { + 'id': 'TpI2EttSDAReWpJ1o0NVh2', + 'ext': 'mp4', + 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"', + 'duration': 1015, + }, + }], + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, @@ -138,14 +159,25 @@ class TelecincoIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})', webpage, 'article'), display_id)['article'] title = 
article.get('title') - description = clean_html(article.get('leadParagraph')) + description = clean_html(article.get('leadParagraph')) or '' if article.get('editorialType') != 'VID': entries = [] - for p in article.get('body', []): - content = p.get('content') - if p.get('type') != 'video' or not content: + body = [article.get('opening')] + body.extend(try_get(article, lambda x: x['body'], list) or []) + for p in body: + if not isinstance(p, dict): continue - entries.append(self._parse_content(content, url)) + content = p.get('content') + if not content: + continue + type_ = p.get('type') + if type_ == 'paragraph': + content_str = str_or_none(content) + if content_str: + description += content_str + continue + if type_ == 'video' and isinstance(content, dict): + entries.append(self._parse_content(content, url)) return self.playlist_result( entries, str_or_none(article.get('id')), title, description) content = article['opening']['content'] From 50d19895a1a3515d48dc952bf9280cbdc4f405f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Mar 2020 19:22:09 +0700 Subject: [PATCH 0353/1705] [franceculture] Fix extraction (closes #24204) --- youtube_dl/extractor/franceculture.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b8fa17588..306b45fc9 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -31,7 +31,13 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_data = extract_attributes(self._search_regex( - r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)', + r'''(?sx) + (?: + </h1>| + <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> + ).*? 
+ (<button[^>]+data-asset-source="[^"]+"[^>]+>) + ''', webpage, 'video data')) video_url = video_data['data-asset-source'] From b274e48d56cced250f5abbc88b4cda2e5b4338d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Mar 2020 20:04:48 +0700 Subject: [PATCH 0354/1705] [xhamster] Fix extraction (closes #24205) --- youtube_dl/extractor/xhamster.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index a5b94d279..0f7be6a7d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -113,7 +113,7 @@ class XHamsterIE(InfoExtractor): display_id = mobj.group('display_id') or mobj.group('display_id_2') desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + webpage, urlh = self._download_webpage_handle(desktop_url, video_id) error = self._html_search_regex( r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -161,6 +161,9 @@ class XHamsterIE(InfoExtractor): 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), 'filesize': filesize, + 'http_headers': { + 'Referer': urlh.geturl(), + }, }) self._sort_formats(formats) From 170f5b7c27f2e68b0a0a4e799647cbe780399fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Mar 2020 20:09:05 +0700 Subject: [PATCH 0355/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index a6e2c3c19..9002a26e3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version <unreleased> + +Core +* [YoutubeDL] Force redirect URL to unicode on python 2 +- [options] Remove duplicate short option -v for --version (#24162) + +Extractors +* [xhamster] Fix extraction (#24205) +* [franceculture] Fix extraction (#24204) ++ [telecinco] Add support for article opening videos +* 
[telecinco] Fix extraction (#24195) +* [xtube] Fix metadata extraction (#21073, #22455) +* [youjizz] Fix extraction (#24181) +- Remove no longer needed compat_str around geturl +* [pornhd] Fix extraction (#24128) ++ [teachable] Add support for multiple videos per lecture (#24101) ++ [wistia] Add support for multiple generic embeds (#8347, 11385) +* [imdb] Fix extraction (#23443) +* [tv2dk:bornholm:play] Fix extraction (#24076) + + version 2020.02.16 Core From 669625a32cce386d0c19b5a0eea03707570a60d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Mar 2020 20:11:32 +0700 Subject: [PATCH 0356/1705] release 2020.03.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 716768242..0721d49c3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.02.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.02.16 + [debug] youtube-dl version 2020.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 3fd6a0bd6..1e67f724d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.02.16. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d160fcce9..1290b55c4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.02.16. If it's not, see https://yt-dl.org/update on how to update. 
Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index f97644f65..3f006bef8 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.02.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.02.16 + [debug] youtube-dl version 2020.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dedef6d53..202bb9b2f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.02.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.02.16** +- [ ] I've verified that I'm running youtube-dl version **2020.03.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 9002a26e3..1a676f4f2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.03.01 Core * [YoutubeDL] Force redirect URL to unicode on python 2 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0163333ac..fabc1e543 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.02.16' +__version__ = '2020.03.01' From 1c45ff5572e0844b2ad26c2c0d477edc81e6b5b0 Mon Sep 17 00:00:00 2001 From: tsia <github@tsia.de> Date: Mon, 2 Mar 2020 19:27:40 +0100 Subject: [PATCH 0357/1705] [vimeo] Fix subtitles URLs (#24209) --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 1da4ced96..8cd611e1e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -33,6 +33,7 @@ from ..utils import ( unified_timestamp, unsmuggle_url, urlencode_postdata, + urljoin, unescapeHTML, ) @@ -191,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 'https://vimeo.com' + tt['url'], + 'url': urljoin('https://vimeo.com', tt['url']), }] thumbnails = [] From 3b5399ce0f85f46ce856d47c725a437c72dcce6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Mar 2020 01:40:48 +0700 Subject: [PATCH 0358/1705] [servus] Add support for new URL schema (closes #23475, 
closes #23583, closes #24142) --- youtube_dl/extractor/servus.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index e579d42cf..9401bf2cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -7,9 +7,18 @@ from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P<id>[aA]{2}-\w+|\d+-\d+) + ''' _TESTS = [{ - 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', @@ -18,6 +27,10 @@ class ServusIE(InfoExtractor): 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, From 0e30a7b9732dbecc63527df6037c5fbea964b1fd Mon Sep 17 00:00:00 2001 From: jxu <7989982+jxu@users.noreply.github.com> Date: Mon, 2 Mar 2020 13:46:00 -0500 Subject: [PATCH 0359/1705] [youtube:playlist] Fix tests (closes #23872) (#23885) --- youtube_dl/extractor/youtube.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eacaa5ecd..e06290427 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2495,20 +2495,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = 
_VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2518,7 +2521,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', From ac379fa236c01ed1d3601f013d755066b92709a4 Mon Sep 17 00:00:00 2001 From: 3risian <59593325+3risian@users.noreply.github.com> Date: Tue, 7 Jan 2020 18:34:51 +1100 Subject: [PATCH 0360/1705] [peertube] Improve extraction --- youtube_dl/extractor/peertube.py | 87 +++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d3a83ea2b..307712196 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -8,6 +8,7 @@ from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, + str_or_none, try_get, unified_timestamp, url_or_none, @@ -423,26 +424,30 @@ class PeerTubeIE(InfoExtractor): (?P<id>%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ - 'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', - 'md5': '80f24ff364cc9d333529506a263e7feb', + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', 'info_dict': { - 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', - 'title': 'wow', - 'description': 'wow such video, so gif', + 'title': 'What is PeerTube?', + 'description': '**[Want to help to translate this video?](https://weblate.framasoft.org/projects/what-is-peertube-video/)**\r\n\r\n**Take back the control of your videos! 
[#JoinPeertube](https://joinpeertube.org)**\r\n*A decentralized video hosting network, based on free/libre software!*\r\n\r\n**Animation Produced by:** [LILA](https://libreart.info) - [ZeMarmot Team](https://film.zemarmot.net)\r\n*Directed by* Aryeom\r\n*Assistant* Jehan\r\n**Licence**: [CC-By-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)\r\n\r\n**Sponsored by** [Framasoft](https://framasoft.org)\r\n\r\n**Music**: [Red Step Forward](http://play.dogmazic.net/song.php?song_id=52491) - CC-By Ken Bushima\r\n\r\n**Movie Clip**: [Caminades 3: Llamigos](http://www.caminandes.com/) CC-By Blender Institute\r\n\r\n**Video sources**: https://gitlab.gnome.org/Jehan/what-is-peertube/', 'thumbnail': r're:https?://.*\.(?:jpg|png)', - 'timestamp': 1519297480, - 'upload_date': '20180222', - 'uploader': 'Luclu7', - 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', - 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', - 'license': 'Unknown', - 'duration': 3, + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, 'view_count': int, 'like_count': int, 'dislike_count': int, - 'tags': list, - 'categories': list, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', @@ -484,6 +489,23 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries + def _get_subtitles(self, host, video_id): + video_captions = self._download_json( + 'https://%s/api/v1/videos/%s/captions' % (host, video_id), video_id, fatal=False) + if not isinstance(video_captions, dict): + return None + + subtitles = {} 
+ for entry in video_captions.get('data'): + language_id = try_get(entry, lambda x: x['language']['id'], compat_str) + caption_path = str_or_none(entry.get('captionPath')) + if language_id and caption_path: + caption_url = urljoin('https://%s' % host, entry.get('captionPath')) + subtitles.setdefault(language_id, []).append({ + 'url': caption_url, + }) + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') @@ -513,10 +535,25 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - def account_data(field): - return try_get(video, lambda x: x['account'][field], compat_str) + video_description = self._download_json( + 'https://%s/api/v1/videos/%s/description' % (host, video_id), video_id, fatal=False) - category = try_get(video, lambda x: x['category']['label'], compat_str) + description = None + if isinstance(video_description, dict): + description = str_or_none(video_description.get('description')) + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -528,14 +565,17 @@ class PeerTubeIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video.get('description'), + 'description': description, 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName'), - 'uploader_id': account_data('uuid'), - 'uploder_url': account_data('url'), - 'license': try_get( - video, lambda x: x['licence']['label'], compat_str), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': 
str(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -544,4 +584,5 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, + 'subtitles': subtitles } From 1e1c1960aa154a6e257e83e94e86ee6dc8b0b362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Mar 2020 03:01:23 +0700 Subject: [PATCH 0361/1705] [peertube] Fix issues and improve extraction (closes #23657) --- youtube_dl/extractor/peertube.py | 56 +++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 307712196..48fb95416 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -416,6 +416,7 @@ class PeerTubeIE(InfoExtractor): peertube\.cpy\.re )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P<host>[^:]+):| @@ -430,7 +431,7 @@ class PeerTubeIE(InfoExtractor): 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', 'title': 'What is PeerTube?', - 'description': '**[Want to help to translate this video?](https://weblate.framasoft.org/projects/what-is-peertube-video/)**\r\n\r\n**Take back the control of your videos! 
[#JoinPeertube](https://joinpeertube.org)**\r\n*A decentralized video hosting network, based on free/libre software!*\r\n\r\n**Animation Produced by:** [LILA](https://libreart.info) - [ZeMarmot Team](https://film.zemarmot.net)\r\n*Directed by* Aryeom\r\n*Assistant* Jehan\r\n**Licence**: [CC-By-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)\r\n\r\n**Sponsored by** [Framasoft](https://framasoft.org)\r\n\r\n**Music**: [Red Step Forward](http://play.dogmazic.net/song.php?song_id=52491) - CC-By Ken Bushima\r\n\r\n**Movie Clip**: [Caminades 3: Llamigos](http://www.caminandes.com/) CC-By Blender Institute\r\n\r\n**Video sources**: https://gitlab.gnome.org/Jehan/what-is-peertube/', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', 'thumbnail': r're:https?://.*\.(?:jpg|png)', 'timestamp': 1538391166, 'upload_date': '20181001', @@ -489,21 +490,29 @@ class PeerTubeIE(InfoExtractor): entries = [peertube_url] return entries - def _get_subtitles(self, host, video_id): - video_captions = self._download_json( - 'https://%s/api/v1/videos/%s/captions' % (host, video_id), video_id, fatal=False) - if not isinstance(video_captions, dict): - return None + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return subtitles = {} - for entry in video_captions.get('data'): - language_id = try_get(entry, lambda x: x['language']['id'], compat_str) - caption_path = str_or_none(entry.get('captionPath')) - if language_id and caption_path: - caption_url = urljoin('https://%s' % host, entry.get('captionPath')) - subtitles.setdefault(language_id, []).append({ - 'url': 
caption_url, - }) + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) return subtitles def _real_extract(self, url): @@ -511,8 +520,8 @@ class PeerTubeIE(InfoExtractor): host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') - video = self._download_json( - 'https://%s/api/v1/videos/%s' % (host, video_id), video_id) + video = self._call_api( + host, video_id, '', note='Downloading video JSON') title = video['name'] @@ -535,12 +544,15 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - video_description = self._download_json( - 'https://%s/api/v1/videos/%s/description' % (host, video_id), video_id, fatal=False) + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) description = None - if isinstance(video_description, dict): - description = str_or_none(video_description.get('description')) + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') subtitles = self.extract_subtitles(host, video_id) @@ -569,10 +581,10 @@ class PeerTubeIE(InfoExtractor): 'thumbnail': urljoin(url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), - 'uploader_id': str(account_data('id', int)), + 'uploader_id': str_or_none(account_data('id', int)), 'uploader_url': url_or_none(account_data('url', compat_str)), 'channel': channel_data('displayName', compat_str), - 'channel_id': str(channel_data('id', int)), + 'channel_id': str_or_none(channel_data('id', int)), 'channel_url': url_or_none(channel_data('url', compat_str)), 'language': data('language', 'id', 
compat_str), 'license': data('licence', 'label', compat_str), From 46cc54ca8f13c7b823c1a12446cdd76d060c74b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Mar 2020 06:23:39 +0700 Subject: [PATCH 0362/1705] [pornhub] Improve title extraction (closes #24184) --- youtube_dl/extractor/pornhub.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b3251ccd9..b8f65af7c 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -189,10 +189,10 @@ class PornHubIE(PornHubBaseIE): # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( - 'twitter:title', webpage, default=None) or self._search_regex( - (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', - r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', - r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), webpage, 'title', group='title') video_urls = [] From 12ee431676bb655f04c7dd416a73c1f142ed368d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 3 Mar 2020 12:33:38 +0100 Subject: [PATCH 0363/1705] [vimeo] fix showcase password protected video extraction(closes #24224) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8cd611e1e..cea686afc 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -585,7 +585,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id elif is_player: url = 'https://player.vimeo.com/video/' + video_id - elif 
any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf', '/album/', '/showcase/')): url = 'https://vimeo.com/' + video_id try: From dc879c5a37dae588a5bb35d416635678356ad1b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Mar 2020 23:48:25 +0700 Subject: [PATCH 0364/1705] [youtube] Fix age-gated videos support without login (closes #24248) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e06290427..91b9d59c6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1256,7 +1256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', + r'.*?[-.](?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -2035,7 +2035,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 5429d6a9cb2db07472735b4555eeec89ea4f3c87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Mar 2020 00:05:50 +0700 Subject: [PATCH 0365/1705] [youtube] Fix tests --- youtube_dl/extractor/youtube.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 91b9d59c6..d3e18a6ad 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -570,7 +570,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', + 'description': 'md5:19a2f98d9032b9311e686ed039564f63', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], @@ -685,12 +685,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', + 'description': 'md5:307195cd21ff7fa352270fe884570ef0', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -755,11 +754,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', + 'creator': 'Dada Life, deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', + 'alt_title': 'This Machine Kills Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1135,6 +1134,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, + 'skip': 'not actual anymore', }, { # Youtube Music Auto-generated description @@ -1145,8 +1145,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Voyeur Girl', 'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'upload_date': 
'20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', + 'uploader': 'Stephen - Topic', + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'artist': 'Stephen', 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', @@ -1210,7 +1210,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '-hcAI0g-f5M', 'ext': 'mp4', 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', + 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', 'upload_date': '20180426', 'uploader': 'Matt Maeson - Topic', 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', From 2db9ac228d0f7ccc1d8078d5e29030665f0e3239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Mar 2020 00:23:14 +0700 Subject: [PATCH 0366/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1a676f4f2..00b97f965 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version <unreleased> + +Extractors +* [youtube] Fix age-gated videos support without login (#24248) +* [vimeo] Fix showcase password protected video extraction (#24224) +* [pornhub] Improve title extraction (#24184) +* [peertube] Improve extraction (#23657) ++ [servus] Add support for new URL schema (#23475, #23583, #24142) +* [vimeo] Fix subtitles URLs (#24209) + + version 2020.03.01 Core From 34525a3885946bbbdb2bfda85f3dcc67c66018f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Mar 2020 00:25:43 +0700 Subject: [PATCH 0367/1705] release 2020.03.06 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff 
--git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 0721d49c3..444a86ee3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.03.01** +- [ ] I've verified that I'm running youtube-dl version **2020.03.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.01 + [debug] youtube-dl version 2020.03.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 1e67f724d..a1c69a45b 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.03.01** +- [ ] I've verified that I'm running youtube-dl version **2020.03.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 1290b55c4..d391b6d6b 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.01** +- [ ] I've verified that I'm running youtube-dl version **2020.03.06** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 3f006bef8..7422446b0 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.03.01** +- [ ] I've verified that I'm running youtube-dl version **2020.03.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.01 + [debug] youtube-dl version 2020.03.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 202bb9b2f..247d3594d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.01. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.01** +- [ ] I've verified that I'm running youtube-dl version **2020.03.06** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 00b97f965..0efae7d9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.03.06 Extractors * [youtube] Fix age-gated videos support without login (#24248) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fabc1e543..56330ea2e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.03.01' +__version__ = '2020.03.06' From 0ec9d4e565c1471c1234634bb3be0c7c7662d864 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 6 Mar 2020 20:12:35 +0100 Subject: [PATCH 0368/1705] [nhk] update API version(closes #24270) --- youtube_dl/extractor/nhk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 6a2c6cb7b..d2cbc9f54 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -31,7 +31,7 @@ class NhkVodIE(InfoExtractor): 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() From f93abcf1da5aa1ca122896254bdec3a3c831ac24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 
05:09:02 +0700 Subject: [PATCH 0369/1705] [youtube] Improve extraction in 429 error conditions (closes #24283) --- youtube_dl/extractor/youtube.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d3e18a6ad..9cbdc7ac5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1790,11 +1790,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): query['el'] = el if sts: query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) + try: + video_info_webpage = self._download_webpage( + '%s://www.youtube.com/get_video_info' % proto, + video_id, note=False, + errnote='unable to download video info webpage', + query=query) + except ExtractorError as e: + # Skip further retries if we get 429 since solving + # captcha only unblocks access to website but + # not get_video_info end point + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 429: + break + continue if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) @@ -1833,13 +1841,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if messages: return '\n'.join(messages) - if not video_info: + if not video_info and not player_response: unavailable_message = extract_unavailable_message() if not unavailable_message: unavailable_message = 'Unable to extract video data' raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + if not isinstance(video_info, dict): + video_info = {} + video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} From d332ec725db5b1102473a75b9fc3212913bda618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 05:37:47 +0700 Subject: [PATCH 0370/1705] 
[youtube] Improve age-gated videos extraction in 429 error conditions (refs #24283) --- youtube_dl/extractor/youtube.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9cbdc7ac5..906988e6f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1726,6 +1726,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True + video_info = None # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id @@ -1737,15 +1738,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False video_info = None From 43ebf77df3bbd93dbbd0336b0243d8d50895ab72 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 08:34:17 +0700 Subject: [PATCH 0371/1705] [youtube] Remove outdated code Additional get_video_info requests don't seem to provide any extra itags any longer --- youtube_dl/extractor/youtube.py | 108 ++++++-------------------------- 1 file changed, 18 insertions(+), 90 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 906988e6f..908defecd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,7 +29,6 @@ from ..compat import ( from ..utils import ( bool_or_none, clean_html, - dict_get, error_to_compat_str, extract_attributes, ExtractorError, @@ -1708,9 +1707,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - def extract_player_response(player_response, video_id): pl_response = str_or_none(player_response) if not pl_response: @@ -1723,10 +1719,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = {} # Get video info + video_info = {} embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True - video_info = None # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id @@ -1753,8 +1749,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): view_count = extract_view_count(video_info) else: age_gate = False - video_info = None - sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1771,69 +1765,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or 
args.get('live_playback') == 1: is_live = True - sts = ytplayer_config.get('sts') if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - try: - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - query=query) - except ExtractorError as e: - # Skip further retries if we get 429 since solving - # captcha only unblocks access to website but - # not get_video_info end point - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 429: - break - continue - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info 
requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break def extract_unavailable_message(): messages = [] @@ -2408,30 +2343,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['stretched_ratio'] = ratio if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) + if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' 
in video_info['reason']: + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) + reason = video_info['reason'][0] + if 'Invalid parameters' in reason: + unavailable_message = extract_unavailable_message() + if unavailable_message: + reason = unavailable_message + raise ExtractorError( + 'YouTube said: %s' % reason, + expected=True, video_id=video_id) + if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) From ea782aca520ff17fbf32771bfcfd9cbd36123900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 09:17:17 +0700 Subject: [PATCH 0372/1705] [README.md] Clarify 429 error --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 01f975958..4f54a5240 100644 --- a/README.md +++ b/README.md @@ -835,7 +835,9 @@ In February 2015, the new YouTube player contained a character sequence in a str ### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block meaning that you can gain access again after solving CAPTCHA. Just open a browser and solve a CAPTCHA the service suggests you and after that [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. 
Note that if your machine has multiple external IPs then you should also pass exactly the same IP you've used for solving CAPTCHA with [`--source-address`](#network-options). Also you may need to pass a `User-Agent` HTTP header of your browser with [`--user-agent`](#workarounds). + +If this is not the case (no CAPTCHA suggested to solve by the service) then you can contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. ### SyntaxError: Non-ASCII character From fa9b8c662808a50605bb05f90af101e13b30fce6 Mon Sep 17 00:00:00 2001 From: Tristan Waddington <tristan.waddington@gmail.com> Date: Sun, 8 Mar 2020 04:00:25 -0700 Subject: [PATCH 0373/1705] [pornhub] Add support for pornhubpremium.com (#24288) --- youtube_dl/extractor/pornhub.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b8f65af7c..3567a3283 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -52,7 +52,7 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -149,6 +149,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, }] @staticmethod @@ -166,6 +169,13 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise 
ExtractorError( + 'PornHub Premium requires authentication.' + ' You may want to use --cookies.', + expected=True) + self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): @@ -405,7 +415,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -473,7 +483,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -588,7 +598,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { From cff99c91d150df2a4e21962a3ca8d4ae94533b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 17:52:19 +0700 Subject: [PATCH 0374/1705] [utils] Add support for cookies with spaces used instead of tabs --- test/test_YoutubeDLCookieJar.py | 14 +++++++++----- 
test/testdata/cookies/cookie_file_with_spaces.txt | 5 +++++ youtube_dl/utils.py | 5 +++++ 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 test/testdata/cookies/cookie_file_with_spaces.txt diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index f959798de..f833efac5 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -14,6 +14,9 @@ from youtube_dl.utils import YoutubeDLCookieJar class TestYoutubeDLCookieJar(unittest.TestCase): + def __assert_cookie_has_value(self, cookiejar, key): + self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') + def test_keep_session_cookies(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') cookiejar.load(ignore_discard=True, ignore_expires=True) @@ -32,12 +35,13 @@ class TestYoutubeDLCookieJar(unittest.TestCase): def test_strip_httponly_prefix(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') cookiejar.load(ignore_discard=True, ignore_expires=True) + self.__assert_cookie_has_value(cookiejar, 'HTTPONLY_COOKIE') + self.__assert_cookie_has_value(cookiejar, 'JS_ACCESSIBLE_COOKIE') - def assert_cookie_has_value(key): - self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') - - assert_cookie_has_value('HTTPONLY_COOKIE') - assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') + def test_convert_spaces_to_tabs(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/cookie_file_with_spaces.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + self.__assert_cookie_has_value(cookiejar, 'COOKIE') if __name__ == '__main__': diff --git a/test/testdata/cookies/cookie_file_with_spaces.txt b/test/testdata/cookies/cookie_file_with_spaces.txt new file mode 100644 index 000000000..6fda35fa0 --- /dev/null +++ b/test/testdata/cookies/cookie_file_with_spaces.txt @@ -0,0 +1,5 @@ +# Netscape HTTP Cookie File +# 
http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +www.foobar.foobar FALSE / TRUE 2147483647 COOKIE COOKIE_VALUE diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ccf25489..93d1dec05 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2752,6 +2752,11 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): for line in f: if line.startswith(self._HTTPONLY_PREFIX): line = line[len(self._HTTPONLY_PREFIX):] + # Cookie file may contain spaces instead of tabs. + # Replace all spaces with tabs to make such cookie files work + # with MozillaCookieJar. + if not line.startswith('#'): + line = re.sub(r' +', r'\t', line) cf.write(compat_str(line)) cf.seek(0) self._really_load(cf, filename, ignore_discard, ignore_expires) From 434f57304683552b5d3c1a76130319ebd8139340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 18:16:17 +0700 Subject: [PATCH 0375/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 0efae7d9e..12815f3e1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Core ++ [utils] Add support for cookie files with spaces + +Extractors ++ [pornhub] Add support for pornhubpremium.com (#24288) +- [youtube] Remove outdated code and unnecessary requests +* [youtube] Improve extraction in 429 HTTP error conditions (#24283) +* [nhk] Update API version (#24270) + + version 2020.03.06 Extractors From 68fa15155f40f7d646079e6649df03696a5e7d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 Mar 2020 18:27:20 +0700 Subject: [PATCH 0376/1705] release 2020.03.08 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- 
.github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 444a86ee3..d82ff9111 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.08. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.08** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.06 + [debug] youtube-dl version 2020.03.08 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index a1c69a45b..04b350f76 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.08. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.08** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d391b6d6b..6f17ad7bc 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.08. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.08** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7422446b0..efb179ea5 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.08. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.08** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.06 + [debug] youtube-dl version 2020.03.08 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 247d3594d..cf4874bcc 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.03.08. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.06** +- [ ] I've verified that I'm running youtube-dl version **2020.03.08** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 12815f3e1..84b43c642 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.03.08 Core + [utils] Add support for cookie files with spaces diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 56330ea2e..0f768f7c1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.03.06' +__version__ = '2020.03.08' From 042b66493398dd8c3bb31216e3f828b98716810d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Mar 2020 04:51:20 +0700 Subject: [PATCH 0377/1705] Revert "[utils] Add support for cookies with spaces used instead of tabs" According to [1] TABs must be used as separators between fields. Files produced by some tools with spaces as separators are considered malformed. 1. https://curl.haxx.se/docs/http-cookies.html This reverts commit cff99c91d150df2a4e21962a3ca8d4ae94533b8c. 
--- test/test_YoutubeDLCookieJar.py | 14 +++++--------- test/testdata/cookies/cookie_file_with_spaces.txt | 5 ----- youtube_dl/utils.py | 5 ----- 3 files changed, 5 insertions(+), 19 deletions(-) delete mode 100644 test/testdata/cookies/cookie_file_with_spaces.txt diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index f833efac5..f959798de 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -14,9 +14,6 @@ from youtube_dl.utils import YoutubeDLCookieJar class TestYoutubeDLCookieJar(unittest.TestCase): - def __assert_cookie_has_value(self, cookiejar, key): - self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') - def test_keep_session_cookies(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') cookiejar.load(ignore_discard=True, ignore_expires=True) @@ -35,13 +32,12 @@ class TestYoutubeDLCookieJar(unittest.TestCase): def test_strip_httponly_prefix(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') cookiejar.load(ignore_discard=True, ignore_expires=True) - self.__assert_cookie_has_value(cookiejar, 'HTTPONLY_COOKIE') - self.__assert_cookie_has_value(cookiejar, 'JS_ACCESSIBLE_COOKIE') - def test_convert_spaces_to_tabs(self): - cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/cookie_file_with_spaces.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) - self.__assert_cookie_has_value(cookiejar, 'COOKIE') + def assert_cookie_has_value(key): + self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') + + assert_cookie_has_value('HTTPONLY_COOKIE') + assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') if __name__ == '__main__': diff --git a/test/testdata/cookies/cookie_file_with_spaces.txt b/test/testdata/cookies/cookie_file_with_spaces.txt deleted file mode 100644 index 6fda35fa0..000000000 --- a/test/testdata/cookies/cookie_file_with_spaces.txt +++ /dev/null @@ 
-1,5 +0,0 @@ -# Netscape HTTP Cookie File -# http://curl.haxx.se/rfc/cookie_spec.html -# This is a generated file! Do not edit. - -www.foobar.foobar FALSE / TRUE 2147483647 COOKIE COOKIE_VALUE diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 93d1dec05..8ccf25489 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2752,11 +2752,6 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): for line in f: if line.startswith(self._HTTPONLY_PREFIX): line = line[len(self._HTTPONLY_PREFIX):] - # Cookie file may contain spaces instead of tabs. - # Replace all spaces with tabs to make such cookie files work - # with MozillaCookieJar. - if not line.startswith('#'): - line = re.sub(r' +', r'\t', line) cf.write(compat_str(line)) cf.seek(0) self._really_load(cf, filename, ignore_discard, ignore_expires) From f1a8511f7b5bbbd9e64f597c87791ad3b4310efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 10 Mar 2020 04:59:02 +0700 Subject: [PATCH 0378/1705] [utils] Add reference to cookie file format --- youtube_dl/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ccf25489..38262bee4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2729,6 +2729,11 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ _HTTPONLY_PREFIX = '#HttpOnly_' def save(self, filename=None, ignore_discard=False, ignore_expires=False): From 40b6495d403a4636ea8be1dd9a2dad33c136a1e3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 13 Mar 2020 08:59:10 +0100 Subject: [PATCH 0379/1705] Revert "[vimeo] fix showcase password protected video extraction(closes #24224)" This reverts commit 12ee431676bb655f04c7dd416a73c1f142ed368d. 
--- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cea686afc..8cd611e1e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -585,7 +585,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = 'https://vimeo.com/' + video_id elif is_player: url = 'https://player.vimeo.com/video/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf', '/album/', '/showcase/')): + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id try: From fcaf4d7a067e9ac6b36a5f7b1100be391f492a79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 04:39:21 +0700 Subject: [PATCH 0380/1705] [nhk] Relax _VALID_URL (#24329) --- youtube_dl/extractor/nhk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index d2cbc9f54..45bc4d85e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[a-z]+-\d{8}-\d+)' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
_TESTS = [{ @@ -30,6 +30,9 @@ class NhkVodIE(InfoExtractor): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', + 'only_matching': True, }] _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' From 9bfe08859491882e40603ed6c4eb59760ed35ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 04:40:11 +0700 Subject: [PATCH 0381/1705] [nhk] Remove obsolete rtmp formats (closes #24329) --- youtube_dl/extractor/nhk.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 45bc4d85e..ce13f6bb9 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -87,13 +87,6 @@ class NhkVodIE(InfoExtractor): info['formats'] = self._extract_m3u8_formats( 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, episode_id, 'm4a', m3u8_id='hls', fatal=False) - for proto in ('rtmpt', 'rtmp'): - info['formats'].append({ - 'ext': 'flv', - 'format_id': proto, - 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), - 'vcodec': 'none', - }) for f in info['formats']: f['language'] = lang return info From 541fe3eaff579f72ccc14f97009a8904f739368f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 04:42:40 +0700 Subject: [PATCH 0382/1705] [nhk] Update m3u8 URL and use native hls (#24329) --- youtube_dl/extractor/nhk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index ce13f6bb9..de6a707c4 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -85,8 +85,9 @@ class NhkVodIE(InfoExtractor): audio = episode['audio'] audio_path = audio['audio'] info['formats'] = self._extract_m3u8_formats( - 
'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', m3u8_id='hls', fatal=False) + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang return info From 4cbce88f8b44ab17d55fe1b7615e37a2c1f142d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 04:58:24 +0700 Subject: [PATCH 0383/1705] [ndr] Fix extraction (closes #24326) --- youtube_dl/extractor/ndr.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 9c8bf05af..2447c812e 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + merge_dicts, parse_iso8601, qualities, try_get, @@ -87,21 +88,25 @@ class NDRIE(NDRBaseIE): def _extract_embed(self, webpage, display_id): embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', fatal=True) + 'embedURL', webpage, 'embed URL', + default=None) or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url') description = self._search_regex( r'<p[^>]+itemprop="description">([^<]+)</p>', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', fatal=False)) - return { + webpage, 'upload date', default=None)) + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts({ '_type': 'url_transparent', 'url': embed_url, 'display_id': display_id, 'description': description, 'timestamp': timestamp, - } + }, info) class NJoyIE(NDRBaseIE): From 4568a11802b47c22ac43d1187435ab2e6d7a3c6d Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 22:57:10 +0700 Subject: [PATCH 0384/1705] [xtube] Fix formats extraction (closes #24348) --- youtube_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 47caec1de..79dd647b3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -98,7 +98,7 @@ class XTubeIE(InfoExtractor): title = config.get('title') thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) - sources = config.get('sources') + sources = config.get('sources') or config.get('format') if isinstance(sources, dict): sources = self._parse_json(self._search_regex( From 158bc5ac03a175d95337979afa02f7423c2bf445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Mar 2020 22:58:10 +0700 Subject: [PATCH 0385/1705] [xtube] Fix typo --- youtube_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 79dd647b3..01b253dcb 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -100,7 +100,7 @@ class XTubeIE(InfoExtractor): duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') - if isinstance(sources, dict): + if not isinstance(sources, dict): sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', webpage, 'sources', group='sources'), video_id, From 73453430c11002a7193eaa9fb8cf5349fa326c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 21 Mar 2020 00:59:48 +0700 Subject: [PATCH 0386/1705] [hellporno] Fix extraction (closes #24399) --- youtube_dl/extractor/hellporno.py | 73 ++++++++++++++++--------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git 
a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 0ee8ea712..fae425103 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,12 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + merge_dicts, remove_end, - determine_ext, + unified_timestamp, ) @@ -14,15 +13,21 @@ class HellPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', - 'md5': '1fee339c610d2049699ef2aa699439f1', + 'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3', 'info_dict': { 'id': '149116', 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', + 'description': 'md5:9a72922749354edb1c4b6e540ad3d215', + 'categories': list, 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 240, + 'timestamp': 1398762720, + 'upload_date': '20140429', + 'view_count': int, 'age_limit': 18, - } + }, }, { 'url': 'http://hellporno.net/v/186271/', 'only_matching': True, @@ -36,40 +41,36 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_search_regex( r'<title>([^<]+)', webpage, 'title'), ' - Hell Porno') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), - display_id, transform_source=js_to_json) + info = self._parse_html5_media_entries(url, webpage, display_id)[0] + self._sort_formats(info['formats']) - video_id = flashvars.get('video_id') - thumbnail = flashvars.get('preview_url') - ext = determine_ext(flashvars.get('postfix'), 'mp4') + video_id = self._search_regex( + (r'chs_object\s*=\s*["\'](\d+)', + r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id', + default=display_id) + description = self._search_regex( + r'class=["\']desc_video_view_v2[^>]+>([^<]+)', 
webpage, + 'description', fatal=False) + categories = [ + c.strip() + for c in self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + if c.strip()] + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, fatal=False)) + timestamp = unified_timestamp(self._og_search_property( + 'video:release_date', webpage, fatal=False)) + view_count = int_or_none(self._search_regex( + r'>Views\s+(\d+)', webpage, 'view count', fatal=False)) - formats = [] - for video_url_key in ['video_url', 'video_alt_url']: - video_url = flashvars.get(video_url_key) - if not video_url: - continue - video_text = flashvars.get('%s_text' % video_url_key) - fmt = { - 'url': video_url, - 'ext': ext, - 'format_id': video_text, - } - m = re.search(r'^(?P\d+)[pP]', video_text) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - self._sort_formats(formats) - - categories = self._html_search_meta( - 'keywords', webpage, 'categories', default='').split(',') - - return { + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, 'categories': categories, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, 'age_limit': 18, - 'formats': formats, - } + }) From 787c3604671283bd4945eefb87866d01fb973097 Mon Sep 17 00:00:00 2001 From: Devon Meunier Date: Sun, 19 May 2019 07:32:46 -0400 Subject: [PATCH 0387/1705] [cbc:watch] Add support for authentication --- youtube_dl/extractor/cbc.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 751a3a8f2..b02cddbfd 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import json import re +from xml.sax.saxutils import escape from .common import InfoExtractor from ..compat import ( @@ -216,6 +217,29 
@@ class CBCWatchBaseIE(InfoExtractor): 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } _GEO_COUNTRIES = ['CA'] + _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' + _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' + _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcwatch' + + def _signature(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._API_KEY} + resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + # token + query = { + 'access_token': access_token, + 'apikey': self._API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) + return resp['signature'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -249,13 +273,21 @@ class CBCWatchBaseIE(InfoExtractor): return self._device_id and self._device_token def _register_device(self): - self._device_id = self._device_token = None result = self._download_xml( self._API_BASE_URL + 'device/register', None, 'Acquiring device token', data=b'web') self._device_id = xpath_text(result, 'deviceId', fatal=True) - self._device_token = xpath_text(result, 'deviceToken', fatal=True) + anon_device_token = xpath_text(result, 'deviceToken', fatal=True) + email, password = self._get_login_info() + if email and password: + signature = self._signature(email, password) + data = '{0}{1}web'.format(escape(signature), escape(self._device_id)).encode() + url = self._API_BASE_URL + 'device/login' + result = self._download_xml(url, None, data=data, headers={'content-type': 'application/xml'}) + self._device_token = xpath_text(result, 'token', fatal=True) + else: + self._device_token = anon_device_token self._downloader.cache.store( 'cbcwatch', 
'device', { 'id': self._device_id, From c76cdf2382c91af13de0c7580b1b5e1b24484664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Mar 2020 01:41:54 +0700 Subject: [PATCH 0388/1705] [cbc:watch] Fix authenticated device token caching (closes #19160) --- youtube_dl/extractor/cbc.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index b02cddbfd..fd5ec6033 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import hashlib import json import re from xml.sax.saxutils import escape @@ -263,7 +264,8 @@ class CBCWatchBaseIE(InfoExtractor): def _real_initialize(self): if self._valid_device_token(): return - device = self._downloader.cache.load('cbcwatch', 'device') or {} + device = self._downloader.cache.load( + 'cbcwatch', self._cache_device_key()) or {} self._device_id, self._device_token = device.get('id'), device.get('token') if self._valid_device_token(): return @@ -272,24 +274,30 @@ class CBCWatchBaseIE(InfoExtractor): def _valid_device_token(self): return self._device_id and self._device_token + def _cache_device_key(self): + email, _ = self._get_login_info() + return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' + def _register_device(self): result = self._download_xml( self._API_BASE_URL + 'device/register', None, 'Acquiring device token', data=b'web') self._device_id = xpath_text(result, 'deviceId', fatal=True) - anon_device_token = xpath_text(result, 'deviceToken', fatal=True) email, password = self._get_login_info() if email and password: signature = self._signature(email, password) - data = '{0}{1}web'.format(escape(signature), escape(self._device_id)).encode() + data = '{0}{1}web'.format( + escape(signature), escape(self._device_id)).encode() url = self._API_BASE_URL + 'device/login' - result = 
self._download_xml(url, None, data=data, headers={'content-type': 'application/xml'}) + result = self._download_xml( + url, None, data=data, + headers={'content-type': 'application/xml'}) self._device_token = xpath_text(result, 'token', fatal=True) else: - self._device_token = anon_device_token + self._device_token = xpath_text(result, 'deviceToken', fatal=True) self._downloader.cache.store( - 'cbcwatch', 'device', { + 'cbcwatch', self._cache_device_key(), { 'id': self._device_id, 'token': self._device_token, }) From a6c5859d6b106733905c3a95fc52b53a784c94da Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 22 Mar 2020 09:24:07 +0100 Subject: [PATCH 0389/1705] [soundcloud] fix download url extraction(closes #24394) --- youtube_dl/extractor/soundcloud.py | 91 ++++++++---------------------- 1 file changed, 24 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a1372d389..ff6be0b54 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -27,6 +27,7 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + urlhandle_detect_ext, ) @@ -96,7 +97,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # not streamable song, preview + # geo-restricted { 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -108,17 +109,13 @@ class SoundcloudIE(InfoExtractor): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 30, + 'duration': 227.155, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - 'params': { - # rtmp - 'skip_download': True, - }, }, # private link { @@ -229,7 +226,6 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, - # not available via api.soundcloud.com/i1/tracks/id/streams { 'url': 
'https://soundcloud.com/giovannisarani/mezzo-valzer', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', @@ -250,11 +246,9 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - 'expected_warnings': ['Unable to download JSON metadata'], } ] - _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' @@ -316,10 +310,9 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): + def _extract_info_dict(self, info, full_title=None, secret_token=None): track_id = compat_str(info['id']) title = info['title'] - track_base_url = self._API_BASE + 'tracks/%s' % track_id format_urls = set() formats = [] @@ -328,21 +321,22 @@ class SoundcloudIE(InfoExtractor): query['secret_token'] = secret_token if info.get('downloadable') and info.get('has_downloads_left'): - format_url = update_url_query( - info.get('download_url') or track_base_url + '/download', query) - format_urls.add(format_url) - if version == 2: - v1_info = self._download_json( - track_base_url, track_id, query=query, fatal=False) or {} - else: - v1_info = info - formats.append({ - 'format_id': 'download', - 'ext': v1_info.get('original_format') or 'mp3', - 'filesize': int_or_none(v1_info.get('original_content_size')), - 'url': format_url, - 'preference': 10, - }) + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': 
int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) def invalid_url(url): return not url or url in format_urls @@ -406,42 +400,11 @@ class SoundcloudIE(InfoExtractor): }, 'http' if protocol == 'progressive' else protocol, t.get('snipped') or '/preview/' in format_url) - if not formats: - # Old API, does not work for some tracks (e.g. - # https://soundcloud.com/giovannisarani/mezzo-valzer) - # and might serve preview URLs (e.g. - # http://www.soundcloud.com/snbrn/ele) - format_dict = self._download_json( - track_base_url + '/streams', track_id, - 'Downloading track url', query=query, fatal=False) or {} - - for key, stream_url in format_dict.items(): - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key) - if mobj: - protocol, ext, abr = mobj.groups() - add_format({ - 'abr': abr, - 'ext': ext, - 'url': stream_url, - }, protocol) - - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - urlh = self._request_webpage( - HEADRequest(info.get('stream_url') or track_base_url + '/stream'), - track_id, query=query, fatal=False) - if urlh: - stream_url = urlh.geturl() - if not invalid_url(stream_url): - add_format({'url': stream_url}, 'http') - for f in formats: f['vcodec'] = 'none' + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() self._sort_formats(formats) user = info.get('user') or {} @@ -511,16 +474,10 @@ class SoundcloudIE(InfoExtractor): resolve_title += '/%s' % token info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - version = 2 info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) - if not info: - info = self._download_json( - info_json_url.replace(self._API_V2_BASE, self._API_BASE), - full_title, 'Downloading info JSON', query=query) - version = 1 + 
info_json_url, full_title, 'Downloading info JSON', query=query) - return self._extract_info_dict(info, full_title, token, version) + return self._extract_info_dict(info, full_title, token) class SoundcloudPlaylistBaseIE(SoundcloudIE): From 2e20cb36364b91c1d928ce896064fdc7c49e82f8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 23 Mar 2020 12:57:10 +0100 Subject: [PATCH 0390/1705] [limelight] remove disabled API requests(closes #24255) --- youtube_dl/extractor/limelight.py | 125 ++++++++++++----------------- youtube_dl/extractor/pokemon.py | 12 +-- youtube_dl/extractor/telequebec.py | 2 - youtube_dl/extractor/tfo.py | 6 +- 4 files changed, 59 insertions(+), 86 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 729d8de50..39f74d282 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -18,7 +18,6 @@ from ..utils import ( class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' - _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' @classmethod def _extract_urls(cls, webpage, source_url): @@ -70,7 +69,8 @@ class LimelightBaseIE(InfoExtractor): try: return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) + item_id, 'Downloading PlaylistService %s JSON' % method, + fatal=fatal, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] @@ -79,22 +79,22 @@ class LimelightBaseIE(InfoExtractor): raise ExtractorError(error, expected=True) raise - def _call_api(self, organization_id, item_id, method): - return self._download_json( - self._API_URL % (organization_id, self._API_PATH, item_id, method), - 
item_id, 'Downloading API %s JSON' % method) - - def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None): + def _extract(self, item_id, pc_method, mobile_method, referer=None): pc = self._call_playlist_service(item_id, pc_method, referer=referer) - metadata = self._call_api(pc['orgId'], item_id, meta_method) - mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer) - return pc, mobile, metadata + mobile = self._call_playlist_service( + item_id, mobile_method, fatal=False, referer=referer) + return pc, mobile + + def _extract_info(self, pc, mobile, i, referer): + get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {} + pc_item = get_item(pc, 'playlistItems') + mobile_item = get_item(mobile, 'mediaList') + video_id = pc_item.get('mediaId') or mobile_item['mediaId'] + title = pc_item.get('title') or mobile_item['title'] - def _extract_info(self, streams, mobile_urls, properties): - video_id = properties['media_id'] formats = [] urls = [] - for stream in streams: + for stream in pc_item.get('streams', []): stream_url = stream.get('url') if not stream_url or stream.get('drmProtected') or stream_url in urls: continue @@ -155,7 +155,7 @@ class LimelightBaseIE(InfoExtractor): }) formats.append(fmt) - for mobile_url in mobile_urls: + for mobile_url in mobile_item.get('mobileUrls', []): media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: @@ -179,54 +179,34 @@ class LimelightBaseIE(InfoExtractor): self._sort_formats(formats) - title = properties['title'] - description = properties.get('description') - timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) - duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) - filesize = int_or_none(properties.get('total_storage_in_bytes')) - categories = [properties.get('category')] - tags = 
properties.get('tags', []) - thumbnails = [{ - 'url': thumbnail['url'], - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] - subtitles = {} - for caption in properties.get('captions', []): - lang = caption.get('language_code') - subtitles_url = caption.get('url') - if lang and subtitles_url: - subtitles.setdefault(lang, []).append({ - 'url': subtitles_url, - }) - closed_captions_url = properties.get('closed_captions_url') - if closed_captions_url: - subtitles.setdefault('en', []).append({ - 'url': closed_captions_url, - 'ext': 'ttml', - }) + for flag in mobile_item.get('flags'): + if flag == 'ClosedCaptions': + closed_captions = self._call_playlist_service( + video_id, 'getClosedCaptionsDetailsByMediaId', + False, referer) or [] + for cc in closed_captions: + cc_url = cc.get('webvttFileUrl') + if not cc_url: + continue + lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + break + + get_meta = lambda x: pc_item.get(x) or mobile_item.get(x) return { 'id': video_id, 'title': title, - 'description': description, + 'description': get_meta('description'), 'formats': formats, - 'timestamp': timestamp, - 'duration': duration, - 'filesize': filesize, - 'categories': categories, - 'tags': tags, - 'thumbnails': thumbnails, + 'duration': float_or_none(get_meta('durationInMilliseconds'), 1000), + 'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'), 'subtitles': subtitles, } - def _extract_info_helper(self, pc, mobile, i, metadata): - return self._extract_info( - try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], - try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], - metadata) - class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -251,8 +231,6 @@ class 
LimelightMediaIE(LimelightBaseIE): 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 144.23, - 'timestamp': 1244136834, - 'upload_date': '20090604', }, 'params': { # m3u8 download @@ -268,30 +246,29 @@ class LimelightMediaIE(LimelightBaseIE): 'title': '3Play Media Overview Video', 'thumbnail': r're:^https?://.*\.jpeg$', 'duration': 78.101, - 'timestamp': 1338929955, - 'upload_date': '20120605', - 'subtitles': 'mincount:9', + # TODO: extract all languages that were accessible via API + # 'subtitles': 'mincount:9', + 'subtitles': 'mincount:1', }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'media' - _API_PATH = 'media' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) + source_url = smuggled_data.get('source_url') self._initialize_geo_bypass({ 'countries': smuggled_data.get('geo_countries'), }) - pc, mobile, metadata = self._extract( + pc, mobile = self._extract( video_id, 'getPlaylistByMediaId', - 'getMobilePlaylistByMediaId', 'properties', - smuggled_data.get('source_url')) + 'getMobilePlaylistByMediaId', source_url) - return self._extract_info_helper(pc, mobile, 0, metadata) + return self._extract_info(pc, mobile, 0, source_url) class LimelightChannelIE(LimelightBaseIE): @@ -313,6 +290,7 @@ class LimelightChannelIE(LimelightBaseIE): 'info_dict': { 'id': 'ab6a524c379342f9b23642917020c082', 'title': 'Javascript Sample Code', + 'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html', }, 'playlist_mincount': 3, }, { @@ -320,22 +298,23 @@ class LimelightChannelIE(LimelightBaseIE): 'only_matching': True, }] _PLAYLIST_SERVICE_PATH = 'channel' - _API_PATH = 'channels' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) channel_id = self._match_id(url) + source_url = 
smuggled_data.get('source_url') - pc, mobile, medias = self._extract( + pc, mobile = self._extract( channel_id, 'getPlaylistByChannelId', 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', - 'media', smuggled_data.get('source_url')) + source_url) entries = [ - self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) - for i in range(len(medias['media_list']))] + self._extract_info(pc, mobile, i, source_url) + for i in range(len(pc['playlistItems']))] - return self.playlist_result(entries, channel_id, pc['title']) + return self.playlist_result( + entries, channel_id, pc.get('title'), mobile.get('description')) class LimelightChannelListIE(LimelightBaseIE): @@ -368,10 +347,12 @@ class LimelightChannelListIE(LimelightBaseIE): def _real_extract(self, url): channel_list_id = self._match_id(url) - channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') + channel_list = self._call_playlist_service( + channel_list_id, 'getMobileChannelListById') entries = [ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') for channel in channel_list['channelList']] - return self.playlist_result(entries, channel_list_id, channel_list['title']) + return self.playlist_result( + entries, channel_list_id, channel_list['title']) diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index dd5f17f11..80222d428 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -20,20 +20,16 @@ class PokemonIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Ol’ Raise and Switch!', 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - 'timestamp': 1511824728, - 'upload_date': '20171127', }, 'add_id': ['LimelightMedia'], }, { # no data-video-title - 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', 'info_dict': { - 'id': 
'99f3bae270bf4e5097274817239ce9c8', + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', 'ext': 'mp4', - 'title': 'Pokémon: The Rise of Darkrai', - 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', - 'timestamp': 1417778347, - 'upload_date': '20141205', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', }, 'add_id': ['LimelightMedia'], 'params': { diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index ae9f66787..c82c94b3a 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -38,8 +38,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', - 'upload_date': '20180222', - 'timestamp': 1519326631, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py index 0e2370cd8..0631cb7ab 100644 --- a/youtube_dl/extractor/tfo.py +++ b/youtube_dl/extractor/tfo.py @@ -17,14 +17,12 @@ class TFOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P\d+)' _TEST = { 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'md5': 'cafbe4f47a8dae0ca0159937878100d6', 'info_dict': { - 'id': '100463871', + 'id': '7da3d50e495c406b8fc0b997659cc075', 'ext': 'mp4', 'title': 'Video Game Hackathon', 'description': 'md5:558afeba217c6c8d96c60e5421795c07', - 'upload_date': '20160212', - 'timestamp': 1455310233, } } From b4eb08bb03f69c587f8440912cf56aadc9e52879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 00:11:39 +0700 Subject: [PATCH 0391/1705] [bilibili] Add support for new URL schema with BV ids (closes #24439, closes #24442) --- youtube_dl/extractor/bilibili.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py 
b/youtube_dl/extractor/bilibili.py index 80bd696e2..e9d0a8d0c 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -24,7 +24,18 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P\d+)/play#)(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|bangumi)\.)? + bilibili\.(?:tv|com)/ + (?: + (?: + video/[aA][vV]| + anime/(?P\d+)/play\# + )(?P\d+)| + video/[bB][vV](?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -92,6 +103,10 @@ class BiliBiliIE(InfoExtractor): 'skip_download': True, # Test metadata only }, }] + }, { + # new BV video id format + 'url': 'https://www.bilibili.com/video/BV1JE411F741', + 'only_matching': True, }] _APP_KEY = 'iVGUTjsxvpLeuDCf' @@ -109,7 +124,7 @@ class BiliBiliIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('id') or mobj.group('id_bv') anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) From 63dce3094bf45964b49a2c9f26c94b10cf60c2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 00:24:39 +0700 Subject: [PATCH 0392/1705] [bilibili] Add support for player.bilibili.com (closes #24402) --- youtube_dl/extractor/bilibili.py | 14 ++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 15 insertions(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index e9d0a8d0c..4dc597e16 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -434,3 +434,17 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): entries, am_id, album_title, album_data.get('intro')) return self.playlist_result(entries, am_id) + + +class BiliBiliPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P\d+)' + _TEST = { + 
'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.bilibili.tv/video/av%s/' % video_id, + ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64d1fa251..ef803b8a7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -105,6 +105,7 @@ from .bilibili import ( BiliBiliBangumiIE, BilibiliAudioIE, BilibiliAudioAlbumIE, + BiliBiliPlayerIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( From 4560adc820a5d4bda5babc62f0f7fc306b13ad86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 02:43:24 +0700 Subject: [PATCH 0393/1705] [teachable] Extract chapter metadata (closes #24421) --- youtube_dl/extractor/teachable.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 4316a6962..290c65754 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -7,7 +7,9 @@ from .wistia import WistiaIE from ..utils import ( clean_html, ExtractorError, + int_or_none, get_element_by_class, + strip_or_none, urlencode_postdata, urljoin, ) @@ -173,11 +175,34 @@ class TeachableIE(TeachableBaseIE): title = self._og_search_title(webpage, default=None) + chapter = None + chapter_number = None + section_item = self._search_regex( + r'(?s)(?P
  • ]+\bdata-lecture-id=["\']%s[^>]+>.+?
  • )' % video_id, + webpage, 'section item', default=None, group='li') + if section_item: + chapter_number = int_or_none(self._search_regex( + r'data-ss-position=["\'](\d+)', section_item, 'section id', + default=None)) + if chapter_number is not None: + sections = [] + for s in re.findall( + r'(?s)]+\bclass=["\']section-title[^>]+>(.+?)', webpage): + section = strip_or_none(clean_html(s)) + if not section: + sections = [] + break + sections.append(section) + if chapter_number <= len(sections): + chapter = sections[chapter_number - 1] + entries = [{ '_type': 'url_transparent', 'url': wistia_url, 'ie_key': WistiaIE.ie_key(), 'title': title, + 'chapter': chapter, + 'chapter_number': chapter_number, } for wistia_url in wistia_urls] return self.playlist_result(entries, video_id, title) From be7dacf9cfc3603ba6e4f818a8988a527f06d6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 02:46:37 +0700 Subject: [PATCH 0394/1705] [generic] Look for teachable embeds before wistia --- youtube_dl/extractor/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d1ec56be9..a495ee15a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2536,6 +2536,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) + # Look for Teachable embeds, must be before Wistia + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + # Look for embedded Wistia player wistia_urls = WistiaIE._extract_urls(webpage) if wistia_urls: @@ -3141,10 +3146,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - 
indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( From 08a27407c45745239de819f059a86559e7a75087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 02:46:55 +0700 Subject: [PATCH 0395/1705] [teachable] Update upskillcourses domain New version does not use teachable platform any longer --- youtube_dl/extractor/teachable.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 290c65754..4de67b75e 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -21,7 +21,7 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here - 'upskillcourses.com': 'upskill', + 'v1.upskillcourses.com': 'upskill', 'academy.gns3.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', @@ -111,7 +111,7 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'info_dict': { 'id': 'uzw6zw58or', 'ext': 'mp4', @@ -125,13 +125,13 @@ class TeachableIE(TeachableBaseIE): 'skip_download': True, }, }, { - 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', 'only_matching': True, }, { - 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', 'only_matching': True, }] @@ -217,17 +217,17 @@ class TeachableCourseIE(TeachableBaseIE): /(?:courses|p)/(?:enrolled/)?(?P[^/?#&]+) 
''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/', 'info_dict': { 'id': 'essential-web-developer-course', 'title': 'The Essential Web Developer Course (Free)', }, 'playlist_count': 192, }, { - 'url': 'http://upskillcourses.com/courses/119763/', + 'url': 'http://v1.upskillcourses.com/courses/119763/', 'only_matching': True, }, { - 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { 'url': 'https://academy.gns3.com/courses/enrolled/423415', From 38fa761a4549dc2c3b155306a8a9441944bdcf01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 02:55:17 +0700 Subject: [PATCH 0396/1705] [teachable] Update gns3 domain --- youtube_dl/extractor/teachable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 4de67b75e..2d9d354e8 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -22,7 +22,7 @@ class TeachableBaseIE(InfoExtractor): _SITES = { # Only notable ones here 'v1.upskillcourses.com': 'upskill', - 'academy.gns3.com': 'gns3', + 'gns3.teachable.com': 'gns3', 'academyhacker.com': 'academyhacker', 'stackskills.com': 'stackskills', 'market.saleshacker.com': 'saleshacker', @@ -128,7 +128,7 @@ class TeachableIE(TeachableBaseIE): 'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939', + 'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939', 'only_matching': True, }, { 'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', @@ -230,7 +230,7 @@ class TeachableCourseIE(TeachableBaseIE): 
'url': 'http://v1.upskillcourses.com/courses/enrolled/119763', 'only_matching': True, }, { - 'url': 'https://academy.gns3.com/courses/enrolled/423415', + 'url': 'https://gns3.teachable.com/courses/enrolled/423415', 'only_matching': True, }, { 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini', From 6e47200b6ecaeafc65a8f5a19cd12d6e91ad186e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 02:55:52 +0700 Subject: [PATCH 0397/1705] [teachable] Update test --- youtube_dl/extractor/teachable.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 2d9d354e8..a75369dbe 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -111,15 +111,17 @@ class TeachableIE(TeachableBaseIE): ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE _TESTS = [{ - 'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364', 'info_dict': { - 'id': 'uzw6zw58or', - 'ext': 'mp4', - 'title': 'Welcome to the Course!', - 'description': 'md5:65edb0affa582974de4625b9cdea1107', - 'duration': 138.763, - 'timestamp': 1479846621, - 'upload_date': '20161122', + 'id': 'untlgzk1v7', + 'ext': 'bin', + 'title': 'Overview', + 'description': 'md5:071463ff08b86c208811130ea1c2464c', + 'duration': 736.4, + 'timestamp': 1542315762, + 'upload_date': '20181115', + 'chapter': 'Welcome', + 'chapter_number': 1, }, 'params': { 'skip_download': True, From b439634f0e9a1f251d117303dc60f02fd0ab11ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 03:07:34 +0700 Subject: [PATCH 0398/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index 84b43c642..c53cde141 100644 --- a/ChangeLog +++ 
b/ChangeLog @@ -1,3 +1,22 @@ +version + +Extractors +* [teachable] Update upskillcourses and gns3 domains +* [generic] Look for teachable embeds before wistia ++ [teachable] Extract chapter metadata (#24421) ++ [bilibili] Add support for player.bilibili.com (#24402) ++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) +* [limelight] Remove disabled API requests (#24255) +* [soundcloud] Fix download URL extraction (#24394) ++ [cbc:watch] Add support for authentication (#19160) +* [hellporno] Fix extraction (#24399) +* [xtube] Fix formats extraction (#24348) +* [ndr] Fix extraction (#24326) +* [nhk] Update m3u8 URL and use native HLS downloader (#24329) +- [nhk] Remove obsolete rtmp formats (#24329) +* [nhk] Relax URL regular expression (#24329) + + version 2020.03.08 Core From 30b5121a1c63c3f84251e9add3c9bf9e3c490228 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Mar 2020 03:12:15 +0700 Subject: [PATCH 0399/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index c53cde141..fc1e28020 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ version +Core +- [utils] Revert support for cookie files with spaces used instead of tabs + Extractors * [teachable] Update upskillcourses and gns3 domains * [generic] Look for teachable embeds before wistia @@ -15,12 +18,13 @@ Extractors * [nhk] Update m3u8 URL and use native HLS downloader (#24329) - [nhk] Remove obsolete rtmp formats (#24329) * [nhk] Relax URL regular expression (#24329) +- [vimeo] Revert fix showcase password protected video extraction (#24224) version 2020.03.08 Core -+ [utils] Add support for cookie files with spaces ++ [utils] Add support for cookie files with spaces used instead of tabs Extractors + [pornhub] Add support for pornhubpremium.com (#24288) From 049c0486bbd57a9bb5fb5a6a5eeff82fd4ac03ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: 
Tue, 24 Mar 2020 03:14:30 +0700 Subject: [PATCH 0400/1705] release 2020.03.24 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d82ff9111..40a869113 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.03.08** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.08 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 04b350f76..7b10df3d4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' 
- [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.03.08** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 6f17ad7bc..04bbcfa68 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.08** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index efb179ea5..a9e231817 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.03.08** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 
2020.03.08 + [debug] youtube-dl version 2020.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cf4874bcc..4a3d32d51 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.08** +- [ ] I've verified that I'm running youtube-dl version **2020.03.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index fc1e28020..f753972c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.03.24 Core - [utils] Revert support for cookie files with spaces used instead of tabs diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 02bc088ab..174b83bf3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -98,6 +98,7 @@ - **BiliBili** - **BilibiliAudio** - **BilibiliAudioAlbum** + - **BiliBiliPlayer** - **BioBioChileTV** - **BIQLE** - **BitChute** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0f768f7c1..5aedd3268 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.03.08' +__version__ = '2020.03.24' From d44a707fdde6c0138e9e275ed5b4ffb0b8f72966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:34:57 +0700 Subject: [PATCH 0401/1705] [spankwire] Fix extraction (closes #18924, closes #20648) --- youtube_dl/extractor/spankwire.py | 201 +++++++++++++++++++----------- 1 file changed, 125 insertions(+), 76 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 
44d8fa52f..8f67463ed 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,34 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) from ..utils import ( - sanitized_Request, + float_or_none, + int_or_none, + merge_dicts, + str_or_none, str_to_int, - unified_strdate, + url_or_none, ) -from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P\d+) + ''' _TESTS = [{ # download URL pattern: */P_K_.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070507', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, - } + 'categories': list, + 'tags': list, + }, }, { # download URL pattern: */mp4__.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -45,83 +58,119 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - req = sanitized_Request('http://www.' 
+ mobj.group('url')) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - title = self._html_search_regex( - r'

    ([^<]+)', webpage, 'title') - description = self._html_search_regex( - r'(?s)(.+?)', - webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'by:\s*]*>(.+?)', - webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( - r'by:\s* on (.+?) at \d+:\d+', - webpage, 'upload date', fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'
    ([\d,\.]+) views
    ', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r']*>([\d,\.]+)', - webpage, 'comment count', fatal=False)) - - videos = re.findall( - r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) - heights = [int(video[0]) for video in videos] - video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find(r'flashvars\.encrypted = "true"') != -1: - password = self._search_regex( - r'flashvars\.video_title = "([^"]+)', - webpage, 'password').replace('+', ' ') - video_urls = list(map( - lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), - video_urls)) + title = video['title'] formats = [] - for height, video_url in zip(heights, video_urls): - path = compat_urllib_parse_urlparse(video_url).path - m = re.search(r'/(?P\d+)[pP]_(?P\d+)[kK]', path) - if m: - tbr = int(m.group('tbr')) - height = int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height, - 'height': height, - 'tbr': tbr, + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P\d+)[pP]_(?P\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for 
preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, }) - self._sort_formats(formats) - age_limit = self._rta_search(webpage) + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries - return { + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*]+\bclass=["\']uploaded__by[^>]*>(.+?)
    ', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, - 'comment_count': comment_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, 'formats': formats, - 'age_limit': age_limit, - } + }, info) From 8fae1a04eb20279a76d6b1eccdb8249718ad9942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:42:10 +0700 Subject: [PATCH 0402/1705] [spankwire] Add support for generic embeds (refs #24633) --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/spankwire.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a495ee15a..63b52306a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE +from .spankwire import SpankwireIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2715,6 +2716,11 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Spankwire player + spankwire_urls = SpankwireIE._extract_urls(webpage) + if 
spankwire_urls: + return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 8f67463ed..35ab9ec37 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -67,6 +67,12 @@ class SpankwireIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) From 52c4c51556df15f98c9cda911e36995fe0fc0a47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:56:14 +0700 Subject: [PATCH 0403/1705] [youporn] Add support form generic embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/youporn.py | 23 +++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 63b52306a..0ada6354e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -61,6 +61,7 @@ from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE from .spankwire import SpankwireIE +from .youporn import YouPornIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2721,6 +2722,11 @@ class GenericIE(InfoExtractor): if spankwire_urls: return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + # Look for embedded YouPorn player + youporn_urls = YouPornIE._extract_urls(webpage) + if youporn_urls: + return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) + # Look for embedded 
Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d4eccb4b2..e7fca22de 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - sanitized_Request, str_to_int, unescapeHTML, unified_strdate, @@ -15,7 +14,7 @@ from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P\d+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -57,16 +56,28 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'only_matching': True, + }, { + 'url': 'http://www.youporn.com/watch/505835', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', + webpage) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - request = sanitized_Request(url) - request.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(request, display_id) + webpage = self._download_webpage( + 'http://www.youporn.com/watch/%s' % video_id, display_id, + headers={'Cookie': 'age_verified=1'}) title = self._html_search_regex( r'(?s)]+class=["\']watchVideoTitle[^>]+>(.+?)', From 4e7b5bba5fb73502476c61e4931284c9c3d3d232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 21:27:36 +0700 Subject: [PATCH 
0404/1705] [mofosex] Add support for generic embeds (closes #24633) --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/mofosex.py | 23 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef803b8a7..e407ab3d9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -636,7 +636,10 @@ from .mixcloud import ( from .mlb import MLBIE from .mnet import MnetIE from .moevideo import MoeVideoIE -from .mofosex import MofosexIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) from .mojvideo import MojvideoIE from .morningstar import MorningstarIE from .motherless import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0ada6354e..ce8252f6a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE +from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE from .vimeo import VimeoIE @@ -2717,6 +2718,11 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Mofosex player + mofosex_urls = MofosexEmbedIE._extract_urls(webpage) + if mofosex_urls: + return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) + # Look for embedded Spankwire player spankwire_urls = SpankwireIE._extract_urls(webpage) if spankwire_urls: diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 1c652813a..5234cac02 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,5 +1,8 @@ from __future__ import unicode_literals +import re + +from 
.common import InfoExtractor from ..utils import ( int_or_none, str_to_int, @@ -54,3 +57,23 @@ class MofosexIE(KeezMoviesIE): }) return info + + +class MofosexEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), + ie=MofosexIE.ie_key(), video_id=video_id) From 6a6e1a0cd8bacf5a23f731eedaa1783503470227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 6 Apr 2020 02:05:06 +0700 Subject: [PATCH 0405/1705] [tele5] Fix extraction (closes #24553) --- youtube_dl/extractor/tele5.py | 61 ++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 33a72083b..364556a1f 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,9 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .jwplatform import JWPlatformIE from .nexx import NexxIE -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + NO_DEFAULT, + try_get, +) class Tele5IE(InfoExtractor): @@ -44,14 +54,49 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - if not video_id: + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + 
ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', - r'\s+id\s*=\s*["\']player_(\d{6,})', - r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + + media = self._download_json( + 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, + display_id) + nexx_id = try_get( + media, lambda x: x['playlist'][0]['nexx_id'], compat_str) + + if nexx_id: + return nexx_result(nexx_id) return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, - ie=NexxIE.ie_key(), video_id=video_id) + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=jwplatform_id) From 13b08034b53efdcf7055df92199a0f35cf1e172e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Apr 2020 22:54:34 +0700 Subject: [PATCH 0406/1705] [extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667) --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..c51a3a07d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2340,6 
+2340,8 @@ class InfoExtractor(object): if res is False: return [] ism_doc, urlh = res + if ism_doc is None: + return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) From 91bd3bd0194119fccc91b7eafb7afdcda646ad57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Apr 2020 22:55:36 +0700 Subject: [PATCH 0407/1705] [tv4] Fix ISM formats extraction (closes #24667) --- youtube_dl/extractor/tv4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index a819d048c..c498b0191 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -99,7 +99,7 @@ class TV4IE(InfoExtractor): manifest_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): From c9595ee78027ecf6bedbdc33c690228fa7d3a5bb Mon Sep 17 00:00:00 2001 From: Felix Stupp Date: Tue, 7 Apr 2020 16:21:25 +0000 Subject: [PATCH 0408/1705] [twitch:clips] Extend _VALID_URL (closes #24290) (#24642) --- youtube_dl/extractor/twitch.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0db2dca41..78ee0115c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -643,7 +643,14 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| + (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + ) + (?P[^/?#&]+) + ''' _TESTS = [{ 'url': 
'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -669,6 +676,12 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, }] def _real_extract(self, url): From dcc8522fdba4c9286ebc0548caf05b425bc68773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 02:11:19 +0700 Subject: [PATCH 0409/1705] [motherless] Fix extraction (closes #24699) --- youtube_dl/extractor/motherless.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 43fd70f11..b1615b4d8 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -76,7 +76,8 @@ class MotherlessIE(InfoExtractor): raise ExtractorError('Video %s is for friends only' % video_id, expected=True) title = self._html_search_regex( - r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') 
+ (r'(?s)]+\bclass=["\']media-meta-title[^>]+>(.+?)', + r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), @@ -84,14 +85,15 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - r'Views\s+([^<]+)<', + (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - r'Favorited\s+([^<]+)<', + (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( - r'Uploaded\s+([^<]+)<', webpage, 'upload date') + (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'Uploaded\s+([^<]+)<'), webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') From 5caf88ccb4bfe3d1b53885b78b2bc509ba333f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 03:52:29 +0700 Subject: [PATCH 0410/1705] [nova:embed] Fix extraction (closes #24700) --- youtube_dl/extractor/nova.py | 106 +++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 2850af5db..47b9748f0 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + determine_ext, int_or_none, js_to_json, qualities, @@ -33,42 +34,76 @@ class NovaEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', 
webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - + duration = None formats = [] - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), + + player = self._parse_json( + self._search_regex( + r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', + webpage, 'player', default='{}'), video_id, fatal=False) + if player: + for format_id, format_list in player['tracks'].items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, }) - break - f['format_id'] = f_id - formats.append(f) + duration = int_or_none(player.get('duration')) + else: + # Old path, not actual as of 08.04.2020 + bitrates = self._parse_json( + self._search_regex( + 
r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) title = self._og_search_title( @@ -81,7 +116,8 @@ class NovaEmbedIE(InfoExtractor): r'poster\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='value') duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', + default=duration)) return { 'id': video_id, From 6b09401b0ba95da5669d249c8930b3adb873d96e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 22:42:43 +0700 Subject: [PATCH 0411/1705] [youtube] Skip broken multifeed videos (closes #24711) --- youtube_dl/extractor/youtube.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 908defecd..633b839e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1840,15 +1840,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = 
compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get(feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + 'title': title, }) - feed_ids.append(feed_data['id'][0]) + feed_ids.append(feed_id) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) From b9e5f872916a7d753ae237459b10622c1c2c3471 Mon Sep 17 00:00:00 2001 From: tom Date: Thu, 9 Apr 2020 21:50:45 +1000 Subject: [PATCH 0412/1705] [soundcloud] Extract AAC format --- youtube_dl/extractor/soundcloud.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ff6be0b54..02d56184d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -350,6 +350,8 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) + if f.get('ext') == 'aac': + f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: From 75294a5ed03f4443970478f3f4eac572239cec45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Apr 2020 17:24:21 +0700 Subject: [PATCH 0413/1705] [soundcloud] Improve AAC format extraction (closes #19173, closes #24708) --- youtube_dl/extractor/soundcloud.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 02d56184d..422ce1626 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ 
-246,7 +246,12 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - } + }, + { + # with AAC HQ format available via OAuth token + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, ] _API_V2_BASE = 'https://api-v2.soundcloud.com/' @@ -350,7 +355,8 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) - if f.get('ext') == 'aac': + ext = f.get('ext') + if ext == 'aac': f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) @@ -362,9 +368,13 @@ class SoundcloudIE(InfoExtractor): abr = f.get('abr') if abr: f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'protocol': protocol, 'preference': -10 if preview else None, }) formats.append(f) From 533f3e3557af85e28afd72d291cb51a769c7dd7a Mon Sep 17 00:00:00 2001 From: AndrewMBL <62922222+AndrewMBL@users.noreply.github.com> Date: Tue, 31 Mar 2020 15:25:04 +1100 Subject: [PATCH 0414/1705] [thisoldhouse] Fix video id extraction (closes #24548) Added support for: with of without "www." 
and either ".chorus.build" or ".com" It now validated correctly on older URL's ``` ', - start_page, 'xml filename') + start_page, 'xml filename', default=None) + if not xml_name: + info = self._parse_html5_media_entries(url, start_page, video_id)[0] + info.update({ + 'title': remove_start(self._search_regex( + r'>Session Name:\s*<.*?>\s*(.+?)', start_page, + 'title', default=None) or self._og_search_title( + start_page, default=None), 'GDC Vault - '), + 'id': video_id, + 'display_id': display_id, + }) + return info embed_url = '%s/xml/%s' % (xml_root, xml_name) ie_key = 'DigitallySpeaking' From 04be55307a3a13f7091a9a2970a1cc20863769e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:54:12 +0700 Subject: [PATCH 1215/1705] [funimation] Add support for optional lang code in URLs (closes #28950) --- youtube_dl/extractor/funimation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 8bbedca26..d8f1e169a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -16,7 +16,7 @@ from ..utils import ( class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TOKEN = None @@ -51,6 +51,10 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, + }, { + # with lang code + 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', + 'only_matching': True, }] def _login(self): From b797c1cc750b1f617678281731303611e21c70d0 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 5 May 2021 03:31:24 +0700 Subject: [PATCH 1216/1705] [YoutubeDL] Improve extract_info doc 
(#28946) Co-authored-by: Sergey M. --- youtube_dl/YoutubeDL.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8f65c6499..fe30758ef 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -773,11 +773,20 @@ class YoutubeDL(object): def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False): - ''' - Returns a list with a dictionary for each video we find. - If 'download', also downloads the videos. - extra_info is a dict containing the extra values to add to each result - ''' + """ + Return a list with a dictionary for each video extracted. + + Arguments: + url -- URL to extract + + Keyword arguments: + download -- whether to download videos during extraction + ie_key -- extractor key hint + extra_info -- dictionary containing the extra values to add to each result + process -- whether to resolve all unresolved references (URLs, playlist items), + must be True for download to work. + force_generic_extractor -- force using the generic extractor + """ if not ie_key and force_generic_extractor: ie_key = 'Generic' From 03afef753878c2528260dc8084092556428f4343 Mon Sep 17 00:00:00 2001 From: catboy <79282513+catboy-oss@users.noreply.github.com> Date: Tue, 4 May 2021 20:44:07 +0000 Subject: [PATCH 1217/1705] [medaltv] Relax _VALID_URL (#28884) Co-authored-by: Sergey M. 
--- youtube_dl/extractor/medaltv.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index ef2283dea..67bb4debb 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,7 +15,7 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', @@ -42,6 +42,12 @@ class MedalTVIE(InfoExtractor): 'upload_date': '20201117', 'uploader_id': '5156321', } + }, { + 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'only_matching': True, + }, { + 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'only_matching': True, }] def _real_extract(self, url): From a7260099873acc6dc7d76cafad2f6b139087afd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 04:12:35 +0700 Subject: [PATCH 1218/1705] [blinkx] Remove extractor (closes #28941) No longer exists. 
--- youtube_dl/extractor/blinkx.py | 86 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 87 deletions(-) delete mode 100644 youtube_dl/extractor/blinkx.py diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py deleted file mode 100644 index db5e12b21..000000000 --- a/youtube_dl/extractor/blinkx.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - remove_start, - int_or_none, -) - - -class BlinkxIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P[^?]+)' - IE_NAME = 'blinkx' - - _TEST = { - 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', - 'md5': '337cf7a344663ec79bf93a526a2e06c7', - 'info_dict': { - 'id': 'Da0Gw3xc', - 'ext': 'mp4', - 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', - 'uploader': 'IGN News', - 'upload_date': '20150217', - 'timestamp': 1424215740, - 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', - 'duration': 47.743333, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - display_id = video_id[:8] - - api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' - + 'video=%s' % video_id) - data_json = self._download_webpage(api_url, display_id) - data = json.loads(data_json)['api']['results'][0] - duration = None - thumbnails = [] - formats = [] - for m in data['media']: - if m['type'] == 'jpg': - thumbnails.append({ - 'url': m['link'], - 'width': int(m['w']), - 'height': int(m['h']), - }) - elif m['type'] == 'original': - duration = float(m['d']) - elif m['type'] == 'youtube': - yt_id = m['link'] - self.to_screen('Youtube video detected: %s' % yt_id) - return self.url_result(yt_id, 'Youtube', 
video_id=yt_id) - elif m['type'] in ('flv', 'mp4'): - vcodec = remove_start(m['vcodec'], 'ff') - acodec = remove_start(m['acodec'], 'ff') - vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) - abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) - tbr = vbr + abr if vbr and abr else None - format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) - formats.append({ - 'format_id': format_id, - 'url': m['link'], - 'vcodec': vcodec, - 'acodec': acodec, - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(m.get('w')), - 'height': int_or_none(m.get('h')), - }) - - self._sort_formats(formats) - - return { - 'id': display_id, - 'fullid': video_id, - 'title': data['title'], - 'formats': formats, - 'uploader': data['channel_name'], - 'timestamp': data['pubdate_epoch'], - 'description': data.get('description'), - 'thumbnails': thumbnails, - 'duration': duration, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ac33cd996..71584b1e6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -132,7 +132,6 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) -from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE From 2202cef0e4551293913e0be06b72b4cffccae0aa Mon Sep 17 00:00:00 2001 From: Lukas Anzinger Date: Sun, 16 May 2021 14:54:15 +0200 Subject: [PATCH 1219/1705] [orf:radio] Switch download URLs to HTTPS (closes #29012) (#29046) --- youtube_dl/extractor/orf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 700ce448c..3fadbcbea 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -182,7 +182,7 @@ class ORFRadioIE(InfoExtractor): duration = end - start if end and start else None entries.append({ 'id': loop_stream_id.replace('.mp3', ''), - 'url': 
'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), + 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), 'title': title, 'description': clean_html(data.get('subtitle')), 'duration': duration, From 552b1399110360232bf1bf2ba5ed50cb8b30e818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 20:28:32 +0700 Subject: [PATCH 1220/1705] [generic] Add Referer header for direct videojs download URLs (closes #2879, closes #20217, closes #29053) --- youtube_dl/extractor/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f99d887ca..648a58c77 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3400,6 +3400,9 @@ class GenericIE(InfoExtractor): 'url': src, 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), + 'http_headers': { + 'Referer': full_response.geturl(), + }, }) if formats: self._sort_formats(formats) From 8536dcafd82380c006a25409bdc2c3dc0d14195c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 20:48:24 +0700 Subject: [PATCH 1221/1705] [vk] Add support for sibnet embeds (closes #9500) --- youtube_dl/extractor/vk.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 00ec006c4..6b3513ee0 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -300,6 +300,13 @@ class VKIE(VKBaseIE): 'only_matching': True, }] + @staticmethod + def _extract_sibnet_urls(webpage): + # https://help.sibnet.ru/?sibnet_video_embed + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', + webpage)] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -408,6 +415,10 
@@ class VKIE(VKBaseIE): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + sibnet_urls = self._extract_sibnet_urls(info_page) + if sibnet_urls: + return self.url_result(sibnet_urls[0]) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) From 286e01ce30b4d4d7a631512c3d1f983b30d9059c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 20:50:32 +0700 Subject: [PATCH 1222/1705] [generic] Add support for sibnet embeds --- youtube_dl/extractor/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 648a58c77..7b6f07318 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -126,6 +126,7 @@ from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE +from .vk import VKIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE @@ -2248,6 +2249,11 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 52, }, + { + # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) + 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', + 'only_matching': True, + }, ] def report_following_redirect(self, new_url): @@ -2777,6 +2783,11 @@ class GenericIE(InfoExtractor): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + # Look for sibnet embedded player + sibnet_urls = VKIE._extract_sibnet_urls(webpage) + if sibnet_urls: + return self.playlist_from_matches(sibnet_urls, video_id, video_title) + # Look for embedded ivi player mobj = re.search(r']+?src=(["\'])(?Phttps?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) if mobj is not None: From eb5080286a8882eedbb77d1a8cd72f1c85b75737 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 21:21:14 +0700 Subject: [PATCH 1223/1705] [phoenix] Fix extraction (closes #29057) --- youtube_dl/extractor/phoenix.py | 51 ++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index dbbfce983..e3ea01443 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -9,8 +9,9 @@ from ..compat import compat_str from ..utils import ( int_or_none, merge_dicts, + try_get, unified_timestamp, - xpath_text, + urljoin, ) @@ -27,10 +28,11 @@ class PhoenixIE(ZDFBaseIE): 'title': 'Wohin führt der Protest in der Pandemie?', 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', 'duration': 1691, - 'timestamp': 1613906100, + 'timestamp': 1613902500, 'upload_date': '20210221', 'uploader': 'Phoenix', - 'channel': 'corona nachgehakt', + 'series': 'corona nachgehakt', + 'episode': 'Wohin führt der Protest in der Pandemie?', }, }, { # Youtube embed @@ -79,50 +81,53 @@ class PhoenixIE(ZDFBaseIE): video_id = compat_str(video.get('basename') or video.get('content')) - details = self._download_xml( + details = self._download_json( 'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php', - video_id, 'Downloading details XML', query={ + video_id, 'Downloading details JSON', query={ 'ak': 'web', 'ptmd': 'true', 'id': video_id, 'profile': 'player2', }) - title = title or xpath_text( - details, './/information/title', 'title', fatal=True) - content_id = xpath_text( - details, './/video/details/basename', 'content id', fatal=True) + title = title or details['title'] + content_id = details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id, content_id, None, url) - timestamp = unified_timestamp(xpath_text(details, './/details/airtime')) + duration = int_or_none(try_get( + details, lambda x: 
x['tracking']['nielsen']['content']['length'])) + timestamp = unified_timestamp(details.get('editorialDate')) + series = try_get( + details, lambda x: x['tracking']['nielsen']['content']['program'], + compat_str) + episode = title if details.get('contentType') == 'episode' else None thumbnails = [] - for node in details.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text + teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} + for thumbnail_key, thumbnail_url in teaser_images.items(): + thumbnail_url = urljoin(url, thumbnail_url) if not thumbnail_url: continue thumbnail = { 'url': thumbnail_url, } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) thumbnails.append(thumbnail) return merge_dicts(info, { 'id': content_id, 'title': title, - 'description': xpath_text(details, './/information/detail'), - 'duration': int_or_none(xpath_text(details, './/details/lengthSec')), + 'description': details.get('leadParagraph'), + 'duration': duration, 'thumbnails': thumbnails, 'timestamp': timestamp, - 'uploader': xpath_text(details, './/details/channel'), - 'uploader_id': xpath_text(details, './/details/originChannelId'), - 'channel': xpath_text(details, './/details/originChannelTitle'), + 'uploader': details.get('tvService'), + 'series': series, + 'episode': episode, }) From 6423d7054eb2a73a1557c8531f631330e6d7e084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 21:34:10 +0700 Subject: [PATCH 1224/1705] [options] Fix thumbnail option group name (closes #29042) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 
241cf110f..0a0641bd4 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -768,7 +768,7 @@ def parseOpts(overrideArguments=None): action='store_true', dest='rm_cachedir', help='Delete all filesystem cache files') - thumbnail = optparse.OptionGroup(parser, 'Thumbnail images') + thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options') thumbnail.add_option( '--write-thumbnail', action='store_true', dest='writethumbnail', default=False, From 1e8aaa1d155d2f6b04ba1caa040876372bb0bb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 21:42:38 +0700 Subject: [PATCH 1225/1705] [generic] Add support for og:audio (closes #28311, closes #29015) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7b6f07318..87594534f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3482,7 +3482,7 @@ class GenericIE(InfoExtractor): m_video_type = re.findall(r' Date: Sun, 16 May 2021 16:46:32 +0200 Subject: [PATCH 1226/1705] [vivo] Add support for vivo.st (#29009) Co-authored-by: Sergey M. 
--- youtube_dl/extractor/shared.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 02295d1a4..93ab2a167 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -86,10 +86,10 @@ class SharedIE(SharedBaseIE): class VivoIE(SharedBaseIE): IE_DESC = 'vivo.sx' - _VALID_URL = r'https?://vivo\.sx/(?P[\da-z]{10})' + _VALID_URL = r'https?://vivo\.s[xt]/(?P[\da-z]{10})' _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' - _TEST = { + _TESTS = [{ 'url': 'http://vivo.sx/d7ddda0e78', 'md5': '15b3af41be0b4fe01f4df075c2678b2c', 'info_dict': { @@ -98,7 +98,10 @@ class VivoIE(SharedBaseIE): 'title': 'Chicken', 'filesize': 515659, }, - } + }, { + 'url': 'http://vivo.st/d7ddda0e78', + 'only_matching': True, + }] def _extract_title(self, webpage): title = self._html_search_regex( From 503a3744ad620a2fe21c82cb2c595dc939310a1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 21:56:30 +0700 Subject: [PATCH 1227/1705] [eroprofile] Fix extraction (closes #23200, closes #23626, closes #29008) --- youtube_dl/extractor/eroprofile.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index c08643a17..9ee549d77 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, - unescapeHTML + merge_dicts, ) @@ -77,19 +77,15 @@ class EroProfileIE(InfoExtractor): [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) - video_url = unescapeHTML(self._search_regex( - r'([^<]+)', webpage, 'title') - thumbnail = self._search_regex( - r'onclick="showVideoPlayer\(\)">([^<]+)', r']*>(.+?)

    '), + webpage, 'title') - return { + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, - 'thumbnail': thumbnail, 'age_limit': 18, - } + }) From 199c645bee2052e43ec33cc8d0b0fa0c18853da8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 22:01:51 +0700 Subject: [PATCH 1228/1705] [eroprofile] Skip test --- youtube_dl/extractor/eroprofile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 9ee549d77..c460dc7f9 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor): 'title': 'sexy babe softcore', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + }, + 'skip': 'Video not found', }, { 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', 'md5': '1baa9602ede46ce904c431f5418d8916', From e90a890f01ad253b611d8edd365f41b0c4553b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 22:31:37 +0700 Subject: [PATCH 1229/1705] [playstuff] Add extractor (closes #28901, closes #28931) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/playstuff.py | 65 ++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 youtube_dl/extractor/playstuff.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 71584b1e6..402e542ae 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -925,6 +925,7 @@ from .platzi import ( from .playfm import PlayFMIE from .playplustv import PlayPlusTVIE from .plays import PlaysTVIE +from .playstuff import PlayStuffIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE from .playwire import 
PlaywireIE diff --git a/youtube_dl/extractor/playstuff.py b/youtube_dl/extractor/playstuff.py new file mode 100644 index 000000000..5a329957f --- /dev/null +++ b/youtube_dl/extractor/playstuff.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + smuggle_url, + try_get, +) + + +class PlayStuffIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a', + 'md5': 'c82d3669e5247c64bc382577843e5bd0', + 'info_dict': { + 'id': '6250584958001', + 'ext': 'mp4', + 'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga', + 'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913', + 'uploader_id': '6005208634001', + 'timestamp': 1619491027, + 'upload_date': '20210427', + }, + 'add_ie': ['BrightcoveNew'], + }, { + # geo restricted, bypassable + 'url': 'https://play.stuff.co.nz/details/_6155660351001', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + state = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'), + video_id) + + account_id = try_get( + state, lambda x: x['configurations']['accountId'], + compat_str) or '6005208634001' + player_id = try_get( + state, lambda x: x['configurations']['playerId'], + compat_str) or 'default' + + entries = [] + for item_id, video in state['items'].items(): + if not isinstance(video, dict): + continue + asset_id = try_get( + video, lambda x: x['content']['attributes']['assetId'], + compat_str) + if not asset_id: + continue + entries.append(self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id), + {'geo_countries': ['NZ']}), + 'BrightcoveNew', 
video_id)) + + return self.playlist_result(entries, video_id) From efeb9e0fbf1e33043c19dbfda9d12984bf0c6e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 22:40:39 +0700 Subject: [PATCH 1230/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ChangeLog b/ChangeLog index f15c84225..7c327ab9d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,34 @@ +version + +Core +* [options] Fix thumbnail option group name (#29042) +* [YoutubeDL] Improve extract_info doc (#28946) + +Extractors ++ [playstuff] Add support for play.stuff.co.nz (#28901, #28931) +* [eroprofile] Fix extraction (#23200, #23626, #29008) ++ [vivo] Add support for vivo.st (#29009) ++ [generic] Add support for og:audio (#28311, #29015) +* [phoenix] Fix extraction (#29057) ++ [generic] Add support for sibnet embeds ++ [vk] Add support for sibnet embeds (#9500) ++ [generic] Add Referer header for direct videojs download URLs (#2879, + #20217, #29053) +* [orf:radio] Switch download URLs to HTTPS (#29012, #29046) +- [blinkx] Remove extractor (#28941) +* [medaltv] Relax URL regular expression (#28884) ++ [funimation] Add support for optional lang code in URLs (#28950) ++ [gdcvault] Add support for HTML5 videos +* [dispeak] Improve FLV extraction (#13513, #28970) +* [kaltura] Improve iframe extraction (#28969) +* [kaltura] Make embed code alternatives actually work +* [cda] Improve extraction (#28709, #28937) +* [twitter] Improve formats extraction from vmap URL (#28909) +* [xtube] Fix formats extraction (#28870) +* [svtplay] Improve extraction (#28507, #28876) +* [tv2dk] Fix extraction (#28888) + + version 2021.04.26 Extractors From f47627a1c9a790fac29d5c166bc0f7944fcb1a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 16 May 2021 22:55:05 +0700 Subject: [PATCH 1231/1705] release 2021.05.16 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- 
.github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- README.md | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 6ece3e031..d67bb482c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.26** +- [ ] I've verified that I'm running youtube-dl version **2021.05.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.26 + [debug] youtube-dl version 2021.05.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index f923b2d5f..efe9fef8c 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version 
**2021.04.26** +- [ ] I've verified that I'm running youtube-dl version **2021.05.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 97d605653..e213fc1a9 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.26** +- [ ] I've verified that I'm running youtube-dl version **2021.05.16** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 73a806833..1645087ad 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.26** +- [ ] I've verified that I'm running youtube-dl version **2021.05.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.26 + [debug] youtube-dl version 2021.05.16 [debug] Python version 2.7.11 - 
Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index ee19a75f5..e6e569af6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.26** +- [ ] I've verified that I'm running youtube-dl version **2021.05.16** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 7c327ab9d..5ea1d3150 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.05.16 Core * [options] Fix thumbnail option group name (#29042) diff --git a/README.md b/README.md index 94c34d89a..059141611 100644 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files -## Thumbnail images: +## Thumbnail Options: --write-thumbnail Write thumbnail image to disk --write-all-thumbnails Write all thumbnail image formats to disk diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 88d474de4..ed0d5e9d9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -119,7 +119,6 @@ - **BitChuteChannel** - **BleacherReport** - **BleacherReportCMS** - - **blinkx** - **Bloomberg** - **BokeCC** - **BongaCams** @@ -713,6 +712,7 @@ - **play.fm** - **player.sky.it** - **PlayPlusTV** + - **PlayStuff** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 576f721db..b16a84100 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import 
unicode_literals -__version__ = '2021.04.26' +__version__ = '2021.05.16' From e1a9d0ef780b7a0cdcdc706909ad2de9bd06138c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 17 May 2021 12:37:39 +0100 Subject: [PATCH 1232/1705] [shahid] relax _VALID_URL(closes #28772, closes #28930) --- youtube_dl/extractor/shahid.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index b5e093bd2..88b938e05 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -21,6 +21,7 @@ from ..utils import ( class ShahidBaseIE(AWSIE): _AWS_PROXY_HOST = 'api2.shahid.net' _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/' def _handle_error(self, e): fail_data = self._parse_json( @@ -49,7 +50,7 @@ class ShahidBaseIE(AWSIE): class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' - _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' + _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', 'info_dict': { @@ -73,6 +74,9 @@ class ShahidIE(ShahidBaseIE): # shahid plus subscriber only 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', 'only_matching': True + }, { + 'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319', + 'only_matching': True }] def _real_initialize(self): @@ -168,7 +172,7 @@ class ShahidIE(ShahidBaseIE): class ShahidShowIE(ShahidBaseIE): - _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _VALID_URL = 
ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' _TESTS = [{ 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', 'info_dict': { From dfbbe2902fc67f0f93ee47a8077c148055c67a9b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 17 May 2021 12:56:49 +0100 Subject: [PATCH 1233/1705] [redbulltv] fix embed data extraction(closes #28770) --- youtube_dl/extractor/redbulltv.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 3aae79f5d..6d000b372 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -133,8 +133,10 @@ class RedBullEmbedIE(RedBullTVIE): rrn_id = self._match_id(url) asset_id = self._download_json( 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', - rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'}, - query={ + rrn_id, headers={ + 'Accept': 'application/json', + 'API-KEY': 'e90a1ff11335423998b100c929ecc866', + }, query={ 'query': '''{ resource(id: "%s", enforceGeoBlocking: false) { %s From 1980ff4550a3f040fbc1e054d6b91013e9d8cb96 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 26 May 2021 11:04:39 +0100 Subject: [PATCH 1234/1705] [vimeo] fix vimeo pro embed extraction(closes #29126) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 102687b82..0b386f450 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -647,7 +647,7 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - if '://player.vimeo.com/video/' in url: + if '//player.vimeo.com/video/' in url: config = self._parse_json(self._search_regex( r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) if config.get('view') == 4: From 
24297a42efc52862cb9510d32b28efd7faf49af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 May 2021 00:36:26 +0700 Subject: [PATCH 1235/1705] [youtube] Fix get_video_info request (closes #29086, closes #29165) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0c52e5a8b..bf858c39d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1499,6 +1499,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'unable to download video info webpage', query={ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'html5': 1, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From e13a01061d149f4fac7db1a50124c4745a11c16e Mon Sep 17 00:00:00 2001 From: phlip Date: Fri, 28 May 2021 11:01:59 +1000 Subject: [PATCH 1236/1705] [twitch:clips] Add access token query to download URLs (closes #29136) --- youtube_dl/extractor/twitch.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a7867f4d3..7f9738d43 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -49,6 +49,7 @@ class TwitchBaseIE(InfoExtractor): 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', } @@ -924,6 +925,17 @@ class TwitchClipsIE(TwitchBaseIE): raise ExtractorError( 'This clip is no longer 
available', expected=True) + access_token = self._download_gql( + video_id, [{ + 'operationName': 'VideoAccessToken_Clip', + 'variables': { + 'slug': video_id, + }, + }], + 'Downloading access token GraphQL') + access_token = try_get( + access_token, lambda x: x[0]['data']['clip']['playbackAccessToken']) + formats = [] for option in clip.get('videoQualities', []): if not isinstance(option, dict): @@ -931,6 +943,14 @@ class TwitchClipsIE(TwitchBaseIE): source = url_or_none(option.get('sourceURL')) if not source: continue + if access_token: + source = "%s%s%s" % ( + source, + "&" if "?" in source else "?", + compat_urllib_parse_urlencode({ + "sig": access_token.get('signature'), + "token": access_token.get('value'), + })) formats.append({ 'url': source, 'format_id': option.get('quality'), From f3cd1d9cec91943a459a0662cbcffe3b2e1f6675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 May 2021 01:46:49 +0700 Subject: [PATCH 1237/1705] [twitch:clips] Improve extraction (closes #29149) --- youtube_dl/extractor/twitch.py | 48 ++++++++++++++++------------------ 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 7f9738d43..a378bd6dc 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -894,7 +894,25 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_base_gql( + clip = self._download_gql( + video_id, [{ + 'operationName': 'VideoAccessToken_Clip', + 'variables': { + 'slug': video_id, + }, + }], + 'Downloading clip access token GraphQL')[0]['data']['clip'] + + if not clip: + raise ExtractorError( + 'This clip is no longer available', expected=True) + + access_query = { + 'sig': clip['playbackAccessToken']['signature'], + 'token': clip['playbackAccessToken']['value'], + } + + data = self._download_base_gql( video_id, { 'query': '''{ clip(slug: "%s") { @@ -919,22 +937,10 
@@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL', fatal=False) - if not clip: - raise ExtractorError( - 'This clip is no longer available', expected=True) - - access_token = self._download_gql( - video_id, [{ - 'operationName': 'VideoAccessToken_Clip', - 'variables': { - 'slug': video_id, - }, - }], - 'Downloading access token GraphQL') - access_token = try_get( - access_token, lambda x: x[0]['data']['clip']['playbackAccessToken']) + if data: + clip = try_get(data, lambda x: x['data']['clip'], dict) or clip formats = [] for option in clip.get('videoQualities', []): @@ -943,16 +949,8 @@ class TwitchClipsIE(TwitchBaseIE): source = url_or_none(option.get('sourceURL')) if not source: continue - if access_token: - source = "%s%s%s" % ( - source, - "&" if "?" in source else "?", - compat_urllib_parse_urlencode({ - "sig": access_token.get('signature'), - "token": access_token.get('value'), - })) formats.append({ - 'url': source, + 'url': update_url_query(source, access_query), 'format_id': option.get('quality'), 'height': int_or_none(option.get('quality')), 'fps': int_or_none(option.get('frameRate')), From 6511b8e8d7db78d4ba3706df5122a74e1c9b9b57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 May 2021 03:05:22 +0700 Subject: [PATCH 1238/1705] [ted] Prefer own formats over external sources (closes #29142) --- youtube_dl/extractor/ted.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 63e2455b2..f09f1a3f9 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -123,6 +123,10 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # with own formats and private Youtube external + 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity', + 'only_matching': True, }] 
_NATIVE_FORMATS = { @@ -210,16 +214,6 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] - external = player_talk.get('external') - if isinstance(external, dict): - service = external.get('service') - if isinstance(service, compat_str): - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - - return self.url_result(ext_url or external['uri']) - resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None @@ -294,6 +288,16 @@ class TEDIE(InfoExtractor): 'vcodec': 'none', }) + if not formats: + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + return self.url_result(ext_url or external['uri']) + self._sort_formats(formats) video_id = compat_str(talk_info['id']) From 2ee6c7f11074917c08253af4c47f9258aa1e0dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 May 2021 03:43:59 +0700 Subject: [PATCH 1239/1705] [ustream] Detect https embeds (closes #29133) --- youtube_dl/extractor/ustream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 9e860aeb7..1e29cbe22 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -75,7 +75,7 @@ class UstreamIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?Phttp://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) + r']+?src=(["\'])(?Phttps?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url') From d495292852b6c2f1bd58bc2141ff2b0265c952cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 May 2021 06:14:59 +0700 Subject: [PATCH 1240/1705] [ard] Relax _VALID_URL and fix video ids (closes 
#22724, closes #29091) --- youtube_dl/extractor/ard.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index d57c5ba0f..d45a9fe52 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -249,14 +249,14 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ # available till 7.01.2022 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'maischberger-die-woche', - 'id': '100', + 'id': 'maischberger-die-woche-video100', + 'display_id': 'maischberger-die-woche-video100', 'ext': 'mp4', 'duration': 3687.0, 'title': 'maischberger. die woche vom 7. 
Januar 2021', @@ -264,16 +264,25 @@ class ARDIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', + 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html', 'only_matching': True, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = mobj.group('id') player_url = mobj.group('mainurl') + '~playerXml.xml' doc = self._download_xml(player_url, display_id) @@ -324,7 +333,7 @@ class ARDIE(InfoExtractor): self._sort_formats(formats) return { - 'id': mobj.group('id'), + 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, 'display_id': display_id, 'title': video_node.find('./title').text, From 82f3993ba3f4d435d3bc9e37426ab225f5549510 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 4 Jun 2021 17:51:44 +0100 Subject: [PATCH 1241/1705] [formula1] fix extraction(closes #29206) --- youtube_dl/extractor/formula1.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index fecfc28ae..67662e6de 
100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,29 +5,23 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P.+?)\.html' - _TESTS = [{ - 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', - 'md5': '8c79e54be72078b26b89e0e111c0502b', + _VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P\d+)\.html' + _TEST = { + 'url': 'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html', + 'md5': 'be7d3a8c2f804eb2ab2aa5d941c359f8', 'info_dict': { - 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', + 'id': '6060988138001', 'ext': 'mp4', 'title': 'Race highlights - Spain 2016', + 'timestamp': 1463332814, + 'upload_date': '20160515', + 'uploader_id': '6057949432001', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', - 'only_matching': True, - }] + 'add_ie': ['BrightcoveNew'], + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s' def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - ooyala_embed_code = self._search_regex( - r'data-videoid="([^"]+)"', webpage, 'ooyala embed code') + bc_id = self._match_id(url) return self.url_result( - 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code) + self.BRIGHTCOVE_URL_TEMPLATE % bc_id, 'BrightcoveNew', bc_id) From 943070af4a9e13ef2b81c5e484d9c799f1845aab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 5 Jun 2021 23:42:25 +0700 Subject: [PATCH 1242/1705] [orf:tvthek] Fix thumbnails extraction (closes #29217) --- youtube_dl/extractor/orf.py | 21 ++++++++++++++++++++- 1 file changed, 20 
insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 3fadbcbea..ed8a9a841 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -140,6 +140,25 @@ class ORFTVthekIE(InfoExtractor): }) upload_date = unified_strdate(sd.get('created_date')) + + thumbnails = [] + preview = sd.get('preview_image_url') + if preview: + thumbnails.append({ + 'id': 'preview', + 'url': preview, + 'preference': 0, + }) + image = sd.get('image_full_url') + if not image and len(data_jsb) == 1: + image = self._og_search_thumbnail(webpage) + if image: + thumbnails.append({ + 'id': 'full', + 'url': image, + 'preference': 1, + }) + entries.append({ '_type': 'video', 'id': video_id, @@ -149,7 +168,7 @@ class ORFTVthekIE(InfoExtractor): 'description': sd.get('description'), 'duration': int_or_none(sd.get('duration_in_seconds')), 'upload_date': upload_date, - 'thumbnail': sd.get('image_full_url'), + 'thumbnails': thumbnails, }) return { From fdf91c52a8b58b3b7c12a393629fc962d6ab7618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Jun 2021 00:11:09 +0700 Subject: [PATCH 1243/1705] [youporn] Fix formats and view count extraction (closes #29216) --- youtube_dl/extractor/youporn.py | 111 ++++++++++++-------------------- 1 file changed, 42 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 33114363d..7084d3d12 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -4,13 +4,12 @@ import re from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, str_to_int, - unescapeHTML, unified_strdate, url_or_none, ) -from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): @@ -34,6 +33,7 @@ class YouPornIE(InfoExtractor): 'tags': list, 'age_limit': 18, }, + 'skip': 'This video has been disabled', }, { # Unknown uploader 'url': 
'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', @@ -78,6 +78,40 @@ class YouPornIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + definitions = self._download_json( + 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, + display_id) + + formats = [] + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = url_or_none(definition.get('videoUrl')) + if not video_url: + continue + f = { + 'url': video_url, + 'filesize': int_or_none(definition.get('videoSize')), + } + height = int_or_none(definition.get('quality')) + # Video URL's path looks like this: + # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 + # We will benefit from it by extracting some metadata + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'tbr': bitrate, + }) + f['height'] = height + formats.append(f) + self._sort_formats(formats) + webpage = self._download_webpage( 'http://www.youporn.com/watch/%s' % video_id, display_id, headers={'Cookie': 'age_verified=1'}) @@ -88,65 +122,6 @@ class YouPornIE(InfoExtractor): webpage, default=None) or self._html_search_meta( 'title', webpage, fatal=True) - links = [] - - # Main source - definitions = self._parse_json( - self._search_regex( - r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage, - 'media definitions', default='[]'), - video_id, fatal=False) - if definitions: - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = 
url_or_none(definition.get('videoUrl')) - if video_url: - links.append(video_url) - - # Fallback #1, this also contains extra low quality 180p format - for _, link in re.findall(r']+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage): - links.append(link) - - # Fallback #2 (unavailable as at 22.06.2017) - sources = self._search_regex( - r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) - if sources: - for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): - links.append(link) - - # Fallback #3 (unavailable as at 22.06.2017) - for _, link in re.findall( - r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): - links.append(link) - - # Fallback #4, encrypted links (unavailable as at 22.06.2017) - for _, encrypted_link in re.findall( - r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): - links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) - - formats = [] - for video_url in set(unescapeHTML(link) for link in links): - f = { - 'url': video_url, - } - # Video URL's path looks like this: - # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 - # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) - if mobj: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - formats.append(f) - self._sort_formats(formats) - description = self._html_search_regex( r'(?s)]+\bid=["\']description["\'][^>]*>(.+?)', webpage, 'description', @@ -169,13 +144,12 @@ class YouPornIE(InfoExtractor): age_limit = self._rta_search(webpage) - 
average_rating = int_or_none(self._search_regex( - r']+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%', - webpage, 'average rating', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'(?s)]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P[\d,.]+)<', - webpage, 'view count', fatal=False, group='count')) + view_count = None + views = self._search_regex( + r'(]+\bclass=["\']js_videoInfoViews["\']>)', webpage, + 'views', default=None) + if views: + view_count = str_to_int(extract_attributes(views).get('data-value')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', default=None)) @@ -201,7 +175,6 @@ class YouPornIE(InfoExtractor): 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, - 'average_rating': average_rating, 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, From bb7ac1ed669d67d79fa1a3b9e5c70271892ecbcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Jun 2021 01:16:43 +0700 Subject: [PATCH 1244/1705] [facebook] Improve login required detection --- youtube_dl/extractor/facebook.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cb34c59f5..04650af39 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -521,7 +521,10 @@ class FacebookIE(InfoExtractor): raise ExtractorError( 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) - elif '>You must log in to continue' in webpage: + elif any(p in webpage for p in ( + '>You must log in to continue', + 'id="login_form"', + 'id="loginbutton"')): self.raise_login_required() if not video_data and '/watchparty/' in url: From 5f85eb820cb7eb89dcb567f9cbfefb5d9038b9c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Jun 2021 01:32:15 +0700 Subject: [PATCH 1245/1705] [ChangeLog] 
Actualize [ci skip] --- ChangeLog | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ChangeLog b/ChangeLog index 5ea1d3150..06efe32ab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version + +Extractors +* [facebook] Improve login required detection +* [youporn] Fix formats and view count extraction (#29216) +* [orf:tvthek] Fix thumbnails extraction (#29217) +* [formula1] Fix extraction (#29206) +* [ard] Relax URL regular expression and fix video ids (#22724, #29091) ++ [ustream] Detect https embeds (#29133) +* [ted] Prefer own formats over external sources (#29142) +* [twitch:clips] Improve extraction (#29149) ++ [twitch:clips] Add access token query to download URLs (#29136) +* [youtube] Fix get_video_info request (#29086, #29165) +* [vimeo] Fix vimeo pro embed extraction (#29126) +* [redbulltv] Fix embed data extraction (#28770) +* [shahid] Relax URL regular expression (#28772, #28930) + + version 2021.05.16 Core From b224cf39d53bd16bcfda2ac493712c3ff449ecb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 6 Jun 2021 01:38:22 +0700 Subject: [PATCH 1246/1705] release 2021.06.06 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d67bb482c..4eb505231 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.05.16** +- [ ] I've verified that I'm running youtube-dl version **2021.06.06** - [ ] I've checked that all provided URLs 
are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.05.16 + [debug] youtube-dl version 2021.06.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index efe9fef8c..9fed0b489 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.05.16** +- [ ] I've verified that I'm running youtube-dl version **2021.06.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index e213fc1a9..573e8ded0 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.05.16** +- [ ] I've verified that I'm running youtube-dl version **2021.06.06** - [ ] I've searched the bugtracker for similar site 
feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 1645087ad..c0031bf7a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.05.16** +- [ ] I've verified that I'm running youtube-dl version **2021.06.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.05.16 + [debug] youtube-dl version 2021.06.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index e6e569af6..1138ab2ca 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.05.16** +- [ ] I've verified that I'm running youtube-dl version **2021.06.06** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 06efe32ab..680fffdf8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.06.06 Extractors * [facebook] Improve login required 
detection diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b16a84100..461dd87ca 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.05.16' +__version__ = '2021.06.06' From c2350cac243ba1ec1586fe85b0d62d1b700047a2 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 6 Jun 2021 05:32:27 +0700 Subject: [PATCH 1247/1705] [README.md] Update MSVC 2010 redist URL (closes #29222) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 059141611..2841ed68f 100644 --- a/README.md +++ b/README.md @@ -893,7 +893,7 @@ Since June 2012 ([#342](https://github.com/ytdl-org/youtube-dl/issues/342)) yout ### The exe throws an error due to missing `MSVCR100.dll` -To run the exe you need to install first the [Microsoft Visual C++ 2010 Redistributable Package (x86)](https://www.microsoft.com/en-US/download/details.aspx?id=5555). +To run the exe you need to install first the [Microsoft Visual C++ 2010 Service Pack 1 Redistributable Package (x86)](https://download.microsoft.com/download/1/6/5/165255E7-1014-4D0A-B094-B6A430A6BFFC/vcredist_x86.exe). ### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files? 
From d156bc8d59dd469bf70b822926504f213ce237de Mon Sep 17 00:00:00 2001 From: kikuyan Date: Thu, 17 Jun 2021 06:02:06 +0900 Subject: [PATCH 1248/1705] [orf:tvthek] Add support for MPD formats (closes #28672) (#29236) --- youtube_dl/extractor/orf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index ed8a9a841..8d537d7ae 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -98,6 +98,9 @@ class ORFTVthekIE(InfoExtractor): elif ext == 'f4m': formats.extend(self._extract_f4m_formats( src, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, From 8fe5d54eb721f1bbb8c8a0d18810a42d1257e406 Mon Sep 17 00:00:00 2001 From: kikuyan Date: Thu, 17 Jun 2021 06:12:13 +0900 Subject: [PATCH 1249/1705] [appleconnect] Fix extraction (#29208) --- youtube_dl/extractor/appleconnect.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index a84b8b1eb..494f8330c 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -9,10 +9,10 @@ from ..utils import ( class AppleConnectIE(InfoExtractor): - _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P[\w-]+)' - _TEST = { + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P[\w-]+)' + _TESTS = [{ 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': 'e7c38568a01ea45402570e6029206723', + 'md5': 'c1d41f72c8bcaf222e089434619316e4', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', @@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor): 'upload_date': '20150710', 'timestamp': 1436545535, }, - } + }, { + 'url': 
'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -36,7 +39,7 @@ class AppleConnectIE(InfoExtractor): video_data = self._parse_json(video_json, video_id) timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) - like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None)) return { 'id': video_id, From a7f61feab2dbfc50a7ebe8b0ea390bd0e5edf77a Mon Sep 17 00:00:00 2001 From: kikuyan Date: Thu, 17 Jun 2021 12:34:33 +0900 Subject: [PATCH 1250/1705] [egghead] Add support for app.egghead.io (closes #28404) (#29303) Co-authored-by: Sergey M. --- youtube_dl/extractor/egghead.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index aff9b88c0..9bbd703e0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -22,16 +22,19 @@ class EggheadBaseIE(InfoExtractor): class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' - _VALID_URL = r'https://egghead\.io/courses/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': '72', + 'id': '432655', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, - } + }, { + 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript', + 'only_matching': True, + }] def _real_extract(self, url): playlist_id 
= self._match_id(url) @@ -65,7 +68,7 @@ class EggheadCourseIE(EggheadBaseIE): class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { @@ -88,6 +91,9 @@ class EggheadLessonIE(EggheadBaseIE): }, { 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', 'only_matching': True, + }, { + 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'only_matching': True, }] def _real_extract(self, url): From 3a7ef27cf306a0a8f79ebd78ae60329c53080d14 Mon Sep 17 00:00:00 2001 From: kikuyan Date: Mon, 21 Jun 2021 01:58:19 +0900 Subject: [PATCH 1251/1705] [postprocessor/ffmpeg] Show ffmpeg output on error (refs #22680) (#29336) --- youtube_dl/postprocessor/ffmpeg.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5f7298345..9f76c9d4e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -231,7 +231,10 @@ class FFmpegPostProcessor(PostProcessor): stdout, stderr = p.communicate() if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') - msg = stderr.strip().split('\n')[-1] + msgs = stderr.strip().split('\n') + msg = msgs[-1] + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] ' + '\n'.join(msgs[:-1])) raise FFmpegPostProcessorError(msg) self.try_utime(out_path, oldest_mtime, oldest_mtime) From 57b9a4b4c6cf2580b5007db78bd333a9a237fd47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 00:36:28 +0700 Subject: [PATCH 1252/1705] [nrk] Switch psapi URL to https (closes #29344) Catalog calls no 
longer work via http --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 40dee2162..6d01a25c3 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -58,7 +58,7 @@ class NRKBaseIE(InfoExtractor): def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): return self._download_json( - urljoin('http://psapi.nrk.no/', path), + urljoin('https://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) From cc21aebe9071660ba558dae75c3066a4a3b38820 Mon Sep 17 00:00:00 2001 From: Logan B Date: Mon, 21 Jun 2021 05:41:14 +1200 Subject: [PATCH 1253/1705] [umg:de] Update GraphQL API URL (#29304) Previous one no longer resolves Co-authored-by: Sergey M. --- youtube_dl/extractor/umg.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py index d815cd9a6..47948b6ce 100644 --- a/youtube_dl/extractor/umg.py +++ b/youtube_dl/extractor/umg.py @@ -28,7 +28,7 @@ class UMGDeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - 'https://api.universal-music.de/graphql', + 'https://graphql.universal-music.de/', video_id, query={ 'query': '''{ universalMusic(channel:16) { @@ -56,11 +56,9 @@ class UMGDeIE(InfoExtractor): formats = [] def add_m3u8_format(format_id): - m3u8_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( hls_url_template % format_id, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal='False') - if m3u8_formats and m3u8_formats[0].get('height'): - formats.extend(m3u8_formats) + 'm3u8_native', m3u8_id='hls', fatal=False)) for f in video_data.get('formats', []): f_url = f.get('url') From 41317030017418c89742594a80c0596c4b26bbb9 Mon Sep 17 00:00:00 2001 From: 
bopol Date: Sun, 20 Jun 2021 19:42:09 +0200 Subject: [PATCH 1254/1705] [youtube] Update invidious instance list (#29281) --- youtube_dl/extractor/youtube.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bf858c39d..35058950a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -353,7 +353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', r'(?:www\.)?invidious\.tinfoil-hat\.net', r'(?:www\.)?invidious\.himiko\.cloud', r'(?:www\.)?invidious\.reallyancient\.tech', @@ -380,6 +380,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', r'(?:www\.)?kgg2m7yk5aybusll\.onion', r'(?:www\.)?qklhadlycap4cnod\.onion', r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', @@ -388,6 +396,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + 
r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', ) _VALID_URL = r"""(?x)^ ( From 4c77a2e538fb23da116aaba0f51e314ef76feb68 Mon Sep 17 00:00:00 2001 From: Tianyi Shi Date: Sun, 20 Jun 2021 19:03:21 +0100 Subject: [PATCH 1255/1705] [bilibili] Strip uploader name (#29202) --- youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 589fdc1ce..bff6ea194 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -233,7 +233,7 @@ class BiliBiliIE(InfoExtractor): webpage) if uploader_mobj: info.update({ - 'uploader': uploader_mobj.group('name'), + 'uploader': uploader_mobj.group('name').strip(), 'uploader_id': uploader_mobj.group('id'), }) if not info.get('uploader'): From 03ab02730f77da5b7ad05ca78ff1624d8226ec5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 01:34:27 +0700 Subject: [PATCH 1256/1705] [youtube] Workaround for get_video_info request (refs #29333) See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 --- youtube_dl/extractor/youtube.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35058950a..e68214008 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1512,6 +1512,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'html5': 1, + # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 + 'c': 'TVHTML5', + 'cver': '6.20180913', }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 47f2f2fbe9730b041b91451d17279216f311ffc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 01:35:21 +0700 Subject: [PATCH 1257/1705] [youtube] Make get_video_info 
processing more robust (closes #29333) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e68214008..dc4bd4a77 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1504,22 +1504,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playability_status = player_response.get('playabilityStatus') or {} if playability_status.get('reason') == 'Sign in to confirm your age': - pr = self._parse_json(try_get(compat_parse_qs( - self._download_webpage( - base_url + 'get_video_info', video_id, - 'Refetching age-gated info webpage', - 'unable to download video info webpage', query={ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'html5': 1, - # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 - 'c': 'TVHTML5', - 'cver': '6.20180913', - }, fatal=False)), - lambda x: x['player_response'][0], - compat_str) or '{}', video_id) - if pr: - player_response = pr + video_info = self._download_webpage( + base_url + 'get_video_info', video_id, + 'Refetching age-gated info webpage', + 'unable to download video info webpage', query={ + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'html5': 1, + # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 + 'c': 'TVHTML5', + 'cver': '6.20180913', + }, fatal=False) + if video_info: + pr = self._parse_json( + try_get( + compat_parse_qs(video_info), + lambda x: x['player_response'][0], compat_str) or '{}', + video_id, fatal=False) + if pr and isinstance(pr, dict): + player_response = pr trailer_video_id = try_get( playability_status, From 2ccee8db74c36eb1254cdffd4e691e56c0ce0724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 01:54:52 +0700 Subject: [PATCH 1258/1705] [curiositystream:collection] Extend 
_VALID_URL (closes #26326, closes #29117) --- youtube_dl/extractor/curiositystream.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index ae64a07d7..48ff30432 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -145,7 +145,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P\d+)' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P\d+)' _TESTS = [{ 'url': 'https://app.curiositystream.com/collection/2', 'info_dict': { @@ -157,6 +157,9 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, + }, { + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, }] def _real_extract(self, url): From da32828208743c8012c8eea01780cbf9b3f60436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 03:22:37 +0700 Subject: [PATCH 1259/1705] [pornhub] Dismiss tbr extracted from download URLs (closes #28927) No longer reliable --- youtube_dl/extractor/pornhub.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 031454600..10516ee5a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -408,17 +408,14 @@ class PornHubIE(PornHubBaseIE): format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) return - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) + if not height: + height = int_or_none(self._search_regex( + 
r'(?P\d+)[pP]?_\d+[kK]', format_url, 'height', + default=None)) formats.append({ 'url': format_url, 'format_id': '%dp' % height if height else None, 'height': height, - 'tbr': tbr, }) for video_url, height in video_urls: @@ -440,7 +437,8 @@ class PornHubIE(PornHubBaseIE): add_format(video_url, height) continue add_format(video_url) - self._sort_formats(formats) + self._sort_formats( + formats, field_preference=('height', 'width', 'fps', 'format_id')) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', From 751c9ae39a0bb9c66eca888a12595624db00bf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 03:33:43 +0700 Subject: [PATCH 1260/1705] [pornhub] Detect geo restriction --- youtube_dl/extractor/pornhub.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 10516ee5a..d74e69ed9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -236,6 +236,10 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', 'only_matching': True, + }, { + # geo restricted + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, }] @staticmethod @@ -275,6 +279,11 @@ class PornHubIE(PornHubBaseIE): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) + if any(re.search(p, webpage) for p in ( + r'class=["\']geoBlocked["\']', + r'>\s*This content is unavailable in your country')): + self.raise_geo_restricted() + # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. 
From cb668eb973b8f09152bb48e3b49a014d3cb72b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Jun 2021 04:08:15 +0700 Subject: [PATCH 1261/1705] [pornhub] Add support for pornhubthbh7ap3u.onion --- youtube_dl/extractor/pornhub.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index d74e69ed9..e2e1500ff 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -30,6 +30,7 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): _NETRC_MACHINE = 'pornhub' + _PORNHUB_HOST_RE = r'(?:(?Ppornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): @@ -122,11 +123,13 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)? 
+ %s + /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) - ''' + ''' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', @@ -240,6 +243,9 @@ class PornHubIE(PornHubBaseIE): # geo restricted 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, }] @staticmethod @@ -520,7 +526,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -549,6 +555,9 @@ class PornHubUserIE(PornHubPlaylistBaseIE): # Same as before, multi page 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', 'only_matching': True, + }, { + 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', + 'only_matching': True, }] def _real_extract(self, url): @@ -624,7 +633,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -729,6 +738,9 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): }, { 'url': 'https://de.pornhub.com/playlist/4667351', 'only_matching': True, + }, { + 'url': 
'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', + 'only_matching': True, }] @classmethod @@ -739,7 +751,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { @@ -749,4 +761,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] From 379f52a4954013767219d25099cce9e0f9401961 Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Mon, 21 Jun 2021 03:23:50 +0600 Subject: [PATCH 1262/1705] [liveleak] Remove extractor (closes #17625, closes #24222) (#29331) --- youtube_dl/extractor/extractors.py | 4 - youtube_dl/extractor/generic.py | 31 ----- youtube_dl/extractor/liveleak.py | 191 ----------------------------- 3 files changed, 226 deletions(-) delete mode 100644 youtube_dl/extractor/liveleak.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 402e542ae..6e8fc3961 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -610,10 +610,6 @@ from .linkedin import ( from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE from .livejournal import LiveJournalIE -from .liveleak import ( - LiveLeakIE, - LiveLeakEmbedIE, -) from .livestream import ( LivestreamIE, LivestreamOriginalIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 
87594534f..a9c064105 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .arkena import ArkenaIE from .instagram import InstagramIE -from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .kaltura import KalturaIE @@ -1629,31 +1628,6 @@ class GenericIE(InfoExtractor): 'upload_date': '20160409', }, }, - # LiveLeak embed - { - 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': '7619da8c820e835bef21a1efa2a0fc71', - 'info_dict': { - 'id': '874_1459135191', - 'ext': 'mp4', - 'title': 'Man shows poor quality of new apartment building', - 'description': 'The wall is like a sand pile.', - 'uploader': 'Lake8737', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Another LiveLeak embed pattern (#13336) - { - 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', - 'info_dict': { - 'id': '2eb_1496309988', - 'ext': 'mp4', - 'title': 'Thief robs place where everyone was armed', - 'description': 'md5:694d73ee79e535953cf2488562288eee', - 'uploader': 'brazilwtf', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, # Duplicated embedded video URLs { 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', @@ -3179,11 +3153,6 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - # Look for LiveLeak embeds - liveleak_urls = LiveLeakIE._extract_urls(webpage) - if liveleak_urls: - return self.playlist_from_matches(liveleak_urls, video_id, video_title) - # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) if threeqsdn_url: diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py deleted file mode 100644 index 4ac437c8b..000000000 --- a/youtube_dl/extractor/liveleak.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import unicode_literals - -import re - -from 
.common import InfoExtractor -from ..utils import int_or_none - - -class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P[\w_]+)' - _TESTS = [{ - 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '0813c2430bea7a46bf13acf3406992f4', - 'info_dict': { - 'id': '757_1364311680', - 'ext': 'mp4', - 'description': 'extremely bad day for this guy..!', - 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', - 'info_dict': { - 'id': 'f93_1390833151', - 'ext': 'mp4', - 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', - 'uploader': 'ARD_Stinkt', - 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Prochan embed - 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', - 'md5': '42c6d97d54f1db107958760788c5f48f', - 'info_dict': { - 'id': '4f7_1392687779', - 'ext': 'mp4', - 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.", - 'uploader': 'CapObveus', - 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', - 'age_limit': 18, - }, - 'skip': 'Video is dead', - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/5983 - # Multiple resolutions - 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', - 'info_dict': { - 'id': '801_1409392012', - 'ext': 'mp4', - 'description': 'Happened on 27.7.2014. 
\r\nAt 0:53 you can see people still swimming at near beach.', - 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521 - 'url': 'http://m.liveleak.com/view?i=763_1473349649', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': '763_1473349649', - 'ext': 'mp4', - 'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty', - 'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.', - 'uploader': 'Ziz', - 'upload_date': '20160908', - 'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw' - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.liveleak.com/view?i=677_1439397581', - 'info_dict': { - 'id': '677_1439397581', - 'title': 'Fuel Depot in China Explosion caught on video', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', - 'only_matching': True, - }, { - # No original video - 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._og_search_description(webpage) - video_uploader = self._html_search_regex( - r'By:.*?(\w+)', webpage, 'uploader', fatal=False) - age_limit = int_or_none(self._search_regex( - r'you confirm that you are ([0-9]+) years and over.', - webpage, 'age limit', default=None)) - video_thumbnail = self._og_search_thumbnail(webpage) - - entries = self._parse_html5_media_entries(url, 
webpage, video_id) - if not entries: - # Maybe an embed? - embed_url = self._search_regex( - r']+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } - - for idx, info_dict in enumerate(entries): - formats = [] - for a_format in info_dict['formats']: - if not a_format.get('height'): - a_format['height'] = int_or_none(self._search_regex( - r'([0-9]+)p\.mp4', a_format['url'], 'height label', - default=None)) - formats.append(a_format) - - # Removing '.*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/ytdl-org/youtube-dl/pull/4768) - orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) - if a_format['url'] != orig_url: - format_id = a_format.get('format_id') - format_id = 'original' + ('-' + format_id if format_id else '') - if self._is_valid_url(orig_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) - info_dict['formats'] = formats - - # Don't append entry ID for one-video pages to keep backward compatibility - if len(entries) > 1: - info_dict['id'] = '%s_%s' % (video_id, idx + 1) - else: - info_dict['id'] = video_id - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - }) - - return self.playlist_result(entries, video_id, video_title) - - -class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[ift])=(?P[\w_]+)' - - # See generic.py for actual test cases - _TESTS = [{ - 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', - 'only_matching': True, - }, { - 'url': 
'https://www.liveleak.com/ll_embed?f=ab065df993c1', - 'only_matching': True, - }] - - def _real_extract(self, url): - kind, video_id = re.match(self._VALID_URL, url).groups() - - if kind == 'f': - webpage = self._download_webpage(url, video_id) - liveleak_url = self._search_regex( - r'(?:logourl\s*:\s*|window\.open\()(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, - webpage, 'LiveLeak URL', group='url') - else: - liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) - - return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) From 7fb9564420d43252c8f8c453d4dc54bf3ff9f8ee Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Jun 2021 20:06:33 +0100 Subject: [PATCH 1263/1705] [periscope] pass referer to HLS requests(closes #29419) --- youtube_dl/extractor/periscope.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b15906390..b93a02b7d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,6 +12,10 @@ from ..utils import ( class PeriscopeBaseIE(InfoExtractor): + _M3U8_HEADERS = { + 'Referer': 'https://www.periscope.tv/' + } + def _call_api(self, method, query, item_id): return self._download_json( 'https://api.periscope.tv/api/v2/%s' % method, @@ -54,9 +58,11 @@ class PeriscopeBaseIE(InfoExtractor): m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native' if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=fatal) + m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS) if len(m3u8_formats) == 1: self._add_width_and_height(m3u8_formats[0], width, height) + for f in m3u8_formats: + f.setdefault('http_headers', {}).update(self._M3U8_HEADERS) return m3u8_formats From a8035827177d6b59aca03bd717acb6a9bdd75ada Mon Sep 17 00:00:00 2001 From: bopol Date: Thu, 1 Jul 2021 08:53:22 +0200 Subject: [PATCH 1264/1705] [peertube] only call description endpoint if necessary (#29383) --- 
youtube_dl/extractor/peertube.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index d9b13adc2..3af533925 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -569,15 +569,15 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - full_description = self._call_api( - host, video_id, 'description', note='Downloading description JSON', - fatal=False) + description = video.get('description') + if len(description) >= 250: + # description is shortened + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) - description = None - if isinstance(full_description, dict): - description = str_or_none(full_description.get('description')) - if not description: - description = video.get('description') + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) or description subtitles = self.extract_subtitles(host, video_id) From b5242da7d24028f60cd23fd10f28fb635c7c7634 Mon Sep 17 00:00:00 2001 From: lanegramling Date: Thu, 16 Dec 2021 11:42:17 -0700 Subject: [PATCH 1265/1705] [youtube] Update signature function patterns (closes #30363) (#30366) --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dc4bd4a77..62e58c13e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1323,10 +1323,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\bm=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', - 
r'\bc&&\(c=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', From e41882335066ed03b1f4837e72fc0e83dfbe3525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Dec 2021 01:43:16 +0700 Subject: [PATCH 1266/1705] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 680fffdf8..e530e6aea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core +* [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) + +Extractors +* [youtube] Update signature function patterns (#30363, #30366) +* [peertube] Only call description endpoint if necessary (#29383) +* [periscope] Pass referer to HLS requests (#29419) +- [liveleak] Remove extractor (#17625, #24222, #29331) ++ [pornhub] Add support for pornhubthbh7ap3u.onion +* [pornhub] Detect geo restriction +* [pornhub] Dismiss tbr extracted from download URLs (#28927) +* [curiositystream:collection] Extend _VALID_URL (#26326, #29117) +* [youtube] Make get_video_info processing more robust (#29333) +* [youtube] Workaround for get_video_info request (#29333) +* [bilibili] Strip uploader name (#29202) +* [youtube] 
Update invidious instance list (#29281) +* [umg:de] Update GraphQL API URL (#29304) +* [nrk] Switch psapi URL to https (#29344) ++ [egghead] Add support for app.egghead.io (#28404, #29303) +* [appleconnect] Fix extraction (#29208) ++ [orf:tvthek] Add support for MPD formats (#28672, #29236) + + version 2021.06.06 Extractors From 5014bd67c22b421207b2650d4dc874b95b36dda1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Dec 2021 01:49:07 +0700 Subject: [PATCH 1267/1705] release 2021.12.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 -- youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 4eb505231..e5405c235 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - 
Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9fed0b489..33b01ce7f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 573e8ded0..285610cc7 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index c0031bf7a..af73525fb 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and 
arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 1138ab2ca..42c878b83 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index e530e6aea..658864282 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.12.17 Core * [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ed0d5e9d9..ae2a6b8b0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -472,8 +472,6 @@ - **LinuxAcademy** - **LiTV** - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 461dd87ca..b82fbc702 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.06.06' +__version__ = 
'2021.12.17' From ed99d68bdddfba0440dc81c105d5c0ea7cee7d1c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:41:47 +0530 Subject: [PATCH 1268/1705] Add back `YoutubeSearchURLIE` --- test/test_all_urls.py | 6 +- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/youtube.py | 177 +++++++++++++++-------------- 3 files changed, 93 insertions(+), 92 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..0e1328ede 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -66,9 +66,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - # def test_youtube_search_matching(self): - # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + def test_youtube_search_matching(self): + self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9b449937d..d403a2dbe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1556,7 +1556,7 @@ from .youtube import ( YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, - #YoutubeSearchURLIE, + YoutubeSearchURLIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87bdc1677..578cfcf90 100644 
--- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -308,6 +308,77 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) + def _search_results(self, query, params): + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + 'query': query, + } + if params: + data['params'] = params + for page_num in itertools.count(1): + search = self._download_json( + 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + video_id='query "%s"' % query, + note='Downloading page %s' % page_num, + errnote='Unable to download API page', fatal=False, + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}) + if not search: + break + slr_contents = try_get( + search, + (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + if not slr_contents: + break + isr_contents = try_get( + slr_contents, + lambda x: x[0]['itemSectionRenderer']['contents'], + list) + if not isr_contents: + break + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) + description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) + duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = int_or_none(self._search_regex( + r'^(\d+)', re.sub(r'\s', '', view_count_text), + 'view count', 
default=None)) + uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + yield { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + token = try_get( + slr_contents, + lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], + compat_str) + if not token: + break + data['continuation'] = token + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -2454,7 +2525,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): (?: (?:channel|c|user|feed)/| (?:playlist|watch)\?.*?\blist=| - (?!(?:watch|embed|v|e)\b) + (?!(?:watch|embed|v|e|results)\b) ) (?P[^/?\#&]+) ''' @@ -3379,88 +3450,18 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] def _entries(self, query, n): - data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, - 'query': query, - } - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS total = 0 - for page_num in itertools.count(1): - search = self._download_json( - 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - video_id='query "%s"' % query, - note='Downloading page %s' % page_num, - errnote='Unable to download API page', fatal=False, - data=json.dumps(data).encode('utf8'), - headers={'content-type': 'application/json'}) - if not search: - break - slr_contents = try_get( - search, - (lambda x: 
x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - isr_contents = try_get( - slr_contents, - lambda x: x[0]['itemSectionRenderer']['contents'], - list) - if not isr_contents: - break - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) - description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) - duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = int_or_none(self._search_regex( - r'^(\d+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - total += 1 - yield { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - if total == n: - return - token = try_get( - slr_contents, - lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: - break - data['continuation'] = token + for entry in self._search_results(query, self._SEARCH_PARAMS): + yield entry + total += 1 + if total >= n: + return def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -3471,18 +3472,19 @@ class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 
'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _TESTS = [] -r""" -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' +class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube search URLs with sorting and filter support' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, 'info_dict': { + 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', } }, { @@ -3491,11 +3493,10 @@ class YoutubeSearchURLIE(YoutubeSearchIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) -""" + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query = (qs.get('search_query') or qs.get('q'))[0] + params = qs.get('sp', ('',))[0] + return self.playlist_result(self._search_results(query, params), query, query) class YoutubeFeedsInfoExtractor(YoutubeTabIE): From bfe72723d8318f8bfcb35dee69a40758df5fa3c0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:49:55 +0530 Subject: [PATCH 1269/1705] Use `itertools.islice` --- youtube_dl/extractor/youtube.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 578cfcf90..017837e10 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3455,17 
+3455,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] - def _entries(self, query, n): - total = 0 - for entry in self._search_results(query, self._SEARCH_PARAMS): - yield entry - total += 1 - if total >= n: - return - def _get_n_results(self, query, n): """Get a specified number of results for a query""" - return self.playlist_result(self._entries(query, n), query) + entries = itertools.islice(self._search_results(query, self._SEARCH_PARAMS), 0, None if n == float('inf') else n) + return self.playlist_result(entries, query, query) class YoutubeSearchDateIE(YoutubeSearchIE): From 2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 00:54:22 +0530 Subject: [PATCH 1270/1705] Fix max_results --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 017837e10..bbd3e80d8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3453,6 +3453,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _MAX_RESULTS = float('inf') _TESTS = [] def _get_n_results(self, query, n): From 57044eacebc6f2f3cd83c345e1b6e659a22e4773 Mon Sep 17 00:00:00 2001 From: df Date: Thu, 28 Oct 2021 15:55:38 +0100 Subject: [PATCH 1271/1705] Fix test_youtube_playlist_noplaylist --- test/test_youtube_lists.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index cf2fdf14f..72820972e 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- from __future__ import unicode_literals # Allow direct execution @@ -9,11 +10,10 @@ sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL - from youtube_dl.extractor import ( + YoutubeIE, YoutubePlaylistIE, YoutubeTabIE, - YoutubeIE, ) @@ -25,9 +25,11 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True + dl.params['format'] = 'best' ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') + result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False) self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') def test_youtube_course(self): From 46e0a729b2d4503d8d49433fdddfce726d08261e Mon Sep 17 00:00:00 2001 From: df Date: Thu, 28 Oct 2021 15:57:10 +0100 Subject: [PATCH 1272/1705] Remove obsolete test_youtube_course --- test/test_youtube_lists.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 72820972e..e1636a1a6 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -32,16 +32,6 @@ class TestYoutubeLists(unittest.TestCase): result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False) self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') - def test_youtube_course(self): - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - # TODO find a > 100 (paginating?) 
videos course - result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - entries = list(result['entries']) - self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') - self.assertEqual(len(entries), 25) - self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') - def test_youtube_mix(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) From 2c2c2bd348b7dce0aad55a6fc37a18c6f9a000e3 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 29 Oct 2021 03:03:00 +0100 Subject: [PATCH 1273/1705] Fix test_youtube_mix --- test/test_youtube_lists.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index e1636a1a6..fae8a950a 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -34,12 +34,14 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_mix(self): dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') - entries = result['entries'] + dl.params['format'] = 'best' + ie = YoutubeTabIE(dl) + result = dl.extract_info('https://www.youtube.com/watch?v=uVJ0Il5WvbE&list=PLhQjrBD2T381k8ul4WQ8SQ165XqY149WW', + download=False, ie_key=ie.ie_key(), process=True) + entries = (result or {}).get('entries', [{'id': 'not_found', }]) self.assertTrue(len(entries) >= 50) original_video = entries[0] - self.assertEqual(original_video['id'], 'OQpdSVF_k_w') + self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') def test_youtube_toptracks(self): print('Skipping: The playlist page gives error 500') From d76d59d99d05fba94963690a039d38373dddc658 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 29 Oct 2021 03:10:35 +0100 Subject: [PATCH 1274/1705] Remove obsolete non-working test_youtube_toptracks --- test/test_youtube_lists.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 
fae8a950a..69c5d52eb 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -43,15 +43,6 @@ class TestYoutubeLists(unittest.TestCase): original_video = entries[0] self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') - def test_youtube_toptracks(self): - print('Skipping: The playlist page gives error 500') - return - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=MCUS') - entries = result['entries'] - self.assertEqual(len(entries), 100) - def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() dl.params['extract_flat'] = True From 39ca35e7651048c2adf558f1d6db2df0de4554f5 Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 04:44:57 +0000 Subject: [PATCH 1275/1705] Fix test_youtube_flat_playlist_extraction --- test/test_youtube_lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 69c5d52eb..07a6b6d06 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -52,7 +52,7 @@ class TestYoutubeLists(unittest.TestCase): entries = list(result['entries']) self.assertTrue(len(entries) == 1) video = entries[0] - self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['_type'], 'url') self.assertEqual(video['ie_key'], 'Youtube') self.assertEqual(video['id'], 'BaW_jenozKc') self.assertEqual(video['url'], 'BaW_jenozKc') From 5f5de51a499f732a6e687f32037e130cbdc50c8f Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 13:34:29 +0000 Subject: [PATCH 1276/1705] Add compat_map/filter and use the former --- youtube_dl/compat.py | 21 +++++++++++++++++++++ youtube_dl/extractor/youtube.py | 1 + 2 files changed, 22 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e45c454b..29e0d3a02 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2962,6 +2962,25 @@ else: compat_Struct = struct.Struct +# compat_map/filter() returning an iterator, 
supposedly the +# same versioning as for zip below +try: + from future_builtins import map as compat_map +except ImportError: + try: + from itertools import imap as compat_map + except ImportError: + compat_map = map + +try: + from future_builtins import filter as compat_filter +except ImportError: + try: + from itertools import ifilter as compat_filter + except ImportError: + compat_filter = filter + + try: from future_builtins import zip as compat_zip except ImportError: # not 2.6+ or is 3.x @@ -3015,6 +3034,7 @@ __all__ = [ 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', 'compat_getpass', @@ -3026,6 +3046,7 @@ __all__ = [ 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', + 'compat_map', 'compat_numeric_types', 'compat_ord', 'compat_os_name', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 62e58c13e..da410f8f0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -13,6 +13,7 @@ from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_chr, compat_HTTPError, + compat_map as map, compat_parse_qs, compat_str, compat_urllib_parse_unquote_plus, From 96f87aaa3b34d80bc72097a7475d8093849091fc Mon Sep 17 00:00:00 2001 From: df Date: Tue, 2 Nov 2021 11:18:39 +0000 Subject: [PATCH 1277/1705] Back-port JS interpreter upgrade from yt-dlp PR #1437 --- test/test_jsinterp.py | 51 +++++ youtube_dl/compat.py | 5 + youtube_dl/jsinterp.py | 496 ++++++++++++++++++++++++++++++++--------- 3 files changed, 449 insertions(+), 103 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c24b8ca74..4d05ea610 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -112,6 +112,57 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('z'), 5) + def test_for_loop(self): + # function x() { a=0; for (i=0; i-10; i++) {a++} a 
} + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i = i + 1) {a++} a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_switch(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 1:f+=1; + case 2:f+=2; + case 3:f+=3;break; + case 4:f+=4; + default:f=0; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 7) + self.assertEqual(jsi.call_function('x', 3), 6) + self.assertEqual(jsi.call_function('x', 5), 0) + + def test_try(self): + jsi = JSInterpreter(''' + function x() { try{return 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_for_loop_continue(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_for_loop_break(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_literal_list(self): + jsi = JSInterpreter(''' + function x() { [1, 2, "asdf", [5, 6, 7]][3] } + ''') + self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + + def test_comma(self): + jsi = JSInterpreter(''' + function x() { a=5; a -= 1, a+=3; return a } + ''') + self.assertEqual(jsi.call_function('x'), 7) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 29e0d3a02..2004a405a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,10 @@ import subprocess import sys import xml.etree.ElementTree +try: + import collections.abc as compat_collections_abc +except ImportError: + import collections as compat_collections_abc try: import urllib.request as compat_urllib_request @@ -3025,6 +3029,7 @@ __all__ = [ 'compat_b64decode', 'compat_basestring', 'compat_chr', + 'compat_collections_abc', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py 
index 7bda59610..061e92c2a 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -8,6 +8,15 @@ from .utils import ( ExtractorError, remove_quotes, ) +from .compat import ( + compat_collections_abc +) +MutableMapping = compat_collections_abc.MutableMapping + + +class Nonlocal: + pass + _OPERATORS = [ ('|', operator.or_), @@ -22,11 +31,55 @@ _OPERATORS = [ ('*', operator.mul), ] _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) +_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +class JS_Break(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid break') + + +class JS_Continue(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid continue') + + +class LocalNameSpace(MutableMapping): + def __init__(self, *stack): + self.stack = tuple(stack) + + def __getitem__(self, key): + for scope in self.stack: + if key in scope: + return scope[key] + raise KeyError(key) + + def __setitem__(self, key, value): + for scope in self.stack: + if key in scope: + scope[key] = value + break + else: + self.stack[0][key] = value + return value + + def __delitem__(self, key): + raise NotImplementedError('Deleting is not supported') + + def __iter__(self): + for scope in self.stack: + for scope_item in iter(scope): + yield scope_item + + def __len__(self, key): + return len(iter(self)) + + def __repr__(self): + return 'LocalNameSpace%s' % (self.stack, ) + + class JSInterpreter(object): def __init__(self, code, objects=None): if objects is None: @@ -34,11 +87,58 @@ class JSInterpreter(object): self.code = code self._functions = {} self._objects = objects + self.__named_object_counter = 0 + + def _named_object(self, namespace, obj): + self.__named_object_counter += 1 + name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, ) + namespace[name] = obj + return name + + @staticmethod + def 
_separate(expr, delim=',', max_split=None): + if not expr: + return + parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0} + start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1 + for idx, char in enumerate(expr): + if char in parens: + parens[char] += 1 + is_in_parens = (parens['['] - parens[']'] + or parens['('] - parens[')'] + or parens['{'] - parens['}']) + if char == delim[pos] and not is_in_parens: + if pos == max_pos: + pos = 0 + yield expr[start: idx - max_pos] + start = idx + 1 + splits += 1 + if max_split and splits >= max_split: + break + else: + pos += 1 + else: + pos = 0 + yield expr[start:] + + @staticmethod + def _separate_at_paren(expr, delim): + separated = list(JSInterpreter._separate(expr, delim, 1)) + if len(separated) < 2: + raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr)) + return separated[0][1:].strip(), separated[1].strip() def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise ExtractorError('Recursion limit reached') + sub_statements = list(self._separate(stmt, ';')) + stmt = (sub_statements or ['']).pop() + for sub_stmt in sub_statements: + ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + should_abort = False stmt = stmt.lstrip() stmt_m = re.match(r'var\s', stmt) @@ -61,25 +161,119 @@ class JSInterpreter(object): if expr == '': # Empty expression return None - if expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 - else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result - else: - expr = json.dumps(sub_result) + remaining_expr - break + if expr.startswith('{'): + inner, outer = self._separate_at_paren(expr, '}') + inner, 
should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + if not outer or should_abort: + return inner else: - raise ExtractorError('Premature end of parens in %r' % expr) + expr = json.dumps(inner) + outer + + if expr.startswith('('): + inner, outer = self._separate_at_paren(expr, ')') + inner = self.interpret_expression(inner, local_vars, allow_recursion) + if not outer: + return inner + else: + expr = json.dumps(inner) + outer + + if expr.startswith('['): + inner, outer = self._separate_at_paren(expr, ']') + name = self._named_object(local_vars, [ + self.interpret_expression(item, local_vars, allow_recursion) + for item in self._separate(inner)]) + expr = name + outer + + m = re.match(r'try\s*', expr) + if m: + if expr[m.end()] == '{': + try_expr, expr = self._separate_at_paren(expr[m.end():], '}') + else: + try_expr, expr = expr[m.end() - 1:], '' + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + if should_abort: + return ret + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + m = re.match(r'(?:(?Pcatch)|(?Pfor)|(?Pswitch))\s*\(', expr) + md = m.groupdict() if m else {} + if md.get('catch'): + # We ignore the catch block + _, expr = self._separate_at_paren(expr, '}') + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + elif md.get('for'): + def raise_constructor_error(c): + raise ExtractorError( + 'Premature return in the initialization of a for loop in {0!r}'.format(c)) + + constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + if remaining.startswith('{'): + body, expr = self._separate_at_paren(remaining, '}') + else: + m = re.match(r'switch\s*\(', remaining) # FIXME + if m: + switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')') + body, expr = self._separate_at_paren(remaining, '}') + body = 'switch(%s){%s}' % (switch_val, body) + else: + body, expr = remaining, '' + start, cndn, increment = 
self._separate(constructor, ';') + if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: + raise_constructor_error(constructor) + while True: + if not self.interpret_expression(cndn, local_vars, allow_recursion): + break + try: + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: + break + except JS_Continue: + pass + if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: + raise_constructor_error(constructor) + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + elif md.get('switch'): + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) + body, expr = self._separate_at_paren(remaining, '}') + body, default = body.split('default:') if 'default:' in body else (body, None) + items = body.split('case ')[1:] + if default: + items.append('default:%s' % (default, )) + matched = False + for item in items: + case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + matched = matched or case == 'default' or switch_val == self.interpret_expression(case, local_vars, allow_recursion) + if matched: + try: + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: + break + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + # Comma separated statements + sub_expressions = list(self._separate(expr)) + expr = sub_expressions.pop().strip() if sub_expressions else '' + for sub_expr in sub_expressions: + self.interpret_expression(sub_expr, local_vars, allow_recursion) + + for m in re.finditer(r'''(?x) + (?P\+\+|--)(?P%(_NAME_RE)s)| + (?P%(_NAME_RE)s)(?P\+\+|--)''' % globals(), expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] 
+ local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + json.dumps(ret) + expr[end:] for op, opfunc in _ASSIGN_OPERATORS: m = re.match(r'''(?x) @@ -88,14 +282,13 @@ class JSInterpreter(object): (?P.*)$''' % (_NAME_RE, re.escape(op)), expr) if not m: continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) + right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) if m.groupdict().get('index'): lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, int): + raise ExtractorError('List indices must be integers: %s' % (idx, )) cur = lvar[idx] val = opfunc(cur, right_val) lvar[idx] = val @@ -109,8 +302,13 @@ class JSInterpreter(object): if expr.isdigit(): return int(expr) + if expr == 'break': + raise JS_Break() + elif expr == 'continue': + raise JS_Continue() + var_m = re.match( - r'(?!if|return|true|false)(?P%s)$' % _NAME_RE, + r'(?!if|return|true|false|null)(?P%s)$' % _NAME_RE, expr) if var_m: return local_vars[var_m.group('name')] @@ -124,91 +322,161 @@ class JSInterpreter(object): r'(?P%s)\[(?P.+)\]$' % _NAME_RE, expr) if m: val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) + idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) return val[idx] + def raise_expr_error(where, op, exp): + raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp)) + + for op, opfunc in _OPERATORS: + separated = list(self._separate(expr, op)) + if len(separated) < 2: + continue + right_val = separated.pop() + left_val = op.join(separated) + left_val, should_abort = self.interpret_statement( + left_val, local_vars, allow_recursion - 1) + if 
should_abort: + raise_expr_error('left-side', op, expr) + right_val, should_abort = self.interpret_statement( + right_val, local_vars, allow_recursion - 1) + if should_abort: + raise_expr_error('right-side', op, expr) + return opfunc(left_val or 0, right_val) + m = re.match( - r'(?P%s)(?:\.(?P[^(]+)|\[(?P[^]]+)\])\s*(?:\(+(?P[^()]*)\))?$' % _NAME_RE, + r'(?P%s)(?:\.(?P[^(]+)|\[(?P[^]]+)\])\s*' % _NAME_RE, expr) if m: variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) - arg_str = m.group('args') + nl = Nonlocal() - if variable in local_vars: - obj = local_vars[variable] + nl.member = remove_quotes(m.group('member') or m.group('member2')) + arg_str = expr[m.end():] + if arg_str.startswith('('): + arg_str, remaining = self._separate_at_paren(arg_str, ')') else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + arg_str, remaining = None, arg_str - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] + def assertion(cndn, msg): + """ assert, but without risk of getting optimized out """ + if not cndn: + raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr)) - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() - else: - argvals = tuple([ + def eval_method(): + # nonlocal member + member = nl.member + if variable == 'String': + obj = str + elif variable in local_vars: + obj = local_vars[variable] + else: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + # Function call + argvals = [ self.interpret_expression(v, local_vars, allow_recursion) - for v in arg_str.split(',')]) + for v in self._separate(arg_str)] - if member == 'split': - assert argvals == ('',) - return list(obj) - if 
member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res + if obj == str: + if member == 'fromCharCode': + assertion(argvals, 'takes one or more arguments') + return ''.join(map(chr, argvals)) + raise ExtractorError('Unsupported string method %s' % (member, )) - return obj[member](argvals) + if member == 'split': + assertion(argvals, 'takes one or more arguments') + assertion(argvals == [''], 'with arguments is not implemented') + return list(obj) + elif member == 'join': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return argvals[0].join(obj) + elif member == 'reverse': + assertion(not argvals, 'does not take any arguments') + obj.reverse() + return obj + elif member == 'slice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return obj[argvals[0]:] + elif member == 'splice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + index, howMany = (argvals + [len(obj)])[:2] + if index < 0: + index += len(obj) + add_items = argvals[2:] + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + for i, item in enumerate(add_items): + obj.insert(index + i, item) + return res + elif member == 'unshift': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + for item in reversed(argvals): + obj.insert(0, item) + return obj + elif member == 'pop': + assertion(isinstance(obj, list), 'must 
be applied on a list') + assertion(not argvals, 'does not take any arguments') + if not obj: + return + return obj.pop() + elif member == 'push': + assertion(argvals, 'takes one or more arguments') + obj.extend(argvals) + return obj + elif member == 'forEach': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + f, this = (argvals + [''])[:2] + return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + elif member == 'indexOf': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + idx, start = (argvals + [0])[:2] + try: + return obj.index(idx, start) + except ValueError: + return -1 - for op, opfunc in _OPERATORS: - m = re.match(r'(?P.+?)%s(?P.+)' % re.escape(op), expr) - if not m: - continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) + if isinstance(obj, list): + member = int(member) + nl.member = member + return obj[member](argvals) - m = re.match( - r'^(?P%s)\((?P[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) + if remaining: + return self.interpret_expression( + self._named_object(local_vars, eval_method()) + remaining, + local_vars, allow_recursion) + else: + return eval_method() + + m = re.match(r'^(?P%s)\((?P[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in self._functions: + for v in self._separate(m.group('args'))]) + if fname in local_vars: + return local_vars[fname](argvals) + elif fname not in self._functions: 
self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) - raise ExtractorError('Unsupported JS expression %r' % expr) + if expr: + raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -233,30 +501,52 @@ class JSInterpreter(object): return obj - def extract_function(self, funcname): + def extract_function_code(self, funcname): + """ @returns argnames, code """ func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s* \((?P[^)]*)\)\s* - \{(?P[^}]+)\}''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), + (?P\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % {'f_n': re.escape(funcname), }, self.code) + code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: raise ExtractorError('Could not find JS function %r' % funcname) - argnames = func_m.group('args').split(',') + return func_m.group('args').split(','), code - return self.build_function(argnames, func_m.group('code')) + def extract_function(self, funcname): + return self.extract_function_from_code(*self.extract_function_code(funcname)) + + def extract_function_from_code(self, argnames, code, *global_stack): + local_vars = {} + while True: + mobj = re.search(r'function\((?P[^)]*)\)\s*{', code) + if mobj is None: + break + start, body_start = mobj.span() + body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + name = self._named_object( + local_vars, + self.extract_function_from_code( + [str.strip(x) for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) + code = code[:start] + name + remaining + return self.build_function(argnames, code, local_vars, *global_stack) def call_function(self, funcname, *args): - f = self.extract_function(funcname) - 
return f(args) + return self.extract_function(funcname)(args) - def build_function(self, argnames, code): - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): - res, abort = self.interpret_statement(stmt, local_vars) - if abort: + def build_function(self, argnames, code, *global_stack): + global_stack = list(global_stack) or [{}] + local_vars = global_stack.pop(0) + + def resf(args, **kwargs): + local_vars.update(dict(zip(argnames, args))) + local_vars.update(kwargs) + var_stack = LocalNameSpace(local_vars, *global_stack) + for stmt in self._separate(code.replace('\n', ''), ';'): + ret, should_abort = self.interpret_statement(stmt, var_stack) + if should_abort: break - return res + return ret return resf From e1eae16b56b5c57e341b000167c0a92e67095e6e Mon Sep 17 00:00:00 2001 From: df Date: Thu, 4 Nov 2021 12:48:06 +0000 Subject: [PATCH 1278/1705] Handle default in switch better Add https://github.com/yt-dlp/yt-dlp/commit/a1fc7ca0743c8df06416e68ee74b64e07dfe7135 Thanks coletdjnz --- test/test_jsinterp.py | 15 +++++++++++++++ youtube_dl/jsinterp.py | 23 ++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4d05ea610..acdabffb1 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -133,6 +133,21 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x', 3), 6) self.assertEqual(jsi.call_function('x', 5), 0) + def test_switch_default(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 2: f+=2; + default: f-=1; + case 5: + case 6: f+=6; + case 0: break; + case 1: f+=1; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 2) + self.assertEqual(jsi.call_function('x', 5), 11) + self.assertEqual(jsi.call_function('x', 9), 14) + def test_try(self): jsi = JSInterpreter(''' function x() { try{return 10} catch(e){return 5} } diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py 
index 061e92c2a..c35765702 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -240,21 +240,26 @@ class JSInterpreter(object): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') - body, default = body.split('default:') if 'default:' in body else (body, None) - items = body.split('case ')[1:] - if default: - items.append('default:%s' % (default, )) - matched = False - for item in items: - case, stmt = [i.strip() for i in self._separate(item, ':', 1)] - matched = matched or case == 'default' or switch_val == self.interpret_expression(case, local_vars, allow_recursion) - if matched: + items = body.replace('default:', 'case default:').split('case ')[1:] + for default in (False, True): + matched = False + for item in items: + case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + if default: + matched = matched or case == 'default' + elif not matched: + matched = (case != 'default' + and switch_val == self.interpret_expression(case, local_vars, allow_recursion)) + if not matched: + continue try: ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) if should_abort: return ret except JS_Break: break + if matched: + break return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] # Comma separated statements From 1ca673bd98cc5bbfa76d00ac84ad5f6c1376db01 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 27 Nov 2021 02:06:13 +0000 Subject: [PATCH 1279/1705] Fix splice to handle float Needed for new youtube js player f1ca6900 Add https://github.com/yt-dlp/yt-dlp/commit/57dbe8077f8d00e0fffac53669f40cd7d584474f#diff-729b57caa8d006426f6a8960c061f519a8b6658682284015e069745af52ffb07 --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c35765702..c75cf45b9 100644 --- 
a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -416,7 +416,7 @@ class JSInterpreter(object): elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') - index, howMany = (argvals + [len(obj)])[:2] + index, howMany = map(int, (argvals + [len(obj)])[:2]) if index < 0: index += len(obj) add_items = argvals[2:] From 9d142109f445ea247e476cfc0e0ca134f6ebb802 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 27 Nov 2021 03:18:29 +0000 Subject: [PATCH 1280/1705] Back-port test_youtube_signature.py from yt-dlp and fix JSInterp accordingly --- test/test_youtube_signature.py | 89 ++++++++++++++++++++++++---------- youtube_dl/jsinterp.py | 9 ++-- 2 files changed, 69 insertions(+), 29 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 627d4cb92..c8e85b500 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -14,9 +14,10 @@ import string from test.helper import FakeYDL from youtube_dl.extractor import YoutubeIE +from youtube_dl.jsinterp import JSInterpreter from youtube_dl.compat import compat_str, compat_urlretrieve -_TESTS = [ +_SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 86, @@ -64,6 +65,25 @@ _TESTS = [ ) ] +_NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', + 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', + ), + ( + 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', + ), + ( + 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', + ), + ( + 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', + 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', + ), +] + class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): @@ -95,35 +115,54 @@ class TestSignature(unittest.TestCase): 
os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, sig_input, expected_sig): - m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) - assert m, '%r should follow URL format' % url - test_id = m.group(1) +def t_factory(name, sig_func, url_pattern): + def make_tfunc(url, sig_input, expected_sig): + m = url_pattern.match(url) + assert m, '%r should follow URL format' % url + test_id = m.group('id') - def test_func(self): - basename = 'player-%s.js' % test_id - fn = os.path.join(self.TESTDATA_DIR, basename) + def test_func(self): + basename = 'player-{0}-{1}.js'.format(name, test_id) + fn = os.path.join(self.TESTDATA_DIR, basename) - if not os.path.exists(fn): - compat_urlretrieve(url, fn) + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + self.assertEqual(sig_func(jscode, sig_input), expected_sig) - ydl = FakeYDL() - ie = YoutubeIE(ydl) - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - src_sig = ( - compat_str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - got_sig = func(src_sig) - self.assertEqual(got_sig, expected_sig) - - test_func.__name__ = str('test_signature_js_' + test_id) - setattr(TestSignature, test_func.__name__, test_func) + test_func.__name__ = str('test_{0}_js_{1}'.format(name, test_id)) + setattr(TestSignature, test_func.__name__, test_func) + return make_tfunc -for test_spec in _TESTS: - make_tfunc(*test_spec) +def signature(jscode, sig_input): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) + src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) + return func(src_sig) + + +def n_sig(jscode, sig_input): + # Pending implementation of _extract_n_function_name() or similar in + # youtube.py, hard-code here + # funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) + import re + funcname = 
re.search(r'[=(,&|](\w+)\(\w+\),\w+\.set\("n",', jscode) + funcname = funcname and funcname.group(1) + return JSInterpreter(jscode).call_function(funcname, sig_input) + + +make_sig_test = t_factory( + 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) +for test_spec in _SIG_TESTS: + make_sig_test(*test_spec) + +make_nsig_test = t_factory( + 'nsig', n_sig, re.compile(r'.+/player/(?P[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_test(*test_spec) if __name__ == '__main__': diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c75cf45b9..a2306557b 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -9,7 +9,8 @@ from .utils import ( remove_quotes, ) from .compat import ( - compat_collections_abc + compat_collections_abc, + compat_str, ) MutableMapping = compat_collections_abc.MutableMapping @@ -372,7 +373,7 @@ class JSInterpreter(object): # nonlocal member member = nl.member if variable == 'String': - obj = str + obj = compat_str elif variable in local_vars: obj = local_vars[variable] else: @@ -391,7 +392,7 @@ class JSInterpreter(object): self.interpret_expression(v, local_vars, allow_recursion) for v in self._separate(arg_str)] - if obj == str: + if obj == compat_str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) @@ -533,7 +534,7 @@ class JSInterpreter(object): name = self._named_object( local_vars, self.extract_function_from_code( - [str.strip(x) for x in mobj.group('args').split(',')], + [x.strip() for x in mobj.group('args').split(',')], body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) From 6ca7b776965ed1e9220690edc4ee22de8c8587f5 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 10 Dec 2021 19:14:54 +0000 Subject: [PATCH 1281/1705] Refactor JSInterpreter._separate yt-dlp/yt-dlp/@06dfe0a, improve _MATCHING_PARENS 
--- youtube_dl/jsinterp.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a2306557b..8eaa911cd 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -36,6 +36,8 @@ _ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) + class JS_Break(ExtractorError): def __init__(self): @@ -100,26 +102,24 @@ class JSInterpreter(object): def _separate(expr, delim=',', max_split=None): if not expr: return - parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0} - start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1 + counters = {k: 0 for k in _MATCHING_PARENS.values()} + start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 for idx, char in enumerate(expr): - if char in parens: - parens[char] += 1 - is_in_parens = (parens['['] - parens[']'] - or parens['('] - parens[')'] - or parens['{'] - parens['}']) - if char == delim[pos] and not is_in_parens: - if pos == max_pos: - pos = 0 - yield expr[start: idx - max_pos] - start = idx + 1 - splits += 1 - if max_split and splits >= max_split: - break - else: - pos += 1 - else: + if char in _MATCHING_PARENS: + counters[_MATCHING_PARENS[char]] += 1 + elif char in counters: + counters[char] -= 1 + if char != delim[pos] or any(counters.values()): pos = 0 + continue + elif pos != delim_len: + pos += 1 + continue + yield expr[start: idx - delim_len] + start, pos = idx + 1, 0 + splits += 1 + if max_split and splits >= max_split: + break yield expr[start:] @staticmethod From af9e72507ea38e5ab3fa2751ed09ec88021260cb Mon Sep 17 00:00:00 2001 From: df Date: Mon, 1 Nov 2021 04:45:42 +0000 Subject: [PATCH 1282/1705] Implement n-param descrambling using JSInterp Fixes #29326, closes #29790, closes #30004, closes #30024, closes #30052, closes #30088, closes #30097, closes #30102, closes #30109, closes #30119, closes #30125, closes 
#30128, closes #30162, closes #30173, closes #30186, closes #30192, closes #30221, closes #30239, closes #30539, closes #30552. --- youtube_dl/extractor/youtube.py | 115 +++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index da410f8f0..63918924d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1254,6 +1254,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') + def _get_player_code(self, video_id, player_url, player_id=None): + if not player_id: + player_id = self._extract_player_info(player_url) + + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( + player_url, video_id, + note='Downloading player ' + player_id, + errnote='Download of %s failed' % player_url) + return self._code_cache[player_id] + def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1266,12 +1277,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if player_id not in self._code_cache: - self._code_cache[player_id] = self._download_webpage( - player_url, video_id, - note='Downloading player ' + player_id, - errnote='Download of %s failed' % player_url) - code = self._code_cache[player_id] + code = self._get_player_code(video_id, player_url, player_id) res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1350,11 +1356,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url is None: raise ExtractorError('Cannot decrypt signature without player_url') - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 
'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -1371,6 +1372,88 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) + def _extract_player_url(self, webpage): + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage or '', 'player URL', fatal=False) + if not player_url: + return + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + return player_url + + # from yt-dlp + # See also: + # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419 + # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 + # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 + def _extract_n_function_name(self, jscode): + return self._search_regex( + (r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), + jscode, 'Initial JS player n function name', group='nfunc') + + def _extract_n_function(self, video_id, player_url): + player_id = self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + player_id = self._extract_player_info(player_url) + jscode = self._get_player_code(video_id, player_url, player_id) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self._downloader.params.get('youtube_print_sig_code'): + self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1])) + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) + 
+ def _n_descramble(self, n_param, player_url, video_id): + """Compute the response to YT's "n" parameter challenge + + Args: + n_param -- challenge string that is the value of the + URL's "n" query parameter + player_url -- URL of YT player JS + video_id + """ + + sig_id = ('nsig_value', n_param) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + self._player_cache[sig_id] = func(n_param) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) + return self._player_cache[sig_id] + except Exception as e: + raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + + def _unthrottle_format_urls(self, video_id, player_url, formats): + for fmt in formats: + parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) + qs = compat_urlparse.parse_qs(parsed_fmt_url.query) + n_param = qs.get('n') + if not n_param: + continue + n_param = n_param[-1] + n_response = self._n_descramble(n_param, player_url, video_id) + if n_response: + qs['n'] = [n_response] + fmt['url'] = compat_urlparse.urlunparse( + parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, @@ -1632,11 +1715,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not (sc and fmt_url and encrypted_sig): continue if not player_url: - if not webpage: - continue - player_url = self._search_regex( - r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', - webpage, 'player URL', fatal=False) + player_url = self._extract_player_url(webpage) if not player_url: continue signature = self._decrypt_signature(sc['s'][0], video_id, player_url) @@ 
-1782,6 +1861,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = video_details.get('isLive') owner_profile_url = microformat.get('ownerProfileUrl') + if not player_url: + player_url = self._extract_player_url(webpage) + self._unthrottle_format_urls(video_id, player_url, formats) + info = { 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, From 1e677567cd083d43f55daef0cc74e5fa24575ae3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 1 Feb 2022 14:39:03 +0000 Subject: [PATCH 1283/1705] [YouTube] Fix n-sig for player e06dea74 (#30582) From yt-dl commit 48416bc --- test/test_youtube_signature.py | 24 +++++++++++++++++------- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index c8e85b500..fc5e9828e 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -82,6 +82,14 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', ), + ( + 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', + 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', + ), + ( + 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js', + 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw', + ), ] @@ -110,10 +118,17 @@ class TestPlayerInfo(unittest.TestCase): class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) - self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs') if not os.path.exists(self.TESTDATA_DIR): os.mkdir(self.TESTDATA_DIR) + def tearDown(self): + try: + for f in os.listdir(self.TESTDATA_DIR): + os.remove(f) + except OSError: + pass + def t_factory(name, sig_func, url_pattern): def make_tfunc(url, sig_input, expected_sig): @@ -145,12 +160,7 @@ def signature(jscode, sig_input): def 
n_sig(jscode, sig_input): - # Pending implementation of _extract_n_function_name() or similar in - # youtube.py, hard-code here - # funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) - import re - funcname = re.search(r'[=(,&|](\w+)\(\w+\),\w+\.set\("n",', jscode) - funcname = funcname and funcname.group(1) + funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) return JSInterpreter(jscode).call_function(funcname, sig_input) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63918924d..7943b94f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -28,6 +28,7 @@ from ..utils import ( dict_get, float_or_none, int_or_none, + js_to_json, mimetype2ext, parse_codecs, parse_duration, @@ -1391,9 +1392,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 def _extract_n_function_name(self, jscode): - return self._search_regex( - (r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), - jscode, 'Initial JS player n function name', group='nfunc') + target = r'(?P[a-zA-Z0-9$]{3})(?:\[(?P\d+)\])?' 
+ nfunc_and_idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ), + jscode, 'Initial JS player n function name') + nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') + if not idx: + return nfunc + return self._parse_json(self._search_regex( + r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode, + 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) From 34c06b16f5eb814308392b68dce07bbff62bc406 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 31 Jan 2022 00:02:56 +0000 Subject: [PATCH 1284/1705] Support Youtube Shorts URL format --- youtube_dl/extractor/youtube.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7943b94f9..05688dc70 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -417,6 +417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ + |shorts/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! 
@@ -1119,6 +1120,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # YT 'Shorts' + 'url': 'https://youtube.com/shorts/4L2J27mJ3Dc', + 'info_dict': { + 'id': '4L2J27mJ3Dc', + 'ext': 'mp4', + 'upload_date': '20211025', + 'uploader': 'Charlie Berens', + 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d', + 'uploader_id': 'fivedlrmilkshake', + 'title': 'Midwest Squid Game #Shorts', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, From 41f0043983c831b7c0c3614340d2f66ec153087b Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 1 Feb 2022 23:22:57 +0000 Subject: [PATCH 1285/1705] Avoid crashing if n-sig decode fails --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 05688dc70..4165de15c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..utils import ( ExtractorError, clean_html, dict_get, + error_to_compat_str, float_or_none, int_or_none, js_to_json, @@ -1463,7 +1464,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) return self._player_cache[sig_id] except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + self._downloader.report_warning( + '[%s] %s (%s %s)' % ( + self.IE_NAME, + 'Unable to decode n-parameter: download likely to be throttled', + error_to_compat_str(e), + traceback.format_exc())) def _unthrottle_format_urls(self, video_id, player_url, formats): for fmt in formats: From 78ce962f4fe020994c216dd2671546fbe58a5c67 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 30 Jan 2022 01:24:09 +0530 Subject: [PATCH 1286/1705] [youtube] Support channel search Code 
from https://github.com/yt-dlp/yt-dlp/commit/cd684175adbe663bbdf6a6c72d8b99b617b6ff2e --- youtube_dl/extractor/youtube.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4165de15c..8e1254f19 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2438,6 +2438,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', 'only_matching': True, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + } }] @classmethod @@ -2835,8 +2846,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_selected_tab(tabs): for tab in tabs: - if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): - return tab['tabRenderer'] + renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} + if renderer.get('selected') is True: + return renderer else: raise ExtractorError('Unable to find selected tab') @@ -2893,6 +2905,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title = channel_title or item_id if tab_title: title += ' - %s' % tab_title + if selected_tab.get('expandedText'): + title += ' - %s' % selected_tab['expandedText'] description = renderer.get('description') playlist_id = renderer.get('externalId') else: From 7a497f1405ecdcd76c671c7bfaad238d75d01639 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 04:09:23 +0000 Subject: [PATCH 1287/1705] Rework 2c2c2bd with an actual Mix page and realistic playlist size From 
https://github.com/ytdl-org/youtube-dl/commit/2c2c2bd348b7dce0aad55a6fc37a18c6f9a000e3#commitcomment-65953545 --- test/test_youtube_lists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 07a6b6d06..e0e8891ba 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -36,12 +36,12 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() dl.params['format'] = 'best' ie = YoutubeTabIE(dl) - result = dl.extract_info('https://www.youtube.com/watch?v=uVJ0Il5WvbE&list=PLhQjrBD2T381k8ul4WQ8SQ165XqY149WW', + result = dl.extract_info('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8', download=False, ie_key=ie.ie_key(), process=True) entries = (result or {}).get('entries', [{'id': 'not_found', }]) - self.assertTrue(len(entries) >= 50) + self.assertTrue(len(entries) >= 25) original_video = entries[0] - self.assertEqual(original_video['id'], 'uVJ0Il5WvbE') + self.assertEqual(original_video['id'], 'tyITL_exICo') def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() From 0c0876f790c78c38ececbc920073e8b6cf01e9c7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 3 Feb 2022 07:44:37 +0530 Subject: [PATCH 1288/1705] [youtube:search] Add tests --- youtube_dl/extractor/youtube.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3ab60960a..41695a561 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3206,7 +3206,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): _SEARCH_KEY = 'ytsearch' _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _MAX_RESULTS = float('inf') - _TESTS = [] + _TESTS = [{ + 'url': 'ytsearch10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] def 
_get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -3219,7 +3226,14 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date - _TESTS = [] + _TESTS = [{ + 'url': 'ytsearchdate10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): @@ -3232,7 +3246,8 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', - } + }, + 'params': {'playlistend': 5} }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, From 61d791726f67255c2ed3c0bb6ee24c8c1faeb028 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:24:03 +0000 Subject: [PATCH 1289/1705] Find TV2DK Kaltura ID in Nuxt.js page format --- youtube_dl/extractor/tv2dk.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bd5fd640..106a081e1 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor): 'duration': 1347, 'view_count': int, }, - 'params': { - 'skip_download': True, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', + 'info_dict': { + 'id': '1_7iwll9n0', + 'ext': 'mp4', + 'upload_date': '20211027', + 'title': 'Gadekamp #6 - Højhuse i København', + 'uploader_id': 'tv2lorry', + 'timestamp': 1635345229, }, 'add_ie': ['Kaltura'], }, { @@ -91,7 +99,8 @@ class TV2DKIE(InfoExtractor): add_entry(partner_id, kaltura_id) if not entries: kaltura_id = self._search_regex( - r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + (r'entry_id\s*:\s*["\']([0-9a-z_]+)', + 
r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') partner_id = self._search_regex( (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') From 27dbf6f0ab778a9e3d81be64a615046e6737c3f6 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:38:44 +0000 Subject: [PATCH 1290/1705] Return the item itself if playlist has one entry Removes playlist spam from log --- youtube_dl/extractor/tv2dk.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 106a081e1..ec5cbdf03 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -105,6 +105,8 @@ class TV2DKIE(InfoExtractor): (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') add_entry(partner_id, kaltura_id) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries) From 8248133e5ee5579316120cbcbff3ba8b713f1017 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 4 Feb 2022 11:29:41 +0000 Subject: [PATCH 1291/1705] Back-port yt-dlp Viki extractor From https://github.com/yt-dlp/yt-dlp/pull/2540 --- youtube_dl/extractor/viki.py | 335 +++++++++++++++-------------------- 1 file changed, 144 insertions(+), 191 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 2e9cbf148..2ddca0ca6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,38 +1,29 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 import hashlib import hmac -import itertools import json -import re import time from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) from ..utils import ( ExtractorError, int_or_none, parse_age_limit, parse_iso8601, - sanitized_Request, - std_headers, try_get, ) class VikiBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' - _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' - 
_API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' + _API_URL_TEMPLATE = 'https://api.viki.io%s' + _DEVICE_ID = '112395910d' _APP = '100005a' - _APP_VERSION = '6.0.0' - _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' + _APP_VERSION = '6.11.3' + _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' _GEO_BYPASS = False _NETRC_MACHINE = 'viki' @@ -45,43 +36,60 @@ class VikiBaseIE(InfoExtractor): 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } - def _prepare_call(self, path, timestamp=None, post_data=None): + def _stream_headers(self, timestamp, sig): + return { + 'X-Viki-manufacturer': 'vivo', + 'X-Viki-device-model': 'vivo 1606', + 'X-Viki-device-os-ver': '6.0.1', + 'X-Viki-connection-type': 'WIFI', + 'X-Viki-carrier': '', + 'X-Viki-as-id': '100005a-1625321982-3932', + 'timestamp': str(timestamp), + 'signature': str(sig), + 'x-viki-app-ver': self._APP_VERSION + } + + def _api_query(self, path, version=4, **kwargs): path += '?' if '?' 
not in path else '&' - if not timestamp: - timestamp = int(time.time()) - query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) + app = self._APP + query = '/v{version}/{path}app={app}'.format(**locals()) if self._token: query += '&token=%s' % self._token + return query + ''.join('&{name}={val}.format(**locals())' for name, val in kwargs.items()) + + def _sign_query(self, path): + timestamp = int(time.time()) + query = self._api_query(path, version=5) sig = hmac.new( self._APP_SECRET.encode('ascii'), - query.encode('ascii'), - hashlib.sha1 - ).hexdigest() - url = self._API_URL_TEMPLATE % (query, sig) - return sanitized_Request( - url, json.dumps(post_data).encode('utf-8')) if post_data else url + '{query}&t={timestamp}'.format(**locals()).encode('ascii'), + hashlib.sha1).hexdigest() + return timestamp, sig, self._API_URL_TEMPLATE % query - def _call_api(self, path, video_id, note, timestamp=None, post_data=None): + def _call_api( + self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True): + if query is None: + timestamp, sig, url = self._sign_query(path) + else: + url = self._API_URL_TEMPLATE % self._api_query(path, version=4) resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note, - headers={'x-viki-app-ver': self._APP_VERSION}) - - error = resp.get('error') - if error: - if error == 'invalid timestamp': - resp = self._download_json( - self._prepare_call(path, int(resp['current_timestamp']), post_data), - video_id, '%s (retry)' % note) - error = resp.get('error') - if error: - self._raise_error(resp['error']) + url, video_id, note, fatal=fatal, query=query, + data=json.dumps(data).encode('utf-8') if data else None, + headers=({'x-viki-app-ver': self._APP_VERSION} if data + else self._stream_headers(timestamp, sig) if query is None + else None), expected_status=400) or {} + self._raise_error(resp.get('error'), fatal) return resp - def _raise_error(self, error): - raise ExtractorError( 
- '%s returned error: %s' % (self.IE_NAME, error), - expected=True) + def _raise_error(self, error, fatal=True): + if error is None: + return + msg = '%s said: %s' % (self.IE_NAME, error) + if fatal: + raise ExtractorError(msg, expected=True) + else: + self.report_warning(msg) def _check_errors(self, data): for reason, status in (data.get('blocking') or {}).items(): @@ -90,9 +98,10 @@ class VikiBaseIE(InfoExtractor): if reason == 'geo': self.raise_geo_restricted(msg=message) elif reason == 'paywall': + if try_get(data, lambda x: x['paywallable']['tvod']): + self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)') self.raise_login_required(message) - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, message), expected=True) + self._raise_error(message) def _real_initialize(self): self._login() @@ -102,35 +111,39 @@ class VikiBaseIE(InfoExtractor): if username is None: return - login_form = { - 'login_id': username, - 'password': password, - } - - login = self._call_api( - 'sessions.json', None, - 'Logging in', post_data=login_form) - - self._token = login.get('token') + self._token = self._call_api( + 'sessions.json', None, 'Logging in', fatal=False, + data={'username': username, 'password': password}).get('token') if not self._token: - self.report_warning('Unable to get session token, login has probably failed') + self.report_warning('Login Failed: Unable to get session token') @staticmethod - def dict_selection(dict_obj, preferred_key, allow_fallback=True): + def dict_selection(dict_obj, preferred_key): if preferred_key in dict_obj: - return dict_obj.get(preferred_key) - - if not allow_fallback: - return - - filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()])) - return filtered_dict[0] if filtered_dict else None + return dict_obj[preferred_key] + return (list(filter(None, dict_obj.values())) or [None])[0] class VikiIE(VikiBaseIE): IE_NAME = 'viki' _VALID_URL = r'%s(?:videos|player)/(?P[0-9]+v)' % 
VikiBaseIE._VALID_URL_BASE _TESTS = [{ + 'note': 'Free non-DRM video with storyboards in MPD', + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'info_dict': { + 'id': '1175236v', + 'ext': 'mp4', + 'title': 'Choosing Spouse by Lottery - Episode 1', + 'timestamp': 1606463239, + 'age_limit': 12, + 'uploader': 'FCC', + 'upload_date': '20201127', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], + 'params': { + 'format': 'bestvideo', + }, + }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { 'id': '1023585v', @@ -146,7 +159,7 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'skip': 'Blocked in the US', + 'skip': 'Content is only available to Viki Pass Plus subscribers', 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip @@ -178,11 +191,11 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, }, - 'skip': 'Blocked in the US', + 'skip': 'Page not found!', }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', + 'md5': '670440c79f7109ca6564d4c7f24e3e81', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -193,7 +206,7 @@ class VikiIE(VikiBaseIE): 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, - 'age_limit': 13, + 'age_limit': 15, 'episode_number': 1, }, 'params': { @@ -224,7 +237,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', + 'md5': '78bf49fdaa51f9e7f9150262a9ef9bdf', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -232,8 +245,8 @@ class VikiIE(VikiBaseIE): 'upload_date': '20111122', 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, + 'title': 'Love in Magic', + 'age_limit': 15, }, 'params': { 'format': 'bestvideo', @@ 
-244,45 +257,53 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - resp = self._download_json( - 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', headers={ - 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '3.0.0', - }) - video = resp['video'] + video = self._call_api('videos/{0}.json'.format(video_id), video_id, 'Downloading video JSON', query={}) self._check_errors(video) - title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + title = try_get(video, lambda x: x['titles']['en'], str) episode_number = int_or_none(video.get('number')) if not title: title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') - title = '%s - %s' % (container_title, title) + if container_title and title == video_id: + title = container_title + else: + title = '%s - %s' % (container_title, title) + + resp = self._call_api( + 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID), + video_id, 'Downloading video streams JSON')['main'][0] + + mpd_url = resp['url'] + # 720p is hidden in another MPD which can be found in the current manifest content + mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') + mpd_url = self._search_regex( + r'(?mi)(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) + if 'mpdhd_high' not in mpd_url: + # Modify the URL to get 1080p + mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') + formats = self._extract_mpd_formats(mpd_url, video_id) + self._sort_formats(formats) description = self.dict_selection(video.get('descriptions', {}), 'en') - + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail['url'], + } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if 
thumbnail.get('url')] like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) - thumbnails = [] - for thumbnail_id, thumbnail in (video.get('images') or {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail.get('url'), - }) + stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) + subtitles = dict((lang, [{ + 'ext': ext, + 'url': self._API_URL_TEMPLATE % self._api_query( + 'videos/{0}/auth_subtitles/{1}.{2}'.format(video_id, lang, ext), stream_id=stream_id) + } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys()) - subtitles = {} - for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), - } for subtitles_format in ('srt', 'vtt')] - - result = { + return { 'id': video_id, + 'formats': formats, 'title': title, 'description': description, 'duration': int_or_none(video.get('duration')), @@ -296,79 +317,6 @@ class VikiIE(VikiBaseIE): 'episode_number': episode_number, } - formats = [] - - def add_format(format_id, format_dict, protocol='http'): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - return - format_url = format_dict.get('url') - if not format_url: - return - qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) - stream = qs.get('stream', [None])[0] - if stream: - format_url = base64.b64decode(stream).decode() - if format_id in ('m3u8', 'hls'): - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if '_drm/index_' in f['url']: - continue - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.append(f) - elif 
format_id in ('mpd', 'dash'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', - format_url) - if not mobj: - return - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)), - }) - - for format_id, format_dict in (resp.get('streams') or {}).items(): - add_format(format_id, format_dict) - if not formats: - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - - for format_id, stream_dict in streams.items(): - for protocol, format_dict in stream_dict.items(): - add_format(format_id, format_dict, protocol) - self._sort_formats(formats) - - result['formats'] = formats - return result - class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' @@ -378,9 +326,9 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', + 'description': 'md5:f08b679c200e1a273c695fe9986f21d7', }, - 'playlist_mincount': 71, + 'playlist_mincount': 51, }, { 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', 'info_dict': { @@ -401,33 +349,38 @@ class VikiChannelIE(VikiBaseIE): 'only_matching': True, }] - _PER_PAGE = 25 + _video_types = ('episodes', 'movies', 'clips', 'trailers') + + def _entries(self, channel_id): + params = { + 'app': self._APP, 'token': self._token, 'only_ids': 'true', + 'direction': 'asc', 'sort': 'number', 
'per_page': 30 + } + video_types = self._video_types + for video_type in video_types: + if video_type not in self._video_types: + self.report_warning('Unknown video_type: ' + video_type) + page_num = 0 + while True: + page_num += 1 + params['page'] = page_num + res = self._call_api( + 'containers/{channel_id}/{video_type}.json'.format(**locals()), channel_id, query=params, fatal=False, + note='Downloading %s JSON page %d' % (video_type.title(), page_num)) + + for video_id in res.get('response') or []: + yield self.url_result('https://www.viki.com/videos/' + video_id, VikiIE.ie_key(), video_id) + if not res.get('more'): + break def _real_extract(self, url): channel_id = self._match_id(url) - channel = self._call_api( - 'containers/%s.json' % channel_id, channel_id, - 'Downloading channel JSON') + channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') self._check_errors(channel) - title = self.dict_selection(channel['titles'], 'en') - - description = self.dict_selection(channel['descriptions'], 'en') - - entries = [] - for video_type in ('episodes', 'clips', 'movies'): - for page_num in itertools.count(1): - page = self._call_api( - 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' - % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, - 'Downloading %s JSON page #%d' % (video_type, page_num)) - for video in page['response']: - video_id = video['id'] - entries.append(self.url_result( - 'https://www.viki.com/videos/%s' % video_id, 'Viki')) - if not page['pagination']['next']: - break - - return self.playlist_result(entries, channel_id, title, description) + return self.playlist_result( + self._entries(channel_id), channel_id, + self.dict_selection(channel['titles'], 'en'), + self.dict_selection(channel['descriptions'], 'en')) From b494824286f0ac2fc7313452b287fbbffe61ccbe Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 13:11:11 +0000 Subject: [PATCH 1292/1705] 
Support Tele5 pages with Discovery Networks format instead of JWPlatform --- youtube_dl/extractor/tele5.py | 86 ++++++++++++++--------------------- 1 file changed, 35 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 3e1a7a9e6..df02dfc47 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,19 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE from ..compat import compat_urlparse from ..utils import ( - NO_DEFAULT, - smuggle_url, + ExtractorError, + extract_attributes, ) +from .dplay import DPlayIE -class Tele5IE(InfoExtractor): + +class Tele5IE(DPlayIE): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -28,6 +25,7 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'No longer available: "404 Seite nicht gefunden"', }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', @@ -42,7 +40,20 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [JWPlatformIE.ie_key()], + 'skip': 'No longer available, redirects to Filme page', + }, { + 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'info_dict': { + 'id': '1252360', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641762000, + 'title': 'Angel of Mine', + 'description': 'md5:a72546a175e1286eb3251843a52d1ad7', + }, + 'params': { + 'format': 'bestvideo', + }, }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -64,45 +75,18 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = 
r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_element = self._search_regex(r'(]+?>)', webpage, 'video player') + player_info = extract_attributes(player_element) + asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) + endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname + source_type = player_info.get('sourcetype') + if source_type: + endpoint = '%s-%s' % (source_type, endpoint) + try: + return self._get_disco_api_info(url, asset_id, endpoint, realm, country) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + raise ExtractorError('DRM protected', cause=e, expected=True) + raise From 4186e817772d49d6f66b07c5ac8c248f026a6446 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 
03:13:37 +0000 Subject: [PATCH 1293/1705] NDR: improve extraction of NDR id, description, etc with current page formats --- youtube_dl/extractor/ndr.py | 45 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ddd828d92..a0d553f00 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, merge_dicts, parse_iso8601, @@ -20,13 +22,13 @@ class NDRBaseIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = next(group for group in mobj.groups() if group) webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id) + return self._extract_embed(webpage, display_id, url) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:\w+\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -109,19 +111,38 @@ class NDRIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): - embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'embed URL', group='url') + def _extract_embed(self, webpage, display_id, url): + embed_url = ( + self._html_search_meta( + 'embedURL', webpage, 'embed URL', + default=None) + or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default=None) + or self._search_regex( + 
r'\bvar\s*sophoraID\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default='')) + # some more work needed if we only found sophoraID + if re.match(r'^[a-z]+\d+$', embed_url): + # get the initial part of the url path,. eg /panorama/archiv/2022/ + parsed_url = compat_urllib_parse_urlparse(url) + path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + # find tell-tale image with the actual ID + ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + # or try to use special knowledge! + NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' + embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + if not embed_url: + raise ExtractorError('Unable to extract embedUrl') + description = self._search_regex( r']+itemprop="description">([^<]+)

    ', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) + (r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P[^"]+)"', + r'\bvar\s*pdt\s*=\s*(?P["\'])(?P(?:(?!(?P=q)).)+)(?P=q)', ), + webpage, 'upload date', group='cont', default=None)) info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts({ '_type': 'url_transparent', @@ -179,7 +200,7 @@ class NJoyIE(NDRBaseIE): video_id = self._search_regex( r']+id="pp_([\da-z]+)"', webpage, 'embed id') description = self._search_regex( - r']+class="subline"[^>]*>[^<]+\s*

    ([^<]+)

    ', + r']+class="subline"[^>]*>[^<]+\s*

    ([^<]+)

    ', webpage, 'description', fatal=False) return { '_type': 'url_transparent', @@ -291,7 +312,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From f0a05a55c2ee512880546c056cfbec5ad3399798 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 03:22:32 +0000 Subject: [PATCH 1294/1705] NJoy: improve extraction of NDR id, description, etc with current page formats --- youtube_dl/extractor/ndr.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index a0d553f00..0a723e3b0 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -196,18 +196,25 @@ class NJoyIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): + def _extract_embed(self, webpage, display_id, url=None): + # find tell-tale URL with the actual ID, or ... video_id = self._search_regex( - r']+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( + (r'''\bsrc\s*=\s*(?:"|')?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + r']+id="pp_([\da-z]+)"', ), + webpage, 'NDR id', default=None) + + description = ( + self._html_search_meta('description', webpage) + or self._search_regex( r']+class="subline"[^>]*>[^<]+\s*

    ([^<]+)

    ', - webpage, 'description', fatal=False) + webpage, 'description', fatal=False)) return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', 'url': 'ndr:%s' % video_id, 'display_id': display_id, 'description': description, + 'title': display_id.replace('-', ' ').strip(), } From 39a98b09a2acf50dc64bc41185be723b98e740b9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jan 2022 03:29:43 +0000 Subject: [PATCH 1295/1705] Fix NDR, NJoy tests --- youtube_dl/extractor/ndr.py | 41 ++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0a723e3b0..1996d4f96 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -40,13 +40,14 @@ class NDRIE(NDRBaseIE): 'title': 'Party, Pötte und Parade', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', 'uploader': 'ndrtv', - 'timestamp': 1431108900, + 'timestamp': 1431255671, 'upload_date': '20150510', 'duration': 3498, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpVideo, different content id 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', @@ -65,6 +66,7 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudio, same content id 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', @@ -76,8 +78,8 @@ class NDRIE(NDRBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', 'uploader': 'ndrinfo', - 'timestamp': 1290626100, - 'upload_date': '20140729', + 'timestamp': 1631711863, + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -91,9 +93,10 @@ class NDRIE(NDRBaseIE): 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', 'ext': 'mp4', 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'description': 
'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', 'uploader': 'ndrtv', - 'upload_date': '20201113', + 'upload_date': '20201207', + 'timestamp': 1614349457, 'duration': 1749, 'subtitles': { 'de': [{ @@ -174,19 +177,19 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideo, different content id 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', 'md5': '417660fffa90e6df2fda19f1b40a64d8', 'info_dict': { - 'id': 'dockville882', + 'id': 'livestream283', 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'ext': 'mp3', + 'title': 'Das frueheste DJ Set des Nordens live mit Felix Jaehn', + 'description': 'md5:681698f527b8601e511e7b79edde7d2c', 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, + 'upload_date': '20210830', }, 'params': { 'skip_download': True, @@ -332,6 +335,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'upload_date': '20150907', 'duration': 132, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', 'md5': '002085c44bae38802d94ae5802a36e78', @@ -347,6 +351,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/info/audio51535-player.html', 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', @@ -356,7 +361,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'is_live': False, 'uploader': 'ndrinfo', - 'upload_date': '20140729', + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -377,15 +382,17 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideoLive 'url': 
'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', 'info_dict': { 'id': 'livestream217', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, - 'upload_date': '20150910', + 'upload_date': '20210409', + 'uploader': 'ndrtv', }, 'params': { 'skip_download': True, @@ -423,9 +430,10 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'ext': 'mp4', 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', 'is_live': False, - 'upload_date': '20150807', + 'upload_date': '20200826', 'duration': 1011, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpAudio 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', @@ -442,6 +450,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudioLive, no explicit ext 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', @@ -451,7 +460,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', - 'upload_date': '20150810', + 'upload_date': '20210830', }, 'params': { 'skip_download': True, From 01824d275bfa7efbaca274b38c1ddc2b03f12f5d Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 19 Jan 2022 13:24:33 +0000 Subject: [PATCH 1296/1705] Additional tweaks: allow any .ndr.de, simplify quote match --- youtube_dl/extractor/ndr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 1996d4f96..26627f8b0 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -28,7 +28,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = 
r'https?://(?:\w+\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -202,7 +202,7 @@ class NJoyIE(NDRBaseIE): def _extract_embed(self, webpage, display_id, url=None): # find tell-tale URL with the actual ID, or ... video_id = self._search_regex( - (r'''\bsrc\s*=\s*(?:"|')?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', r']+id="pp_([\da-z]+)"', ), webpage, 'NDR id', default=None) @@ -322,7 +322,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From 5197336de6ee2d18c37732f3f7c6532c8899ec29 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 14 Jan 2022 20:14:14 +0000 Subject: [PATCH 1297/1705] Support more deeply nested ptmd_path with test, update tests --- youtube_dl/extractor/zdf.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 4dd56f66d..3d39bb33a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, + ExtractorError, float_or_none, int_or_none, merge_dicts, @@ -145,6 +146,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1613948400, 'upload_date': '20210221', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { # Same as 
https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', @@ -158,6 +160,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { @@ -190,6 +193,17 @@ class ZDFIE(ZDFBaseIE): }, { 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', + 'info_dict': { + 'id': 'video_artede_083871-001-A', + 'ext': 'mp4', + 'title': 'Tödliche Flucht (1/6)', + 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', + 'duration': 3193.0, + 'timestamp': 1641355200, + 'upload_date': '20220105', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -197,12 +211,18 @@ class ZDFIE(ZDFBaseIE): t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + def get_ptmd_path(d): + return ( + d.get('http://zdf.de/rels/streams/ptmd') + or d.get('http://zdf.de/rels/streams/ptmd-template', + '').replace('{playerId}', 'ngplayer_2_4')) + + ptmd_path = get_ptmd_path(try_get(t, lambda x: x['streams']['default'], dict) or {}) + if not ptmd_path: + ptmd_path = get_ptmd_path(t) if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') + raise ExtractorError('Could not extract ptmd_path') info = self._extract_ptmd( urljoin(url, ptmd_path), video_id, player['apiToken'], url) From 5cb4833f408745135d1b0e178b9a2545a899f2ac Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 13 Jan 2022 19:38:08 +0000 Subject: [PATCH 1298/1705] Update URPlayIE extractor for Next.js page format, with 
subtitles --- youtube_dl/extractor/urplay.py | 52 ++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index d6c79147e..abd2bee84 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, + ISO639Utils, + parse_age_limit, + try_get, unified_timestamp, ) @@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Kultur & historia'], + 'categories': ['Vetenskap & teknik'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', + 'age_limit': 15, }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -50,11 +55,19 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + urplayer_data = self._search_regex( + r'(?s)\bid\s*=\s*"__NEXT_DATA__"[^>]*>\s*({.+?})\s*]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id)) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( @@ -72,6 +85,30 @@ class URPlayIE(InfoExtractor): 
video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) + subtitles = {} + + def parse_lang_code(code): + "3-character language code or None (utils candidate)" + if code is None: + return + lang = code.lower() + if not ISO639Utils.long2short(lang): + lang = ISO639Utils.short2long(lang) + return lang or None + + for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl + image = urplayer_data.get('image') or {} thumbnails = [] for k, v in image.items(): @@ -104,4 +141,7 @@ class URPlayIE(InfoExtractor): 'season': series.get('label'), 'episode': episode, 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0 + for a in urplayer_data.get('ageRanges', []))), + 'subtitles': subtitles, } From 568c7005d513d0398c20b9e88eb9838c68651fc2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 25 Jan 2022 12:59:31 +0000 Subject: [PATCH 1299/1705] Fix WDRMaus; extend URL matching for other Maus pages; improve ID extraction --- youtube_dl/extractor/wdr.py | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 2903d189e..a5488f3fd 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, js_to_json, strip_jsonp, @@ -22,9 +23,10 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P\d+)\.js' + __API_URL_TPL = 
'//deviceids-medp.wdr.de/ondemand/%s/%s' + _VALID_URL = (r'(?:https?:' + __API_URL_TPL) % (r'\d+', r'(?=\d+\.js)|wdr:)(?P\d{6,})') _GEO_COUNTRIES = ['DE'] - _TEST = { + _TESTS = [{ 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { 'id': 'mdb-1557833', @@ -32,11 +34,20 @@ class WDRIE(InfoExtractor): 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', 'upload_date': '20180112', }, - } + }, + ] + + def _asset_url(self, wdr_id): + id_len = max(len(wdr_id), 5) + return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js')) def _real_extract(self, url): video_id = self._match_id(url) + if url.startswith('wdr:'): + video_id = url[4:] + url = self._asset_url(video_id) + metadata = self._download_json( url, video_id, transform_source=strip_jsonp) @@ -115,10 +126,10 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +class WDRPageIE(WDRIE): + _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX _TESTS = [ { @@ -180,12 +191,12 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1552552', + 'id': 'mdb-2627637', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, - 'skip': 'The id changes from week to week because of the new episode' + # 'skip': 'The id changes from week to week because of the new episode' }, { 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', @@ -234,7 
+245,7 @@ class WDRPageIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus') webpage = self._download_webpage(url, display_id) entries = [] @@ -260,6 +271,14 @@ class WDRPageIE(InfoExtractor): jsonp_url = try_get( media_link_obj, lambda x: x['mediaObj']['url'], compat_str) if jsonp_url: + # metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps + clip_id = media_link_obj['mediaObj'].get('ref') + if jsonp_url.endswith('.assetjsonp'): + asset = self._download_json( + jsonp_url, display_id, fatal=False, transform_source=strip_jsonp) + clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str) + if clip_id: + jsonp_url = self._asset_url(clip_id[4:]) entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) From 96423449659131ed8e7bfaa7f791466c3f8f2db1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 25 Jan 2022 13:04:04 +0000 Subject: [PATCH 1300/1705] Fix tests for working IEs; disable obsolete WDRMobile --- youtube_dl/extractor/wdr.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a5488f3fd..10db73148 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -170,11 +170,11 @@ class WDRPageIE(WDRIE): { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-1406149', + 'id': 'mdb-2296252', 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': 
'20150101', + 'upload_date': '20201112', 'is_live': True, }, 'params': { @@ -183,7 +183,7 @@ class WDRPageIE(WDRIE): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, + 'playlist_mincount': 6, 'info_dict': { 'id': 'aktuelle-stunde-120', }, @@ -196,7 +196,7 @@ class WDRPageIE(WDRIE): 'upload_date': 're:^[0-9]{8}$', 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, - # 'skip': 'The id changes from week to week because of the new episode' + 'skip': 'The id changes from week to week because of the new episode' }, { 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5', @@ -207,6 +207,7 @@ class WDRPageIE(WDRIE): 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', @@ -232,6 +233,7 @@ class WDRPageIE(WDRIE): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', @@ -298,16 +300,14 @@ class WDRPageIE(WDRIE): class WDRElefantIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P.+)' _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe', + # adaptive stream: unstable file MD5 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', + 'title': 'Wippe', + 'id': 'mdb-1198320', 'ext': 'mp4', 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, + 'upload_date': '20071003' }, } @@ -342,6 +342,7 @@ class WDRMobileIE(InfoExtractor): /[0-9]+/[0-9]+/ (?P[0-9]+)_(?P[0-9]+)''' IE_NAME = 'wdr:mobile' + _WORKING = False # no such domain _TEST = { 'url': 
'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { From 23ad6402a6966dd09e4c854f32c33f69be1a064e Mon Sep 17 00:00:00 2001 From: Chris Rose <offline@offby1.net> Date: Fri, 26 Nov 2021 08:08:17 -0800 Subject: [PATCH 1301/1705] xvideos: Fix for #30271 --- youtube_dl/extractor/xvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 8fc64914c..e63d4690d 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -82,7 +82,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/0' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: From 005339d6375f2d2a4cec962b1c1a157c1dffbf8f Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 8 Dec 2021 23:37:54 +0000 Subject: [PATCH 1302/1705] [applepodcasts] Support new AMP-ish page structure --- youtube_dl/extractor/applepodcasts.py | 43 ++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 6a74de758..f0186d4bf 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, parse_iso8601, try_get, @@ -14,15 +16,15 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': 
'41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', } }, { @@ -39,17 +41,38 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 
'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) return { 'id': episode_id, From e00b0eab1e78ed822683b2689f60eab85514ac42 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 9 Dec 2021 00:55:04 +0000 Subject: [PATCH 1303/1705] [applepodcasts] Improve format extraction Set acodec and vcodec, etc, to avoid breaking, eg, bestaudio --- youtube_dl/extractor/applepodcasts.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index f0186d4bf..dd413a289 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -7,6 +7,7 @@ from ..utils import ( clean_podcast_url, get_element_by_class, int_or_none, + parse_codecs, parse_iso8601, try_get, ) @@ -74,7 +75,7 @@ class ApplePodcastsIE(InfoExtractor): series = try_get(inc, lambda x: x['attributes']['name']) series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) - return { + info = [{ 'id': episode_id, 'title': episode['name'], 'url': clean_podcast_url(episode['assetUrl']), @@ -82,4 +83,9 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, - } + }] + self._sort_formats(info) + info = info[0] + codecs = parse_codecs(info.get('ext', 'mp3')) + info.update(codecs) + return info From 584715a803eef68f68fbbb8b72a022a699983197 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 9 Dec 2021 01:35:35 +0000 Subject: [PATCH 1304/1705] [applepodcasts] Extract default thumbnail image --- youtube_dl/extractor/applepodcasts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index dd413a289..95e0f663c 100644 --- 
a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -27,6 +27,7 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': 1593932400, 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -83,6 +84,7 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, + 'thumbnail': self._og_search_thumbnail(webpage), }] self._sort_formats(info) info = info[0] From 73e1ab6125eeea2b07942326cd2f1d6d9adff64e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 6 Dec 2021 19:26:33 +0000 Subject: [PATCH 1305/1705] [test:download] Only extract enough videos for playlist_mincount --- test/parameters.json | 1 - test/test_download.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parameters.json b/test/parameters.json index 65fd54428..864c9d130 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -18,7 +18,6 @@ "noprogress": false, "outtmpl": "%(id)s.%(ext)s", "password": null, - "playlistend": -1, "playliststart": 1, "prefer_free_formats": false, "quiet": false, diff --git a/test/test_download.py b/test/test_download.py index ebe820dfc..8e43cfa12 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -121,6 +121,7 @@ def generator(test_case, tname): params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') + params.setdefault('playlistend', test_case.get('playlist_mincount')) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) From 91278f4b6b5600e9ce65826ec9e7e38e7dba5937 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 6 Dec 2021 20:52:21 +0000 Subject: [PATCH 1306/1705] 
[niconico] Back-port extractor from yt-dlp Add Nico search extractors, fix extraction --- youtube_dl/extractor/extractors.py | 9 +- youtube_dl/extractor/niconico.py | 646 +++++++++++++++++++++-------- 2 files changed, 477 insertions(+), 178 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4e9954c6a..e70daf2b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -789,7 +789,14 @@ from .nick import ( NickNightIE, NickRuIE, ) -from .niconico import NiconicoIE, NiconicoPlaylistIE +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NicovideoSearchIE, + NicovideoSearchDateIE, + NicovideoSearchURLIE, +) from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index a85fc3d5c..756ad0e25 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -2,25 +2,28 @@ from __future__ import unicode_literals import datetime -import functools +import itertools import json -import math +import re -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, - dict_get, ExtractorError, + dict_get, float_or_none, - InAdvancePagedList, int_or_none, + OnDemandPagedList, parse_duration, parse_iso8601, + PostProcessingError, remove_start, + str_or_none, try_get, unified_timestamp, urlencode_postdata, @@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', + 'md5': 'a5bad06f1347452102953f323c69da34s', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -162,6 +165,11 @@ class NiconicoIE(InfoExtractor): 
_VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + def _real_initialize(self): self._login() @@ -191,37 +199,89 @@ class NiconicoIE(InfoExtractor): self._downloader.report_warning('unable to log in: bad username or password') return login_ok - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def yesno(boolean): - return 'yes' if boolean else 'no' + def _get_heartbeat_info(self, info_dict): - session_api_data = api_data['video']['dmcInfo']['session_api'] - session_api_endpoint = session_api_data['urls'][0] + video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + api_data = ( + info_dict.get('_api_data') + or self._parse_json( + self._html_search_regex( + 'data-api-data="([^"]+)"', + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id), + 'API data', default='{}'), + video_id)) + + session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) + session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) + + def ping(): + status = try_get( + self._download_json( + 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, + query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, + note='Acquiring permission for downloading video', + headers=self._API_HEADERS), + lambda x: x['meta']['status']) + if status != 200: + self.report_warning('Failed to acquire permission for playing video. 
The video may not download.') + + yesno = lambda x: 'yes' if x else 'no' + + # m3u8 (encryption) + if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None: + protocol = 'm3u8' + encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption'] + session_api_http_parameters = { + 'parameters': { + 'hls_parameters': { + 'encryption': { + encryption: { + 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']), + 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri']) + } + }, + 'transfer_preset': '', + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + 'segment_duration': 6000, + } + } + } + # http + else: + protocol = 'http' + session_api_http_parameters = { + 'parameters': { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + } + } + } session_response = self._download_json( session_api_endpoint['url'], video_id, query={'_format': 'json'}, headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for %s' % format_id, + note='Downloading JSON metadata for %s' % info_dict['format_id'], data=json.dumps({ 'session': { 'client_info': { - 'player_id': session_api_data['player_id'], + 'player_id': session_api_data.get('playerId'), }, 'content_auth': { - 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], - 'content_key_timeout': session_api_data['content_key_timeout'], + 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), + 'content_key_timeout': session_api_data.get('contentKeyTimeout'), 'service_id': 'nicovideo', - 'service_user_id': session_api_data['service_user_id'] + 'service_user_id': session_api_data.get('serviceUserId') }, - 'content_id': session_api_data['content_id'], 
+ 'content_id': session_api_data.get('contentId'), 'content_src_id_sets': [{ 'content_src_ids': [{ 'src_id_to_mux': { - 'audio_src_ids': [audio_quality['id']], - 'video_src_ids': [video_quality['id']], + 'audio_src_ids': [audio_src_id], + 'video_src_ids': [video_src_id], } }] }], @@ -229,52 +289,81 @@ class NiconicoIE(InfoExtractor): 'content_uri': '', 'keep_method': { 'heartbeat': { - 'lifetime': session_api_data['heartbeat_lifetime'] + 'lifetime': session_api_data.get('heartbeatLifetime') } }, - 'priority': session_api_data['priority'], + 'priority': session_api_data.get('priority'), 'protocol': { 'name': 'http', 'parameters': { - 'http_parameters': { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['is_ssl']), - 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']), - } - } - } + 'http_parameters': session_api_http_parameters } }, - 'recipe_id': session_api_data['recipe_id'], + 'recipe_id': session_api_data.get('recipeId'), 'session_operation_auth': { 'session_operation_auth_by_signature': { - 'signature': session_api_data['signature'], - 'token': session_api_data['token'], + 'signature': session_api_data.get('signature'), + 'token': session_api_data.get('token'), } }, 'timing_constraint': 'unlimited' } }).encode()) - resolution = video_quality.get('resolution', {}) + info_dict['url'] = session_response['data']['session']['content_uri'] + info_dict['protocol'] = protocol + + # get heartbeat info + heartbeat_info_dict = { + 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT', + 'data': json.dumps(session_response['data']), + # interval, convert milliseconds to seconds, then halve to make a buffer. 
+ 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), + 'ping': ping + } + + return info_dict, heartbeat_info_dict + + def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): + def parse_format_id(id_code): + mobj = re.match(r'''(?x) + (?:archive_)? + (?:(?P<codec>[^_]+)_)? + (?:(?P<br>[\d]+)kbps_)? + (?:(?P<res>[\d+]+)p_)? + ''', '%s_' % id_code) + return mobj.groupdict() if mobj else {} + + protocol = 'niconico_dmc' + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + vdict = parse_format_id(video_quality['id']) + adict = parse_format_id(audio_quality['id']) + resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} + vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) return { - 'url': session_response['data']['session']['content_uri'], + 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, + 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'abr': float_or_none(audio_quality.get('bitrate'), 1000), - 'vbr': float_or_none(video_quality.get('bitrate'), 1000), - 'height': resolution.get('height'), - 'width': resolution.get('width'), + 'vcodec': vdict.get('codec'), + 'acodec': adict.get('codec'), + 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), + 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), + 'height': int_or_none(resolution.get('height', vdict.get('res'))), + 'width': int_or_none(resolution.get('width')), + 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 + 'protocol': protocol, + 'http_headers': { + 'Origin': 'https://www.nicovideo.jp', + 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, + } } def 
_real_extract(self, url): video_id = self._match_id(url) - # Get video webpage. We are not actually interested in it for normal - # cases, but need the cookies in order to be able to download the - # info webpage + # Get video webpage for API data. webpage, handle = self._download_webpage_handle( 'http://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): @@ -284,86 +373,136 @@ class NiconicoIE(InfoExtractor): 'data-api-data="([^"]+)"', webpage, 'API data', default='{}'), video_id) - def _format_id_from_url(video_url): - return 'economy' if video_real_url.endswith('low') else 'normal' + def get_video_info_web(items): + return dict_get(api_data['video'], items) - try: - video_real_url = api_data['video']['smileInfo']['url'] - except KeyError: # Flash videos - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') + # Get video info + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') - flv_info = compat_parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - elif 'error' in flv_info: - raise ExtractorError('%s reports error: %s' % ( - self.IE_NAME, flv_info['error'][0]), expected=True) - else: - raise ExtractorError('Unable to find video URL') + def get_video_info_xml(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') + if get_video_info_xml('error'): + error_code = get_video_info_xml('code') - def 
get_video_info(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret + if error_code == 'DELETED': + raise ExtractorError('The video has been deleted.', + expected=True) + elif error_code == 'NOT_FOUND': + raise ExtractorError('The video is not found.', + expected=True) + elif error_code == 'COMMUNITY': + self.to_screen('%s: The video is community members only.' % video_id) + else: + raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) - video_real_url = flv_info['url'][0] + # Start extracting video formats + formats = [] - extension = get_video_info('movie_type') - if not extension: - extension = determine_ext(video_real_url) + # Get HTML5 videos info + quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) + if not quality_info: + raise ExtractorError('The video can\'t be downloaded', expected=True) - formats = [{ - 'url': video_real_url, - 'ext': extension, - 'format_id': _format_id_from_url(video_real_url), - }] - else: - formats = [] + for audio_quality in quality_info.get('audios') or {}: + for video_quality in quality_info.get('videos') or {}: + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) - dmc_info = api_data['video'].get('dmcInfo') - if dmc_info: # "New" HTML5 videos - quality_info = dmc_info['quality'] - for audio_quality in quality_info['audios']: - for video_quality in quality_info['videos']: - if not audio_quality['available'] or not video_quality['available']: - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) + # Get flv/swf info + timestamp = None + video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) + if video_real_url: + is_economy = video_real_url.endswith('low') - 
self._sort_formats(formats) - else: # "Old" HTML5 videos - formats = [{ + if is_economy: + self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams') + + # Invoking ffprobe to determine resolution + pp = FFmpegPostProcessor(self._downloader) + cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n') + + self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe')) + + try: + metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies]) + except PostProcessingError as err: + raise ExtractorError(err.msg, expected=True) + + v_stream = a_stream = {} + + # Some complex swf files doesn't have video stream (e.g. nm4809023) + for stream in metadata['streams']: + if stream['codec_type'] == 'video': + v_stream = stream + elif stream['codec_type'] == 'audio': + a_stream = stream + + # Community restricted videos seem to have issues with the thumb API not returning anything at all + filesize = int( + (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low')) + or metadata['format']['size'] + ) + extension = ( + get_video_info_xml('movie_type') + or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name'] + ) + + # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'. 
+ timestamp = ( + parse_iso8601(get_video_info_web('first_retrieve')) + or unified_timestamp(get_video_info_web('postedDateTime')) + ) + metadata_timestamp = ( + parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time'])) + or timestamp if extension != 'mp4' else 0 + ) + + # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts + smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00') + + is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0 + + # If movie file size is unstable, old server movie is not source movie. + if filesize > 1: + formats.append({ 'url': video_real_url, - 'ext': 'mp4', - 'format_id': _format_id_from_url(video_real_url), - }] + 'format_id': 'smile' if not is_economy else 'smile_low', + 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality', + 'ext': extension, + 'container': extension, + 'vcodec': v_stream.get('codec_name'), + 'acodec': a_stream.get('codec_name'), + # Some complex swf files doesn't have total bit rate metadata (e.g. 
nm6049209) + 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), + 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), + 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), + 'height': int_or_none(v_stream.get('height')), + 'width': int_or_none(v_stream.get('width')), + 'source_preference': 5 if not is_economy else -2, + 'quality': 5 if is_source and not is_economy else None, + 'filesize': filesize + }) - def get_video_info(items): - return dict_get(api_data['video'], items) + self._sort_formats(formats) # Start extracting information - title = get_video_info('title') - if not title: - title = self._og_search_title(webpage, default=None) - if not title: - title = self._html_search_regex( + title = ( + get_video_info_xml('title') # prefer to get the untranslated original title + or get_video_info_web(['originalTitle', 'title']) + or self._og_search_title(webpage, default=None) + or self._html_search_regex( r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title') + webpage, 'video title')) watch_api_data_string = self._html_search_regex( r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', @@ -372,14 +511,15 @@ class NiconicoIE(InfoExtractor): video_detail = watch_api_data.get('videoDetail', {}) thumbnail = ( - get_video_info(['thumbnail_url', 'thumbnailURL']) + self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) + or dict_get( # choose highest from 720p to 240p + get_video_info_web('thumbnail'), + ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) - description = get_video_info('description') + description = get_video_info_web('description') - timestamp = (parse_iso8601(get_video_info('first_retrieve')) - or unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date 
published', default=None) if match: @@ -388,19 +528,25 @@ class NiconicoIE(InfoExtractor): timestamp = parse_iso8601( video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) + timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) - view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) + view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) if not view_count: match = self._html_search_regex( r'>Views: <strong[^>]*>([^<]+)</strong>', webpage, 'view count', default=None) if match: view_count = int_or_none(match.replace(',', '')) - view_count = view_count or video_detail.get('viewCount') + view_count = ( + view_count + or video_detail.get('viewCount') + or try_get(api_data, lambda x: x['video']['count']['view'])) + + comment_count = ( + int_or_none(get_video_info_web('comment_num')) + or video_detail.get('commentCount') + or try_get(api_data, lambda x: x['video']['count']['comment'])) - comment_count = (int_or_none(get_video_info('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: <strong[^>]*>([^<]+)</strong>', @@ -409,22 +555,41 @@ class NiconicoIE(InfoExtractor): comment_count = int_or_none(match.replace(',', '')) duration = (parse_duration( - get_video_info('length') + get_video_info_web('length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or video_detail.get('length') - or get_video_info('duration')) + or get_video_info_web('duration')) - webpage_url = get_video_info('watch_url') or url + webpage_url = get_video_info_web('watch_url') or url + + # for channel movie and community movie + channel_id = try_get( + api_data, + (lambda x: x['channel']['globalId'], + lambda x: x['community']['globalId'])) + channel = try_get( + api_data, + (lambda x: x['channel']['name'], + 
lambda x: x['community']['name'])) # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" # in the JSON, which will cause None to be returned instead of {}. owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') - uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') + uploader_id = str_or_none( + get_video_info_web(['ch_id', 'user_id']) + or owner.get('id') + or channel_id + ) + uploader = ( + get_video_info_web(['ch_name', 'user_nickname']) + or owner.get('nickname') + or channel + ) return { 'id': video_id, + '_api_data': api_data, 'title': title, 'formats': formats, 'thumbnail': thumbnail, @@ -432,6 +597,8 @@ class NiconicoIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'uploader_id': uploader_id, + 'channel': channel, + 'channel_id': channel_id, 'view_count': view_count, 'comment_count': comment_count, 'duration': duration, @@ -440,7 +607,7 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', @@ -456,60 +623,185 @@ class NiconicoPlaylistIE(InfoExtractor): 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', 'only_matching': True, }] - _PAGE_SIZE = 100 - def _call_api(self, list_id, resource, query): - return self._download_json( - 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, - 'Downloading %s JSON metatdata' % resource, query=query, - headers={'X-Frontend-Id': 6})['data']['mylist'] - - def _parse_owner(self, item): - owner = item.get('owner') or {} - if owner: - return { - 'uploader': owner.get('name'), - 'uploader_id': owner.get('id'), - } - return {} - - def _fetch_page(self, list_id, page): - page += 1 - items = 
self._call_api(list_id, 'page %d' % page, { - 'page': page, - 'pageSize': self._PAGE_SIZE, - })['items'] - for item in items: - video = item.get('video') or {} - video_id = video.get('id') - if not video_id: - continue - count = video.get('count') or {} - get_count = lambda x: int_or_none(count.get(x)) - info = { - '_type': 'url', - 'id': video_id, - 'title': video.get('title'), - 'url': 'https://www.nicovideo.jp/watch/' + video_id, - 'description': video.get('shortDescription'), - 'duration': int_or_none(video.get('duration')), - 'view_count': get_count('view'), - 'comment_count': get_count('comment'), - 'ie_key': NiconicoIE.ie_key(), - } - info.update(self._parse_owner(video)) - yield info + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } def _real_extract(self, url): list_id = self._match_id(url) - mylist = self._call_api(list_id, 'list', { - 'pageSize': 1, - }) - entries = InAdvancePagedList( - functools.partial(self._fetch_page, list_id), - math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), - self._PAGE_SIZE) - result = self.playlist_result( - entries, list_id, mylist.get('name'), mylist.get('description')) - result.update(self._parse_owner(mylist)) - return result + + def get_page_data(pagenum, pagesize): + return self._download_json( + 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + query={'page': 1 + pagenum, 'pageSize': pagesize}, + headers=self._API_HEADERS).get('data').get('mylist') + + data = get_page_data(0, 1) + title = data.get('name') + description = data.get('description') + uploader = data.get('owner').get('name') + uploader_id = data.get('owner').get('id') + + def pagefunc(pagenum): + data = get_page_data(pagenum, 25) + return ({ + '_type': 'url', + 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'), + } for item in data.get('items')) + + return { + '_type': 'playlist', + 'id': list_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'uploader_id': uploader_id, + 
'entries': OnDemandPagedList(pagefunc, 25), + } + + +class NicovideoSearchBaseIE(InfoExtractor): + _MAX_RESULTS = float('inf') + + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): + query = query or {} + pages = [query['page']] if 'page' in query else itertools.count(1) + for page_num in pages: + query['page'] = str(page_num) + webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) + results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.+?)(?=["\'])', webpage) + for item in results: + yield self.url_result('http://www.nicovideo.jp/watch/%s' % item, 'Niconico', item) + if not results: + break + + def _get_n_results(self, query, n): + entries = self._entries(self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + if n < self._MAX_RESULTS: + entries = itertools.islice(entries, 0, n) + return self.playlist_result(entries, query, query) + + +class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search' + IE_NAME = 'nicovideo:search' + _SEARCH_KEY = 'nicosearch' + + def _search_results(self, query): + return self._entries( + self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + + +class NicovideoSearchURLIE(NicovideoSearchBaseIE): + IE_NAME = '%s_url' % NicovideoSearchIE.IE_NAME + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' 
+ _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) + + +class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search, newest first' + IE_NAME = '%s:date' % NicovideoSearchIE.IE_NAME + _SEARCH_KEY = 'nicosearchdate' + + _TESTS = [{ + 'url': 'nicosearchdateall:a', + 'info_dict': { + 'id': 'a', + 'title': 'a' + }, + 'playlist_mincount': 1610, + }] + + _START_DATE = datetime.date(2007, 1, 1) + _RESULTS_PER_PAGE = 32 + _MAX_PAGES = 50 + + def _entries(self, url, item_id, start_date=None, end_date=None): + start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + + # If the last page has a full page of videos, we need to break down the query interval further + last_page_len = len(list(self._get_entries_for_date( + url, item_id, start_date, end_date, self._MAX_PAGES, + note='Checking number of videos from {0} to {1}'.format(start_date, end_date)))) + if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date): + midpoint = start_date + ((end_date - start_date) // 2) + for entry in itertools.chain( + iter(self._entries(url, item_id, midpoint, end_date)), + iter(self._entries(url, item_id, start_date, midpoint))): + yield entry + else: + self.to_screen('{0}: Downloading results from {1} to {2}'.format(item_id, start_date, end_date)) + for entry in iter(self._get_entries_for_date( + url, item_id, start_date, end_date, note=' Downloading page %(page)s')): + yield entry + + def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None): + 
query = { + 'start': compat_str(start_date), + 'end': compat_str(end_date or start_date), + 'sort': 'f', + 'order': 'd', + } + if page_num: + query['page'] = compat_str(page_num) + + for entry in iter(super(NicovideoSearchDateIE, self)._entries(url, item_id, query=query, note=note)): + yield entry + + +class NiconicoUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _TEST = { + 'url': 'https://www.nicovideo.jp/user/419948', + 'info_dict': { + 'id': '419948', + }, + 'playlist_mincount': 101, + } + _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s" + _PAGE_SIZE = 100 + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _entries(self, list_id): + total_count = 1 + count = page_num = 0 + while count < total_count: + json_parsed = self._download_json( + self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id, + headers=self._API_HEADERS, + note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) + if not page_num: + total_count = int_or_none(json_parsed['data'].get('totalCount')) + for entry in json_parsed["data"]["items"]: + count += 1 + yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id']) + page_num += 1 + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id), list_id) From 92d73ef3936ed6de9770f613fddf2260731becc9 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 7 Dec 2021 23:30:30 +0000 Subject: [PATCH 1307/1705] [niconico] Implement heartbeat for download --- youtube_dl/downloader/__init__.py | 25 ++++++++---- youtube_dl/downloader/niconico.py | 66 +++++++++++++++++++++++++++++++ youtube_dl/extractor/niconico.py | 18 +++++++++ 3 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 youtube_dl/downloader/niconico.py diff --git a/youtube_dl/downloader/__init__.py 
b/youtube_dl/downloader/__init__.py index 2e485df9d..d8f2fa342 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,22 +1,31 @@ from __future__ import unicode_literals +from ..utils import ( + determine_protocol, +) + + +def get_suitable_downloader(info_dict, params={}): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + return _get_suitable_downloader(info_copy, params) + + +# Some of these require get_suitable_downloader from .common import FileDownloader +from .dash import DashSegmentsFD from .f4m import F4mFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD -from .dash import DashSegmentsFD from .rtsp import RtspFD from .ism import IsmFD +from .niconico import NiconicoDmcFD from .external import ( get_external_downloader, FFmpegFD, ) -from ..utils import ( - determine_protocol, -) - PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': HlsFD, @@ -26,13 +35,12 @@ PROTOCOL_MAP = { 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'niconico_dmc': NiconicoDmcFD, } -def get_suitable_downloader(info_dict, params={}): +def _get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD @@ -43,6 +51,7 @@ def get_suitable_downloader(info_dict, params={}): if ed.can_download(info_dict): return ed + protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): return FFmpegFD diff --git a/youtube_dl/downloader/niconico.py b/youtube_dl/downloader/niconico.py new file mode 100644 index 000000000..6392c9989 --- /dev/null +++ b/youtube_dl/downloader/niconico.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +try: + import 
threading +except ImportError: + threading = None + +from .common import FileDownloader +from ..downloader import get_suitable_downloader +from ..extractor.niconico import NiconicoIE +from ..utils import sanitized_Request + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + FD_NAME = 'niconico_dmc' + + def real_download(self, filename, info_dict): + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + + if not threading: + self.to_screen('[%s] Threading for Heartbeat not available' % self.FD_NAME) + return fd.real_download(filename, info_dict) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = sanitized_Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' 
% (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 756ad0e25..93f813968 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -160,6 +160,24 @@ class NiconicoIE(InfoExtractor): }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, + }, { + # DMC video with heartbeat + 'url': 'https://www.nicovideo.jp/watch/sm34815188', + 'md5': '9360c6e1f1519d7759e2fe8e1326ae83', + 'info_dict': { + 'id': 'sm34815188', + 'ext': 'mp4', + 'title': 'md5:aee93e9f3366db72f902f6cd5d389cb7', + 'description': 'md5:7b9149fc7a00ab053cafaf5c19662704', + 'thumbnail': r're:https?://.*', + 'uploader': 'md5:2762e18fa74dbb40aa1ad27c6291ee32', + 'uploader_id': '67449889', + 'upload_date': '20190322', + 'timestamp': int, # timestamp is unstable + 'duration': 1082.0, + 'view_count': int, + 'comment_count': int, + }, }] _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' From 6d4932f02347bb1d0228b20798435930022bf316 Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Sun, 18 Apr 2021 01:46:40 +0100 Subject: [PATCH 1308/1705] Try for timestamp, description from window.__INITIAL_DATA__ pages --- youtube_dl/extractor/bbc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 247d982ce..37d427a66 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1205,7 +1205,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], 
dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) From 58babe9af79215bd6bdf07da0a8ebb1d3650e00b Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 30 Nov 2021 05:15:33 +0000 Subject: [PATCH 1309/1705] Support __INITIAL_DATA__ with stringified JSON Add test and fix test for bbcthreeConfig --- youtube_dl/extractor/bbc.py | 50 +++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 37d427a66..088af9823 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..compat import ( compat_HTTPError, compat_parse_qs, compat_str, + compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -395,9 +396,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -775,21 +784,33 @@ class BBCIE(BBCCoUkIE): 'timestamp': 1437785037, 'upload_date': '20150725', }, + }, { + # video with 
window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. 
Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -1162,9 +1183,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: From c820a284a23438f065171b7e222024d01893a95f Mon Sep 17 00:00:00 2001 From: Abdullah Ibn Fulan <ibnfulan@tutanota.de> Date: Tue, 17 Aug 2021 18:22:07 +0600 Subject: [PATCH 1310/1705] [extractor/audiomack] Updated URL regex, corrected invalid testcases, fixed bug Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/audiomack.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index cc7771354..638eb4041 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)' IE_NAME = 
'audiomack' _TESTS = [ # hosted on audiomack @@ -29,25 +29,27 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', - 'info_dict': { - 'id': '258901379', - 'ext': 'mp3', - 'description': 'mamba day freestyle for the legend Kobe Bryant ', - 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', - 'uploader': 'ILOVEMAKONNEN', - 'upload_date': '20160414', - } + 'only_matching': True, + # 'info_dict': { + # 'id': '258901379', + # 'ext': 'mp3', + # 'description': 'mamba day freestyle for the legend Kobe Bryant ', + # 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + # 'uploader': 'ILOVEMAKONNEN', + # 'upload_date': '20160414', + # } }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -79,7 +81,7 @@ class AudiomackAlbumIE(InfoExtractor): # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,24 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. 
G Herbo', } }], 'params': { - 'playliststart': 9, - 'playlistend': 9, + 'playliststart': 2, + 'playlistend': 2, } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +136,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), From 16a3fe2ba6b4c86e60bca930253c81c8efdd676b Mon Sep 17 00:00:00 2001 From: Abdullah Ibn Fulan <54185653+abdullah-if@users.noreply.github.com> Date: Tue, 17 Aug 2021 19:56:39 +0000 Subject: [PATCH 1311/1705] Updated Album URL regex Mistakenly forgot to edit a line in last commit. 
Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 638eb4041..4d1fbad1f 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -75,7 +75,7 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist From ddc080a562cce984ac4a86969f511b1ae59421bf Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Mon, 18 Oct 2021 15:54:26 +0100 Subject: [PATCH 1312/1705] Add ArteTVCategoryIE to support category playlists --- youtube_dl/extractor/arte.py | 47 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 48 insertions(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 03abdbfaf..5bfe57b10 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -252,3 +253,49 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international 
commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + if items: + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title) + if result: + description = self._og_search_description(webpage, default=None) + if description: + result['description'] = description + return result diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e70daf2b1..50b7cb4a0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import ( From 734dfbb4e3ad4ee4d98609dc902ac864b94033a4 Mon Sep 17 00:00:00 2001 From: Seonghyeon Cho Date: Wed, 13 Oct 2021 20:27:40 +0900 Subject: [PATCH 1313/1705] Remove redundant assigning `format_id` --- youtube_dl/extractor/uol.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index 628adf219..59f8e5dc3 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -95,7 +95,6 @@ class UOLIE(InfoExtractor): if v: 
query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id if format_id == 'HLS': m3u8_formats = self._extract_m3u8_formats( f_url, media_id, 'mp4', 'm3u8_native', From 47b0c8697a39bbd64d5b922f81ad74ee4d2a3136 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 7 Feb 2022 13:28:21 +0000 Subject: [PATCH 1314/1705] [ARD] Back-port subtitle extraction from yt-dlp PR 2409 Authored by: fstirlitz Fixes #30543 Closes #17766 (thanks ngdio) --- youtube_dl/extractor/ard.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index d45a9fe52..a5b1f54d5 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -332,9 +332,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), From 825d3426c56aabfc91aea139f2e6e0589f8096bc Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 9 Feb 2022 02:40:34 +0000 Subject: [PATCH 1315/1705] [Nuvid] Use site JSON for video details (#29332) Back-port yt-dlp PR 1022 onto PR #17890 and update Video details aren't in the original HTML now but populated by async JS Co-authored by: u-spec-png Co-authored by: vidaritos --- youtube_dl/extractor/nuvid.py | 120 +++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index ab6bfcd7f..f6c94dd77 100644 --- 
a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -1,71 +1,113 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, + int_or_none, + try_get, + url_or_none, ) +import re + class NuvidIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P[0-9]+)' - _TEST = { - 'url': 'http://m.nuvid.com/video/1310741/', - 'md5': 'eab207b7ac4fccfb4e23c86201f11277', + _TESTS = [{ + 'url': 'https://www.nuvid.com/video/6513023/italian-babe', + 'md5': '772d2f8288f3d3c5c45f7a41761c7844', 'info_dict': { - 'id': '1310741', + 'id': '6513023', 'ext': 'mp4', - 'title': 'Horny babes show their awesome bodeis and', - 'duration': 129, + 'title': 'italian babe', + 'format_id': '360p', + 'duration': 321.0, 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, } - } + }, { + 'url': 'https://m.nuvid.com/video/6523263', + 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', + 'info_dict': { + 'id': '6523263', + 'ext': 'mp4', + 'title': 'Slut brunette college student anal dorm', + 'format_id': '720p', + 'duration': 421.0, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }, { + 'url': 'http://m.nuvid.com/video/6415801/', + 'md5': '638d5ececb138d5753593f751ae3f697', + 'info_dict': { + 'id': '6415801', + 'ext': 'mp4', + 'title': 'My best friend wanted to fuck my wife for a long time', + 'format_id': '720p', + 'duration': 1882, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - page_url = 'http://m.nuvid.com/video/%s' % video_id + qualities = { + 'lq': '360p', + 'hq': '720p', + } + + json_url = 'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0'.format(**locals()) + video_data = self._download_json( + json_url, video_id, headers={ + 'Accept': 
'application/json, text/javascript, */*; q = 0.01', + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) or {} + + # nice to have, not required webpage = self._download_webpage( - page_url, video_id, 'Downloading video page') - # When dwnld_speed exists and has a value larger than the MP4 file's - # bitrate, Nuvid returns the MP4 URL - # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm - self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') - mp4_webpage = self._download_webpage( - page_url, video_id, 'Downloading video page for MP4 format') + 'http://m.nuvid.com/video/%s' % (video_id, ), + video_id, 'Downloading video page', fatal=False) or '' + + title = ( + try_get(video_data, lambda x: x['title'], compat_str) + or self._html_search_regex( + (r''']*?\btitle\s*=\s*(?P"|'|\b)(?P[^"]+)(?P=q)\s*>''', + r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''', + r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''), + webpage, 'title', group='title')).strip() - html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', - video_url = self._html_search_regex(html5_video_re, webpage, video_id) - mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) formats = [{ - 'url': video_url, - }] - if mp4_video_url != video_url: - formats.append({ - 'url': mp4_video_url, - }) + 'url': source, + 'format_id': qualities.get(quality), + 'height': int_or_none(qualities.get(quality)[:-1]), + } for quality, source in video_data.get('files').items() if source] - title = self._html_search_regex( - [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', - r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() + self._check_formats(formats, video_id) + self._sort_formats(formats) + + duration = parse_duration(video_data.get('duration') or 
video_data.get('duration_format')) thumbnails = [ - { - 'url': thumb_url, - } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) + {'url': thumb_url, } + for thumb_url in ( + url_or_none(src) for src in re.findall( + r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', + webpage)) ] - thumbnail = thumbnails[0]['url'] if thumbnails else None - duration = parse_duration(self._html_search_regex( - [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', - r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, + 'formats': formats, 'title': title, + 'thumbnail': url_or_none(video_data.get('poster')), 'thumbnails': thumbnails, - 'thumbnail': thumbnail, 'duration': duration, 'age_limit': 18, - 'formats': formats, } From 266b6ef18520f8de60fa143e154e4b12be12afb7 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 9 Feb 2022 21:21:59 +0000 Subject: [PATCH 1316/1705] [BBC] Also allow PID with leading 'l' (live?) 
--- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 088af9823..378b52f4f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -40,7 +40,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ From 8ff961d10faed848009f9e2ec03fa390b486694d Mon Sep 17 00:00:00 2001 From: kikuyan <kikuyan@users.noreply.github.com> Date: Thu, 23 Dec 2021 11:40:45 +0900 Subject: [PATCH 1317/1705] [extractor/videa] fix extraction in Py2 Fixes #30416 --- youtube_dl/extractor/videa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index ab2c15cde..bdb95891d 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -91,7 +91,7 @@ class VideaIE(InfoExtractor): k = S[(S[i] + S[j]) % 256] res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - return res.decode() + return res.decode('utf-8') def _real_extract(self, url): video_id = self._match_id(url) @@ -121,7 +121,7 @@ class VideaIE(InfoExtractor): compat_b64decode(b64_info), key), video_id) video = xpath_element(info, './video', 'video') - if not video: + if video is None: raise ExtractorError(xpath_element( info, './error', fatal=True), expected=True) sources = xpath_element( From 74f8cc48afa59e1a125f939c060b21654d29789c Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 9 Feb 2022 04:37:28 +0000 Subject: [PATCH 1318/1705] [extractor/videa] Back-port from yt-dlp PRs 463+1028 Authored by: nyuszika7h --- youtube_dl/extractor/videa.py | 53 ++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/videa.py 
b/youtube_dl/extractor/videa.py index bdb95891d..4589e78a1 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -12,6 +12,7 @@ from ..utils import ( mimetype2ext, parse_codecs, update_url_query, + urljoin, xpath_element, xpath_text, ) @@ -19,6 +20,7 @@ from ..compat import ( compat_b64decode, compat_ord, compat_struct_pack, + compat_urlparse, ) @@ -45,10 +47,24 @@ class VideaIE(InfoExtractor): }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', - 'only_matching': True, + 'md5': 'd57ccd8812c7fd491d33b1eab8c99975', + 'info_dict': { + 'id': 'jAHDWfWSJH5XuFhH', + 'ext': 'mp4', + 'title': 'Supercars előzés', + 'thumbnail': r're:^https?://.*', + 'duration': 64, + }, }, { 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', - 'only_matching': True, + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, @@ -95,9 +111,16 @@ class VideaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - query = {'v': video_id} - player_page = self._download_webpage( - 'https://videa.hu/player', video_id, query=query) + video_page = self._download_webpage(url, video_id) + + if 'videa.hu/player' in url: + player_url = url + player_page = video_page + else: + player_url = self._search_regex( + r'<iframe.*?src="(/player\?[^"]+)"', video_page, 'player url') + player_url = urljoin(url, player_url) + player_page = self._download_webpage(player_url, video_id) nonce = self._search_regex( r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') @@ -107,6 +130,7 @@ class VideaIE(InfoExtractor): for i in range(0, 32): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) random_seed = 
''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) query['_s'] = random_seed query['_t'] = result[:16] @@ -127,7 +151,7 @@ class VideaIE(InfoExtractor): sources = xpath_element( info, './video_sources', 'sources', fatal=True) hash_values = xpath_element( - info, './hash_values', 'hash values', fatal=True) + info, './hash_values', 'hash values', fatal=False) title = xpath_text(video, './title', fatal=True) @@ -136,15 +160,16 @@ class VideaIE(InfoExtractor): source_url = source.text source_name = source.get('name') source_exp = source.get('exp') - if not (source_url and source_name and source_exp): + if not (source_url and source_name): continue - hash_value = xpath_text(hash_values, 'hash_value_' + source_name) - if not hash_value: - continue - source_url = update_url_query(source_url, { - 'md5': hash_value, - 'expires': source_exp, - }) + hash_value = ( + xpath_text(hash_values, 'hash_value_' + source_name) + if hash_values is not None else None) + if hash_value and source_exp: + source_url = update_url_query(source_url, { + 'md5': hash_value, + 'expires': source_exp, + }) f = parse_codecs(source.get('codecs')) f.update({ 'url': self._proto_relative_url(source_url), From 29f7bfc4d7a80cecd67c19c25134481fbba6e175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Tue, 11 Jan 2022 17:56:18 +0100 Subject: [PATCH 1319/1705] [streamcz] cherry-pick from yt-dlp Cherry-picked-from: 7d449fff5346 ("[streamcz] Fix extractor (#1616)") --- youtube_dl/extractor/streamcz.py | 157 ++++++++++++++++--------------- 1 file changed, 80 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 58e0b4c80..0191c77de 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,105 +1,108 @@ # coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import time +import json from .common import InfoExtractor from ..utils 
import ( + float_or_none, int_or_none, - sanitized_Request, + parse_codecs, + traverse_obj, + urljoin, ) -def _get_api_key(api_path): - if api_path.endswith('?'): - api_path = api_path[:-1] - - api_key = 'fb5f58a820353bd7095de526253c14fd' - a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) - return hashlib.md5(a.encode('ascii')).hexdigest() - - class StreamCZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' - _API_URL = 'http://www.stream.cz/API' - + _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '934bb6a6d220d99c010783c9719960d5', + 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890', + 'md5': '40c41ade1464a390a0b447e333df4239', 'info_dict': { - 'id': '765767', + 'id': '57953890', 'ext': 'mp4', - 'title': 'Peklo na talíři: Éčka pro děti', - 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', - 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', - 'duration': 256, - }, + 'title': 'Bůh', + 'display_id': 'buh', + 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + } }, { - 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': '849a88c1e1ca47d41403c2ba5e59e261', + 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', + 'md5': '3ee4d0be040e8f4a543e67e509d55e3f', 'info_dict': { - 'id': '10002447', + 'id': '64147267', 'ext': 'mp4', - 'title': 'Kancelář Blaník: Tři roky pro Mazánka', - 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', - 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', - 'duration': 368, - }, + 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. 
Badatelé vše objasnili', + 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', + 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744', + } }] + def _extract_formats(self, spl_url, video): + for ext, pref, streams in ( + ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), + ('mp4', 1, video.get('mp4'))): + for format_id, stream in streams.items(): + if not stream.get('url'): + continue + yield { + 'format_id': f'{format_id}-{ext}', + 'ext': ext, + 'source_preference': pref, + 'url': urljoin(spl_url, stream['url']), + 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), + 'duration': float_or_none(stream.get('duration'), scale=1000), + 'width': traverse_obj(stream, ('resolution', 0)), + 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')), + **parse_codecs(stream.get('codec')), + } + def _real_extract(self, url): - video_id = self._match_id(url) - api_path = '/episode/%s' % video_id + display_id, video_id = self._match_valid_url(url).groups() - req = sanitized_Request(self._API_URL + api_path) - req.add_header('Api-Password', _get_api_key(api_path)) - data = self._download_json(req, video_id) + data = self._download_json( + 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', + data=json.dumps({ + 'variables': {'urlName': video_id}, + 'query': ''' + query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } } + fragment VideoDetailFragmentOnEpisode on Episode { + id + spl + urlName + name + perex + duration + views + }''' + }).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=UTF-8'} + )['data']['episode'] - formats = [] - for quality, video in enumerate(data['video_qualities']): - for f in video['formats']: - typ = f['type'].partition('/')[2] - qlabel = video.get('quality_label') - formats.append({ - 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, - 'format_id': 
'%s-%s' % (typ, f['quality']), - 'url': f['source'], - 'height': int_or_none(f['quality'].rstrip('p')), - 'quality': quality, - }) - self._sort_formats(formats) - - image = data.get('image') - if image: - thumbnail = self._proto_relative_url( - image.replace('{width}', '1240').replace('{height}', '697'), - scheme='http:', - ) - else: - thumbnail = None - - stream = data.get('_embedded', {}).get('stream:show', {}).get('name') - if stream: - title = '%s: %s' % (stream, data['name']) - else: - title = data['name'] + spl_url = data['spl'] + 'spl2,3' + metadata = self._download_json(spl_url, video_id, 'Downloading playlist') + if 'Location' in metadata and 'data' not in metadata: + spl_url = metadata['Location'] + metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist') + video = metadata['data'] subtitles = {} - srt_url = data.get('subtitles_srt') - if srt_url: - subtitles['cs'] = [{ - 'ext': 'srt', - 'url': srt_url, - }] + for subs in video.get('subtitles', {}).values(): + if not subs.get('language'): + continue + for ext, sub_url in subs.get('urls').items(): + subtitles.setdefault(subs['language'], []).append({ + 'ext': ext, + 'url': urljoin(spl_url, sub_url) + }) + + formats = list(self._extract_formats(spl_url, video)) + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': data.get('web_site_text'), - 'duration': int_or_none(data.get('duration')), + 'display_id': display_id, + 'title': data.get('name'), + 'description': data.get('perex'), + 'duration': float_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'formats': formats, 'subtitles': subtitles, } From 8088ce036ac4ce282f8f864c6b5f4f3987647221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 11:55:13 +0100 Subject: [PATCH 1320/1705] revert: use _match_valid_url function --- youtube_dl/extractor/streamcz.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 0191c77de..998342e93 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,5 +1,6 @@ # coding: utf-8 import json +import re from .common import InfoExtractor from ..utils import ( @@ -55,7 +56,7 @@ class StreamCZIE(InfoExtractor): } def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() + display_id, video_id = re.match(self._VALID_URL, url).groups() data = self._download_json( 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', From b1297308fb7b423a60c3a28c74ac014d7b385a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 12:28:30 +0100 Subject: [PATCH 1321/1705] avoid traverse_obj function --- youtube_dl/extractor/streamcz.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 998342e93..fbdc44505 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -7,7 +7,6 @@ from ..utils import ( float_or_none, int_or_none, parse_codecs, - traverse_obj, urljoin, ) @@ -38,7 +37,7 @@ class StreamCZIE(InfoExtractor): def _extract_formats(self, spl_url, video): for ext, pref, streams in ( - ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), + ('ts', -1, video.get('http_stream', {}).get('qualities', {})), ('mp4', 1, video.get('mp4'))): for format_id, stream in streams.items(): if not stream.get('url'): @@ -50,8 +49,8 @@ class StreamCZIE(InfoExtractor): 'url': urljoin(spl_url, stream['url']), 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), 'duration': float_or_none(stream.get('duration'), scale=1000), - 'width': traverse_obj(stream, ('resolution', 0)), - 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')), + 
'width': stream.get('resolution', 2 * [0])[0] or None, + 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), **parse_codecs(stream.get('codec')), } From d02064218be76eba6350a13ccbbc473b1b439570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 12:30:29 +0100 Subject: [PATCH 1322/1705] do not use f-strings --- youtube_dl/extractor/streamcz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index fbdc44505..d1736c023 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -43,7 +43,7 @@ class StreamCZIE(InfoExtractor): if not stream.get('url'): continue yield { - 'format_id': f'{format_id}-{ext}', + 'format_id': '{}-{}'.format(format_id, ext), 'ext': ext, 'source_preference': pref, 'url': urljoin(spl_url, stream['url']), From d8adca1b664fceb07f2b28b55c7e1855407296ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 13:13:20 +0100 Subject: [PATCH 1323/1705] [streamcz] test fixes and one additional test --- youtube_dl/extractor/streamcz.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index d1736c023..60e770448 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -22,6 +22,20 @@ class StreamCZIE(InfoExtractor): 'title': 'Bůh', 'display_id': 'buh', 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + 'duration': 1369.6, + 'view_count': int, + } + }, { + 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', + 'md5': '41fd358000086a1ccdb068c77809b158', + 'info_dict': { + 'id': '64087937', + 'ext': 'mp4', + 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. 
srpna', + 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', + 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', + 'duration': 50.2, + 'view_count': int, } }, { 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', @@ -31,7 +45,9 @@ class StreamCZIE(InfoExtractor): 'ext': 'mp4', 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili', 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', - 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744', + 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', + 'duration': 442.84, + 'view_count': int, } }] From 85bf26c1d01f94b83476703e5c70022f01164ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 15:02:08 +0100 Subject: [PATCH 1324/1705] resolve problem with unpacking operator for <py3.5 --- youtube_dl/extractor/streamcz.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 60e770448..179bdcaba 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, + merge_dicts, parse_codecs, urljoin, ) @@ -58,7 +59,7 @@ class StreamCZIE(InfoExtractor): for format_id, stream in streams.items(): if not stream.get('url'): continue - yield { + yield merge_dicts({ 'format_id': '{}-{}'.format(format_id, ext), 'ext': ext, 'source_preference': pref, @@ -67,8 +68,7 @@ class StreamCZIE(InfoExtractor): 'duration': float_or_none(stream.get('duration'), scale=1000), 'width': stream.get('resolution', 2 * [0])[0] or None, 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), - **parse_codecs(stream.get('codec')), - } + }, parse_codecs(stream.get('codec'))) def 
_real_extract(self, url): display_id, video_id = re.match(self._VALID_URL, url).groups() From bf23bc0489cf304b2a8ab756f2f63b2cfa5586fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= <arkamar@atlas.cz> Date: Sat, 12 Feb 2022 15:27:10 +0100 Subject: [PATCH 1325/1705] add missing __future__ import unicode_literals --- youtube_dl/extractor/streamcz.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 179bdcaba..060ba32e0 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import json import re From 34722270741fb9c06f978861c1e5f503291070d8 Mon Sep 17 00:00:00 2001 From: Vladimir Stavrinov <9163352+vstavrinov@users.noreply.github.com> Date: Mon, 14 Feb 2022 20:54:31 +0300 Subject: [PATCH 1326/1705] [rutv] fix vbr for empty string value (#30623) * [rutv] use str_to_int() (thx dirkf) --- youtube_dl/extractor/rutv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index d2713c19a..05f319396 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -6,7 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none + int_or_none, + str_to_int ) @@ -179,7 +180,7 @@ class RUTVIE(InfoExtractor): 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', - 'vbr': int(quality), + 'vbr': str_to_int(quality), 'preference': preference, } elif transport == 'm3u8': From 782bfd26dbebea60e35f58ab18e218bedbecb782 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" <nao20010128@gmail.com> Date: Thu, 24 Feb 2022 22:34:32 +0900 Subject: [PATCH 1327/1705] [bigo] add support for bigo.tv (#30635) * [bigo] add support for bigo.tv * [bigo] prepend "Bigo says" * title fallback * add error for invalid json data --- 
youtube_dl/extractor/bigo.py | 59 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/bigo.py diff --git a/youtube_dl/extractor/bigo.py b/youtube_dl/extractor/bigo.py new file mode 100644 index 000000000..ddf76ac55 --- /dev/null +++ b/youtube_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git 
a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 50b7cb4a0..c73c4cd6c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -115,6 +115,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, From 923292ba643bf2a5c1fade797bd87a0de4f58d25 Mon Sep 17 00:00:00 2001 From: marieell <marieell@tuta.io> Date: Thu, 10 Feb 2022 10:36:24 +0100 Subject: [PATCH 1328/1705] [aliexpress] Fix test case --- youtube_dl/extractor/aliexpress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', From 1f13ccfd7fcafbfd79ddd652967e02f9eda7ce79 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 24 Feb 2022 18:26:58 +0000 Subject: [PATCH 1329/1705] Fixed groups() call on potentially empty regex search object (#30676) * Fixed groups() call on potentially empty regex search object. 
- https://github.com/ytdl-org/youtube-dl/issues/30521 * minimising lines changed Co-authored-by: yayorbitgum <50963144+yayorbitgum@users.noreply.github.com> --- youtube_dl/extractor/myspass.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index db7ebc94c..f540c52ee 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -35,7 +35,9 @@ class MySpassIE(InfoExtractor): title = xpath_text(metadata, 'title', fatal=True) video_url = xpath_text(metadata, 'url_flv', 'download url', True) video_id_int = int(video_id) - for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + + grps = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url) + for group in grps.groups() if grps else []: group_int = int(group) if group_int > video_id_int: video_url = video_url.replace( From c4d1738316db45e03e0625650b3550334b66ab7f Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 24 Feb 2022 09:16:16 +0000 Subject: [PATCH 1330/1705] [CPAC] Add extractor for Canadian Parliament CPACIE: single episode CPACPlaylistIE: playlists and searches --- youtube_dl/extractor/cpac.py | 148 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/cpac.py diff --git a/youtube_dl/extractor/cpac.py b/youtube_dl/extractor/cpac.py new file mode 100644 index 000000000..22741152c --- /dev/null +++ b/youtube_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = 
r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else 
None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + 
fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c73c4cd6c..7c99cb7e0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -255,6 +255,10 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE From f8e543c9063c1c7ad157936cb6a15b428ddb3896 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 7 Feb 2022 20:06:27 +0000 Subject: [PATCH 1331/1705] [Alsace20TV] Add new extractors Alsace20TVIE, Alsace20TVEmbedIE --- youtube_dl/extractor/alsace20tv.py | 89 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 ++ 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/alsace20tv.py diff --git a/youtube_dl/extractor/alsace20tv.py b/youtube_dl/extractor/alsace20tv.py new file mode 100644 index 000000000..228cec3ec --- /dev/null +++ b/youtube_dl/extractor/alsace20tv.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info['titre'] + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVIE): + _VALID_URL = 
r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7c99cb7e0..535080d0a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -51,6 +51,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE From 4194d253c0b922addf0439228066cb4fb487bac3 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 30 Jul 2021 12:58:19 +0100 Subject: [PATCH 1332/1705] Avoid skipping ID when unlisted_hash is numeric Pattern needed a non-greedy match; also replaced a redundant test with one for this, issue 29690 --- youtube_dl/extractor/vimeo.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0b386f450..a66912502 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -271,7 +271,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:.*?/)?? 
(?: (?: play_redirect_hls| @@ -517,14 +517,28 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, - { - 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, - }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, + }, + { + # similar, but all numeric: ID must be 581039021, not 9603038895 + # issue #29690 + 'url': 'https://vimeo.com/581039021/9603038895', + 'info_dict': { + 'id': '581039021', + # these have to be provided but we don't care + 'ext': 'mp4', + 'timestamp': 1627621014, + 'title': 're:.+', + 'uploader_id': 're:.+', + 'uploader': 're:.+', + 'upload_date': r're:\d+', + }, + 'params': { + 'skip_download': True, + }, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header From 6508688e88c83bb811653083db9351702cd39a6a Mon Sep 17 00:00:00 2001 From: df <fieldhouse@gmx.net> Date: Sun, 1 Aug 2021 09:42:57 +0100 Subject: [PATCH 1333/1705] Make default upload_/release_date a compat_str Ensures download tests pass in Python 2 as well as 3; also add YoutubeDL tests for timestamp -> upload_date etc. 
--- test/test_YoutubeDL.py | 19 +++++++++++++++++++ youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a35effe0e..f8c8e619c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -997,6 +997,25 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'Video') self.assertEqual(downloaded['extractor_key'], 'Video') + def test_default_times(self): + """Test addition of missing upload/release/_date from /release_/timestamp""" + info = { + 'id': '1234', + 'url': TEST_URL, + 'title': 'Title', + 'ext': 'mp4', + 'timestamp': 1631352900, + 'release_timestamp': 1632995931, + } + + params = {'simulate': True, } + ydl = FakeYDL(params) + out_info = ydl.process_ie_result(info) + self.assertTrue(isinstance(out_info['upload_date'], compat_str)) + self.assertEqual(out_info['upload_date'], '20210911') + self.assertTrue(isinstance(out_info['release_date'], compat_str)) + self.assertEqual(out_info['release_date'], '20210930') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..69736acff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1529,7 +1529,7 @@ class YoutubeDL(object): # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) except (ValueError, OverflowError, OSError): pass From 49c5293014bc11ec8c009856cd63cffa6296c1e1 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 22 Feb 2022 11:24:06 +0000 Subject: [PATCH 1334/1705] Ignore --external-downloader-args if --external-downloader was rejected ... 
and generate warning --- youtube_dl/YoutubeDL.py | 11 ++++++++++- youtube_dl/downloader/__init__.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 69736acff..019e309cb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1906,8 +1906,17 @@ class YoutubeDL(object): if not self.params.get('skip_download', False): try: + def checked_get_suitable_downloader(info_dict, params): + ed_args = params.get('external_downloader_args') + dler = get_suitable_downloader(info_dict, params) + if ed_args and not params.get('external_downloader_args'): + # external_downloader_args was cleared because external_downloader was rejected + self.report_warning('Requested external downloader cannot be used: ' + 'ignoring --external-downloader-args.') + return dler + def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) + fd = checked_get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index d8f2fa342..d701d6292 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -50,6 +50,9 @@ def _get_suitable_downloader(info_dict, params={}): ed = get_external_downloader(external_downloader) if ed.can_download(info_dict): return ed + # Avoid using unwanted args since external_downloader was rejected + if params.get('external_downloader_args'): + params['external_downloader_args'] = None protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): From 17d295a1ec6d04362740dd8a0c583690f5ba082a Mon Sep 17 00:00:00 2001 From: lihan7 <lihan7@xiaomi.com> Date: Fri, 25 Mar 2022 15:46:28 +0800 Subject: [PATCH 1335/1705] [extractor/bilibili] Fix path "/audio/auxxxxx" download return 403 --- youtube_dl/extractor/bilibili.py | 5 +++++ 1 
file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index bff6ea194..d42f0e98a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -369,6 +369,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'filesize': int_or_none(play_data.get('size')), }] + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} From 9e5ca66f16998eb2a680e23a6e769e34001898c5 Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Mon, 4 Jan 2021 15:11:47 +0100 Subject: [PATCH 1336/1705] [RAI] Added checks for DRM protected content (PR #27657) reviewed by pukkandan (https://github.com/yt-dlp/yt-dlp/pull/150) --- youtube_dl/extractor/rai.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 67b86fc72..2abe164e0 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -158,6 +158,10 @@ class RaiPlayIE(RaiBaseIE): # subtitles at 'subtitlesArray' key (see #27698) 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -166,6 +170,13 @@ class RaiPlayIE(RaiBaseIE): media = self._download_json( base + '.json', video_id, 'Downloading video JSON') + if try_get( + media, + (lambda x: x['rights_management']['rights']['drm'], + lambda x: x['program_info']['rights_management']['rights']['drm']), + dict): + raise ExtractorError('This video is DRM protected.', expected=True) + title = media['name'] video = media['video'] From 
1f50a07771fddb5f64617617d156bfdd593f951e Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Wed, 27 Jan 2021 12:24:50 +0100 Subject: [PATCH 1337/1705] [RAI] Extend formats with direct http mp4 link (PR #27990) * initial support for creating direct mp4 link * improved regexes and info extraction * added "connection: close" to request headers * updated to https://github.com/yt-dlp/yt-dlp/pull/208 --- youtube_dl/extractor/rai.py | 111 +++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 2abe164e0..7b0315a62 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,15 +5,16 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_str, + compat_urlparse, ) from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, + HEADRequest, int_or_none, parse_duration, remove_start, @@ -96,12 +97,100 @@ class RaiBaseIE(InfoExtractor): if not formats and geoprotection is True: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + formats.extend(self._create_http_urls(relinker_url, formats)) + return dict((k, v) for k, v in { 'is_live': is_live, 'duration': duration, 'formats': formats, }.items() if v is not None) + def _create_http_urls(self, relinker_url, fmts): + _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
+ _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' + _QUALITY = { + # tbr: w, h + '250': [352, 198], + '400': [512, 288], + '700': [512, 288], + '800': [700, 394], + '1200': [736, 414], + '1800': [1024, 576], + '2400': [1280, 720], + '3200': [1440, 810], + '3600': [1440, 810], + '5000': [1920, 1080], + '10000': [1920, 1080], + } + + def test_url(url): + resp = self._request_webpage( + HEADRequest(url), None, headers={'User-Agent': 'Rai'}, + fatal=False, errnote=False, note=False) + + if resp is False: + return False + + if resp.code == 200: + return False if resp.url == url else resp.url + return None + + def get_format_info(tbr): + import math + br = int_or_none(tbr) + if len(fmts) == 1 and not br: + br = fmts[0].get('tbr') + if br > 300: + tbr = compat_str(math.floor(br / 100) * 100) + else: + tbr = '250' + + # try extracting info from available m3u8 formats + format_copy = None + for f in fmts: + if f.get('tbr'): + br_limit = math.floor(br / 100) + if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: + format_copy = f.copy() + return { + 'width': format_copy.get('width'), + 'height': format_copy.get('height'), + 'tbr': format_copy.get('tbr'), + 'vcodec': format_copy.get('vcodec'), + 'acodec': format_copy.get('acodec'), + 'fps': format_copy.get('fps'), + 'format_id': 'https-%s' % tbr, + } if format_copy else { + 'width': _QUALITY[tbr][0], + 'height': _QUALITY[tbr][1], + 'format_id': 'https-%s' % tbr, + 'tbr': int(tbr), + } + + loc = test_url(_MP4_TMPL % (relinker_url, '*')) + if not isinstance(loc, compat_str): + return [] + + mobj = re.match( + _RELINKER_REG, + test_url(relinker_url) or '') + if not mobj: + return [] + + available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] + available_qualities = [i for i in available_qualities if i] + + formats = [] + for q in available_qualities: + fmt = { + 'url': _MP4_TMPL % (relinker_url, q), + 'protocol': 'https', + 'ext': 'mp4', + } + fmt.update(get_format_info(q)) + 
formats.append(fmt) + return formats + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' @@ -151,6 +240,22 @@ class RaiPlayIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # 1080p direct mp4 url + 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', + 'md5': '2e501e8651d72f05ffe8f5d286ad560b', + 'info_dict': { + 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', + 'ext': 'mp4', + 'title': 'Leonardo - S1E1', + 'alt_title': 'St 1 Ep 1 - Episodio 1', + 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 1', + 'duration': 3229, + 'series': 'Leonardo', + 'season': 'Season 1', + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -318,7 +423,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', + 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', From 871645a4a4a0e12ec8f7bf78a3ad7bf75838ee5c Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Sat, 2 Apr 2022 07:57:56 +0200 Subject: [PATCH 1338/1705] [RAI] Fix extraction of http formats From https://github.com/yt-dlp/yt-dlp/pull/3272 Closes https://github.com/yt-dlp/yt-dlp/issues/3270 Authored by: nixxo --- youtube_dl/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7b0315a62..563d3400f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -106,7 +106,7 @@ class RaiBaseIE(InfoExtractor): }.items() if v is not None) def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = 
r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' + _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h From b764dbe7730bc5b0a4f30f4f89fd85e096d0c4a0 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 10 Apr 2022 05:49:09 +0100 Subject: [PATCH 1339/1705] Disable blank issues --- .github/ISSUE_TEMPLATE/config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false From a0068bd6bec16008bda7a39caecccbf84881c603 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 15 Apr 2022 16:07:09 +0100 Subject: [PATCH 1340/1705] [Youtube] Fix "n" descrambling for player fae06c11 Resolves #30856. --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 41695a561..ff6c7b0f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1464,15 +1464,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 def _extract_n_function_name(self, jscode): - target = r'(?P<nfunc>[a-zA-Z0-9$]{3})(?:\[(?P<idx>\d+)\])?' + target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?' 
nfunc_and_idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ), + r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ), jscode, 'Initial JS player n function name') nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') if not idx: return nfunc return self._parse_json(self._search_regex( - r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode, + r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): From ebc627847cd1f5faddf4bd90376c1635777283cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81rni=20Dagur?= <arni@dagur.eu> Date: Thu, 28 Apr 2022 11:18:10 +0200 Subject: [PATCH 1341/1705] [KTH] Add new extractor for KTH play (#30885) * Implement extractor for KTH play * Make KTH Play url regex more relaxed --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kaltura.py | 2 +- youtube_dl/extractor/kth.py | 31 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/kth.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 535080d0a..452caeade 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import KonserthusetPlayIE from .krasview import KrasViewIE +from .kth import KTHIE from .ku6 import Ku6IE from .kusi import KUSIIE from .kuwo import ( diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index c731612c4..6d4d93394 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor): 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), 'uploader_id': info.get('userId') if info.get('userId') != 
'None' else None, - 'view_count': info.get('plays'), + 'view_count': int_or_none(info.get('plays')), } diff --git a/youtube_dl/extractor/kth.py b/youtube_dl/extractor/kth.py new file mode 100644 index 000000000..b8db461f5 --- /dev/null +++ b/youtube_dl/extractor/kth.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class KTHIE(InfoExtractor): + _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)' + _TEST = { + 'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9', + 'md5': 'd83ada6d00ca98b73243a88efe19e8a6', + 'info_dict': { + 'id': '0_uoop6oz9', + 'ext': 'mp4', + 'title': 'md5:bd1d6931facb6828762a33e6ce865f37', + 'thumbnail': 're:https?://.+/thumbnail/.+', + 'duration': 3516, + 'timestamp': 1647345358, + 'upload_date': '20220315', + 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self.url_result( + smuggle_url('kaltura:308:%s' % video_id, { + 'service_url': 'https://api.kaltura.nordu.net'}), + 'Kaltura') + return result From e27d8d819fa69d5714ea1682a1d5d56f617461fc Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 29 Apr 2022 13:36:02 +0100 Subject: [PATCH 1342/1705] [streamcz] Remove empty `'{}'.format()` for Py2.6 Use `'-'.join()` here, or `{0}`, ..., in general. 
--- youtube_dl/extractor/streamcz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 060ba32e0..97b2eb7f8 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -62,7 +62,7 @@ class StreamCZIE(InfoExtractor): if not stream.get('url'): continue yield merge_dicts({ - 'format_id': '{}-{}'.format(format_id, ext), + 'format_id': '-'.join((format_id, ext)), 'ext': ext, 'source_preference': pref, 'url': urljoin(spl_url, stream['url']), From e988fa4523e489596a2a27c4d45275e44db49406 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 28 Apr 2022 15:25:49 +0100 Subject: [PATCH 1343/1705] [doc] Clarify test naming --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2841ed68f..cd888c731 100644 --- a/README.md +++ b/README.md @@ -1069,9 +1069,11 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. 
Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test (actually, test case) then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note: + * the test names use the extractor class name **without the trailing `IE`** + * tests with `only_matching` key in test's dict are not counted. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +8. 
Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py From c7965b9fc2cae54f244f31f5373cb81a40e822ab Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 9 May 2022 18:54:41 +0100 Subject: [PATCH 1344/1705] [NHK] Support alphabetic characters in 7-char NhkVod IDs (#29682) --- youtube_dl/extractor/nhk.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 8a9331a79..46a800e7e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -23,7 +24,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() - if episode_id.isdigit(): + if len(episode_id) == 7: episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' @@ -84,7 +85,8 @@ class NhkBaseIE(InfoExtractor): class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg + _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
_TESTS = [{ @@ -124,6 +126,19 @@ class NhkVodIE(NhkBaseIE): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, + }, { + # video, alphabetic character in ID #29670 + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', + 'only_matching': True, + 'info_dict': { + 'id': 'qfjay6cg', + 'ext': 'mp4', + 'title': 'DESIGN TALKS plus - Fishermen’s Finery', + 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448', + 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', + 'upload_date': '20210615', + 'timestamp': 1623722008, + } }] def _real_extract(self, url): From c3deca86aedd2d8ab7cd0c596fd68b7aeb7c042d Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 19 May 2022 17:41:48 +0000 Subject: [PATCH 1345/1705] [wat.tv] Add version `pver` to metadata API call Resolves #30959. --- youtube_dl/extractor/wat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f1bccc2d6..b15e03768 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -57,7 +57,7 @@ class WatIE(InfoExtractor): # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1'}) + video_id, query={'context': 'MYTF1', 'pver': '4001000'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') From be35e5343a6c31f5f32ee216ab4486a1992260c5 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Wed, 13 Apr 2022 07:21:23 -0500 Subject: [PATCH 1346/1705] Update options.py --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 0a0641bd4..6521ad881 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -270,11 +270,11 @@ def 
parseOpts(overrideArguments=None): selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', - help='Download only matching titles (regex or caseless sub-string)') + help='Download only matching titles (case-insensitive regex or sub-string)') selection.add_option( '--reject-title', dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (regex or caseless sub-string)') + help='Skip download for matching titles (case-insensitive regex or sub-string)') selection.add_option( '--max-downloads', dest='max_downloads', metavar='NUMBER', type=int, default=None, From 187a48aee29847664e0c4cd80fe90c32e1fb334b Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 24 May 2022 15:33:00 +0100 Subject: [PATCH 1347/1705] [YouTube] Handle player c5a4daa1 with indirect n-function definition * resolves #30976 --- youtube_dl/extractor/youtube.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ff6c7b0f8..9c62b8890 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1471,9 +1471,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') if not idx: return nfunc + if int_or_none(idx) == 0: + real_nfunc = self._search_regex( + r'var %s\s*=\s*\[([a-zA-Z_$][\w$]*)\];' % (re.escape(nfunc), ), jscode, + 'Initial JS player n function alias ({nfunc}[{idx}])'.format(**locals())) + if real_nfunc: + return real_nfunc return self._parse_json(self._search_regex( r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, - 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] + 'Initial JS player n function name ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = 
self._extract_player_info(player_url) @@ -1482,7 +1488,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if func_code: jsi = JSInterpreter(func_code) else: - player_id = self._extract_player_info(player_url) jscode = self._get_player_code(video_id, player_url, player_id) funcname = self._extract_n_function_name(jscode) jsi = JSInterpreter(jscode) From 52c3751df722ab6f31f0229a415c7389a95c2307 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sat, 28 May 2022 13:52:51 +0100 Subject: [PATCH 1348/1705] [utils] Enable ALPN in HTTPS to satisfy broken servers See https://github.com/yt-dlp/yt-dlp/issues/3878 --- youtube_dl/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e722eed58..4ff27db3d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2292,12 +2292,30 @@ def formatSeconds(secs): def make_HTTPS_handler(params, **kwargs): + + # https://www.rfc-editor.org/info/rfc7301 + ALPN_PROTOCOLS = ['http/1.1'] + + def set_alpn_protocols(ctx): + # From https://github.com/yt-dlp/yt-dlp/commit/2c6dcb65fb612fc5bc5c61937bf438d3c473d8d0 + # Thanks @coletdjnz + # Some servers may (wrongly) reject requests if ALPN extension is not sent. 
See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + try: + ctx.set_alpn_protocols(ALPN_PROTOCOLS) + except (AttributeError, NotImplementedError): + # Python < 2.7.10, not ssl.HAS_ALPN + pass + opts_no_check_certificate = params.get('nocheckcertificate', False) if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) + set_alpn_protocols(context) if opts_no_check_certificate: context.check_hostname = False context.verify_mode = ssl.CERT_NONE + try: return YoutubeDLHTTPSHandler(params, context=context, **kwargs) except TypeError: @@ -2313,6 +2331,7 @@ def make_HTTPS_handler(params, **kwargs): if opts_no_check_certificate else ssl.CERT_REQUIRED) context.set_default_verify_paths() + set_alpn_protocols(context) return YoutubeDLHTTPSHandler(params, context=context, **kwargs) From 04fd3289d30de3c99c7d2de34d555b050bc96d4d Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sat, 28 May 2022 13:54:32 +0100 Subject: [PATCH 1349/1705] [YouPorn] Improve `upload_date` extraction See https://github.com/yt-dlp/yt-dlp/issues/2701#issuecomment-1034341883 --- youtube_dl/extractor/youporn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7084d3d12..31e8abb72 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -137,9 +138,10 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*<span>([^<]+)', + (r'UPLOADED:\s*<span>([^<]+)', r'Date\s+[Aa]dded:\s*<span>([^<]+)', - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], + 
r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''', + r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'), webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) From 9aa8e5340f3d5ece372b983f8e399277ca1f1fe4 Mon Sep 17 00:00:00 2001 From: LewdyCoder <88900506+LewdyCoder@users.noreply.github.com> Date: Mon, 30 May 2022 03:50:50 +0200 Subject: [PATCH 1350/1705] [Readme] Clarified extractor naming (#29799) * Exported usable extractors must be named `xxxxIE` Co-authored-by: dirkf <fieldhouse@gmx.net> --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 58ab3a4b8..ff40cef78 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ After you have ensured this site is distributing its content legally, you can fo # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). This makes the extractor available for use, as long as the class ends with `IE`. 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. 
Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): From 1baa0f5f6678c047624785dc9a3ab3cb44a72809 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 29 Apr 2021 04:56:09 +0530 Subject: [PATCH 1351/1705] [utils] Escape URL while sanitizing Closes #31008, #yt-dlp/263 While this fixes the issue in question, it does not try to address the root-cause of the problem Refer: 915f911e365736227e134ad654601443dbfd7ccb, f5fa042c82300218a2d07b95dd6b9c0756745db3 --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4ff27db3d..8aa2a43a2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2151,7 +2151,7 @@ def sanitize_url(url): for mistake, fixup in COMMON_TYPOS: if re.match(mistake, url): return re.sub(mistake, fixup, url) - return url + return escape_url(url) def sanitized_Request(url, *args, **kwargs): From 530f4582d011cd94986cf4d233f9fb9263f72150 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 6 Jun 2022 19:29:48 +0100 Subject: [PATCH 1352/1705] [HRFernsehen] Back-port new extractor from yt-dlp Closes #26445, where this was originally proposed. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hrfernsehen.py | 101 ++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/hrfernsehen.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 452caeade..751fc38b6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -479,6 +479,7 @@ from .hotstar import ( ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrfernsehen import HRFernsehenIE from .hrti import ( HRTiIE, HRTiPlaylistIE, diff --git a/youtube_dl/extractor/hrfernsehen.py b/youtube_dl/extractor/hrfernsehen.py new file mode 100644 index 000000000..11b879dbd --- /dev/null +++ b/youtube_dl/extractor/hrfernsehen.py @@ -0,0 +1,101 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import json +import re + +from ..utils import ( + int_or_none, + unified_timestamp, + unescapeHTML +) +from .common import InfoExtractor + + +class HRFernsehenIE(InfoExtractor): + IE_NAME = 'hrfernsehen' + _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' + + _TESTS = [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', + 'md5': '5c4e0ba94677c516a2f65a84110fc536', + 'info_dict': { + 'id': '130546', + 'ext': 'mp4', + 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / ' + 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' + 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', + 'subtitles': {'de': [{ + 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + }]}, + 'timestamp': 1598470200, + 'upload_date': '20200826', + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', + 'title': 'hessenschau vom 26.08.2020' + } + }, { + 
'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', + 'only_matching': True + }] + + _GEO_COUNTRIES = ['DE'] + + def extract_airdate(self, loader_data): + airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') + + if airdate_str is None: + return None + + return unified_timestamp(airdate_str) + + def extract_formats(self, loader_data): + stream_formats = [] + for stream_obj in loader_data["videoResolutionLevels"]: + stream_format = { + 'format_id': str(stream_obj['verticalResolution']) + "p", + 'height': stream_obj['verticalResolution'], + 'url': stream_obj['url'], + } + + quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', + stream_obj['url']) + if quality_information: + stream_format['width'] = int_or_none(quality_information.group(1)) + stream_format['height'] = int_or_none(quality_information.group(2)) + stream_format['fps'] = int_or_none(quality_information.group(3)) + stream_format['tbr'] = int_or_none(quality_information.group(4)) + + stream_formats.append(stream_format) + + self._sort_formats(stream_formats) + return stream_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage) + description = self._html_search_meta( + ['description'], webpage) + + loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_data = json.loads(loader_str) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': self.extract_formats(loader_data), + 'timestamp': self.extract_airdate(loader_data) + } + + if "subtitle" in loader_data: + info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} + + thumbnails = list(set([t for t in loader_data.get("previewImageUrl", 
{}).values()])) + if len(thumbnails) > 0: + info["thumbnails"] = [{"url": t} for t in thumbnails] + + return info From ef044be34bb64c489558dd07818616b514d2e2ad Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 8 Jun 2022 15:52:21 +0100 Subject: [PATCH 1353/1705] [test] Skip not _WORKING IE in subtitle tests; use unittest.skipTest throughout --- test/test_download.py | 7 +++---- test/test_subtitles.py | 3 +++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 8e43cfa12..0951a171a 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -88,7 +88,6 @@ class TestDownload(unittest.TestCase): # Dynamically generate tests - def generator(test_case, tname): def test_template(self): @@ -100,9 +99,10 @@ def generator(test_case, tname): def print_skipping(reason): print('Skipping %s: %s' % (test_case['name'], reason)) + self.skipTest(reason) + if not ie.working(): print_skipping('IE marked as not _WORKING') - return for tc in test_cases: info_dict = tc.get('info_dict', {}) @@ -111,11 +111,10 @@ def generator(test_case, tname): if 'skip' in test_case: print_skipping(test_case['skip']) - return + for other_ie in other_ies: if not other_ie.working(): print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) - return params = get_params(test_case.get('params', {})) params['outtmpl'] = tname + '_' + params['outtmpl'] diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 550e0ca00..c250473be 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -38,6 +38,9 @@ class BaseTestSubtitles(unittest.TestCase): self.DL = FakeYDL() self.ie = self.IE() self.DL.add_info_extractor(self.ie) + if not self.IE.working(): + print('Skipping: %s marked as not _WORKING' % self.IE.ie_key()) + self.skipTest('IE marked as not _WORKING') def getInfoDict(self): info_dict = self.DL.extract_info(self.url, download=False) From 3aa94d7945dfaa0e04acf2700ffe0e43b00db498 
Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 8 Jun 2022 23:11:33 +0100 Subject: [PATCH 1354/1705] [test] Fix workable subtitle tests (except YT) and mark others as skip, broken * broken tests need to be fixed when fixing the respective IE --- test/test_subtitles.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c250473be..23cf06e09 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -131,6 +131,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE @@ -155,18 +156,19 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') + self.assertEqual(md5(subtitles['en']), '386cbc9320b94e25cb364b97935e5dd1') + self.assertEqual(md5(subtitles['fr']), 'c9b69eef35bc6641c0d4da8a04f9dfac') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://vimeo.com/56015672' + self.url = 'http://vimeo.com/68093876' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE @@ -188,6 +190,7 @@ class TestWallaSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = 
CeskaTelevizeIE @@ -209,6 +212,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestLyndaSubtitles(BaseTestSubtitles): url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' IE = LyndaIE @@ -221,6 +225,7 @@ class TestLyndaSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') +@unittest.skip('IE broken') class TestNPOSubtitles(BaseTestSubtitles): url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' IE = NPOIE @@ -233,6 +238,7 @@ class TestNPOSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') +@unittest.skip('IE broken') class TestMTVSubtitles(BaseTestSubtitles): url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE @@ -256,8 +262,8 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') + self.assertEqual(set(subtitles.keys()), set(['nb-ttv'])) + self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149') class TestRaiPlaySubtitles(BaseTestSubtitles): @@ -280,6 +286,7 @@ class TestRaiPlaySubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') +@unittest.skip('IE broken - DRM only') class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' IE = VikiIE @@ -306,6 +313,7 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +@unittest.skip('IE broken') class TestThePlatformFeedSubtitles(BaseTestSubtitles): url = 
'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' IE = ThePlatformFeedIE @@ -341,7 +349,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') def test_subtitles_in_page(self): self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' @@ -349,7 +357,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') if __name__ == '__main__': From 811c480f7b6c25ca510a033e6365d00174135392 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 9 Jun 2022 15:25:23 +0100 Subject: [PATCH 1355/1705] [YouTube] Support JSON3 subtitle format * subtitle tests updated to match --- test/test_subtitles.py | 74 ++++++++++++++++++++++++--------- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 23cf06e09..4cbc69ccd 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -59,6 +59,21 @@ class BaseTestSubtitles(unittest.TestCase): class TestYoutubeSubtitles(BaseTestSubtitles): + # Available subtitles for QRS8MkLhQmM: + # Language formats + # ru vtt, ttml, srv3, srv2, srv1, json3 + # fr vtt, ttml, srv3, srv2, srv1, json3 + # en vtt, ttml, srv3, srv2, srv1, json3 + # nl vtt, ttml, srv3, srv2, srv1, json3 + # de vtt, ttml, srv3, srv2, srv1, json3 + # ko vtt, ttml, srv3, srv2, srv1, json3 + # it vtt, ttml, srv3, srv2, srv1, 
json3 + # zh-Hant vtt, ttml, srv3, srv2, srv1, json3 + # hi vtt, ttml, srv3, srv2, srv1, json3 + # pt-BR vtt, ttml, srv3, srv2, srv1, json3 + # es-MX vtt, ttml, srv3, srv2, srv1, json3 + # ja vtt, ttml, srv3, srv2, srv1, json3 + # pl vtt, ttml, srv3, srv2, srv1, json3 url = 'QRS8MkLhQmM' IE = YoutubeIE @@ -67,41 +82,60 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) - self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') + self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d') + self.assertEqual(md5(subtitles['it']), '0e0b667ba68411d88fd1c5f4f4eab2f9') for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - def test_youtube_subtitles_ttml_format(self): + def _test_subtitles_format(self, fmt, md5_hash, lang='en'): self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'ttml' + self.DL.params['subtitlesformat'] = fmt subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') + self.assertEqual(md5(subtitles[lang]), md5_hash) + + def test_youtube_subtitles_ttml_format(self): + self._test_subtitles_format('ttml', 'c97ddf1217390906fa9fbd34901f3da2') def test_youtube_subtitles_vtt_format(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'vtt' + self._test_subtitles_format('vtt', 'ae1bd34126571a77aabd4d276b28044d') + + def test_youtube_subtitles_json3_format(self): + self._test_subtitles_format('json3', '688dd1ce0981683867e7fe6fde2a224b') + + def _test_automatic_captions(self, url, lang): + self.url = url + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslangs'] = [lang] subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 
'3cb210999d3e021bd6c7f0ea751eab06') + self.assertTrue(subtitles[lang] is not None) def test_youtube_automatic_captions(self): - self.url = '8YoUxe5ncPo' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + # Available automatic captions for 8YoUxe5ncPo: + # Language formats (all in vtt, ttml, srv3, srv2, srv1, json3) + # gu, zh-Hans, zh-Hant, gd, ga, gl, lb, la, lo, tt, tr, + # lv, lt, tk, th, tg, te, fil, haw, yi, ceb, yo, de, da, + # el, eo, en, eu, et, es, ru, rw, ro, bn, be, bg, uk, jv, + # bs, ja, or, xh, co, ca, cy, cs, ps, pt, pa, vi, pl, hy, + # hr, ht, hu, hmn, hi, ha, mg, uz, ml, mn, mi, mk, ur, + # mt, ms, mr, ug, ta, my, af, sw, is, am, + # *it*, iw, sv, ar, + # su, zu, az, id, ig, nl, no, ne, ny, fr, ku, fy, fa, fi, + # ka, kk, sr, sq, ko, kn, km, st, sk, si, so, sn, sm, sl, + # ky, sd + # ... + self._test_automatic_captions('8YoUxe5ncPo', 'it') + @unittest.skip('ASR subs all in all supported langs now') def test_youtube_translated_subtitles(self): - # This video has a subtitles track, which can be translated - self.url = 'Ky9eprVWzlI' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + # This video has a subtitles track, which can be translated (#4555) + self._test_automatic_captions('Ky9eprVWzlI', 'it') def test_youtube_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'n5BB19UTcdA' + # Available automatic captions for 8YoUxe5ncPo: + # ... 
+ # 8YoUxe5ncPo has no subtitles + self.url = '8YoUxe5ncPo' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9c62b8890..91a3b6058 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -499,7 +499,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False From 0700fde6403aa9eec1ff02bff7323696a205900c Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Sat, 9 Jan 2021 17:56:12 +0530 Subject: [PATCH 1356/1705] [utils, etc] Kill child processes when yt-dl is killed * derived from PR #26592, closes #26592 Authored by: Unrud --- youtube_dl/YoutubeDL.py | 3 ++- youtube_dl/compat.py | 3 ++- youtube_dl/downloader/external.py | 16 ++++++++++------ youtube_dl/downloader/rtmp.py | 10 ++++++---- youtube_dl/extractor/openload.py | 3 ++- youtube_dl/postprocessor/embedthumbnail.py | 5 +++-- youtube_dl/postprocessor/ffmpeg.py | 5 +++-- youtube_dl/utils.py | 18 ++++++++++++++---- 8 files changed, 42 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 019e309cb..3895b408f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -73,6 +73,7 @@ from .utils import ( PostProcessingError, preferredencoding, prepend_extension, + process_communicate_or_kill, register_socks_protocols, render_table, replace_extension, @@ -2323,7 +2324,7 @@ class YoutubeDL(object): ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__))) - out, err 
= sp.communicate() + out, err = process_communicate_or_kill(sp) out = out.decode().strip() if re.match('[0-9a-f]+', out): self._write_string('[debug] Git HEAD: ' + out + '\n') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2004a405a..9f5f85dae 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2890,6 +2890,7 @@ else: _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) def compat_get_terminal_size(fallback=(80, 24)): + from .utils import process_communicate_or_kill columns = compat_getenv('COLUMNS') if columns: columns = int(columns) @@ -2906,7 +2907,7 @@ else: sp = subprocess.Popen( ['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() + out, err = process_communicate_or_kill(sp) _lines, _columns = map(int, out.split()) except Exception: _columns, _lines = _terminal_size(*fallback) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index c31f8910a..a06ab2e50 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -22,6 +22,7 @@ from ..utils import ( handle_youtubedl_headers, check_executable, is_outdated_version, + process_communicate_or_kill, ) @@ -104,7 +105,7 @@ class ExternalFD(FileDownloader): p = subprocess.Popen( cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate() + _, stderr = process_communicate_or_kill(p) if p.returncode != 0: self.to_stderr(stderr.decode('utf-8', 'replace')) return p.returncode @@ -141,7 +142,7 @@ class CurlFD(ExternalFD): # curl writes the progress to stderr so don't capture it. 
p = subprocess.Popen(cmd) - p.communicate() + process_communicate_or_kill(p) return p.returncode @@ -336,14 +337,17 @@ class FFmpegFD(ExternalFD): proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the + except BaseException as e: + # subprocess.run would send the SIGKILL signal to ffmpeg and the # mp4 file couldn't be played, but if we ask ffmpeg to quit it # produces a file that is playable (this is mostly useful for live # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32': + process_communicate_or_kill(proc, b'q') + else: + proc.kill() + proc.wait() raise return retval diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index fbb7f51b0..8a25dbc8d 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -89,11 +89,13 @@ class RtmpFD(FileDownloader): self.to_screen('') cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - finally: + if not cursor_in_new_line: + self.to_screen('') + return proc.wait() + except BaseException: # Including KeyboardInterrupt + proc.kill() proc.wait() - if not cursor_in_new_line: - self.to_screen('') - return proc.returncode + raise url = info_dict['url'] player_url = info_dict.get('player_url') diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 0c20d0177..b05d60435 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, get_exe_version, is_outdated_version, + process_communicate_or_kill, std_headers, ) @@ -226,7 +227,7 @@ class PhantomJSwrapper(object): self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name ], 
stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate() + out, err = process_communicate_or_kill(p) if p.returncode != 0: raise ExtractorError( 'Executing JS failed\n:' + encodeArgument(err)) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 3990908b6..5e7b6e2df 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -13,8 +13,9 @@ from ..utils import ( encodeFilename, PostProcessingError, prepend_extension, + process_communicate_or_kill, replace_extension, - shell_quote + shell_quote, ) @@ -109,7 +110,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) if p.returncode != 0: msg = stderr.decode('utf-8', 'replace').strip() diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 9f76c9d4e..8c29c8d59 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -16,6 +16,7 @@ from ..utils import ( is_outdated_version, PostProcessingError, prepend_extension, + process_communicate_or_kill, shell_quote, subtitles_filename, dfxp2srt, @@ -180,7 +181,7 @@ class FFmpegPostProcessor(PostProcessor): handle = subprocess.Popen( cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, stdin=subprocess.PIPE) - stdout_data, stderr_data = handle.communicate() + stdout_data, stderr_data = process_communicate_or_kill(handle) expected_ret = 0 if self.probe_available else 1 if handle.wait() != expected_ret: return None @@ -228,7 +229,7 @@ class FFmpegPostProcessor(PostProcessor): if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') msgs = stderr.strip().split('\n') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8aa2a43a2..4e00317f1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2212,6 +2212,15 @@ def unescapeHTML(s): r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) +def process_communicate_or_kill(p, *args, **kwargs): + try: + return p.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + p.kill() + p.wait() + raise + + def get_subprocess_encoding(): if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # For subprocess calls, encode with locale encoding @@ -3788,7 +3797,8 @@ def check_executable(exe, args=[]): """ Checks if the given binary is installed somewhere in PATH, and returns its name. args can be a list of arguments for a short output (like -version) """ try: - subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + process_communicate_or_kill(subprocess.Popen( + [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)) except OSError: return False return exe @@ -3802,10 +3812,10 @@ def get_exe_version(exe, args=['--version'], # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if youtube-dl is run in the background. 
# See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - out, _ = subprocess.Popen( + out, _ = process_communicate_or_kill(subprocess.Popen( [encodeArgument(exe)] + args, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() + stdout=subprocess.PIPE, stderr=subprocess.STDOUT)) except OSError: return False if isinstance(out, bytes): # Python 2.x @@ -5744,7 +5754,7 @@ def write_xattr(path, key, value): cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) except EnvironmentError as e: raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) stderr = stderr.decode('utf-8', 'replace') if p.returncode != 0: raise XAttrMetadataError(p.returncode, stderr) From cc179df346abf34c8f77dbb221b839092007f20c Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 12 Jun 2022 14:10:38 +0100 Subject: [PATCH 1357/1705] [XHamster] Support xhday.com alias, extract `uploader_id` * support xhday.com alias for xhamster.com (resolves #31023) Authored by: dirkf * extract `uploader_id`: from https://github.com/yt-dlp/yt-dlp/commit/908b56eaf7872149706dbd7fa071f838d0c786b7 (PR https://github.com/yt-dlp/yt-dlp/pull/844) Authored by: octotherp --- youtube_dl/extractor/xhamster.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f73b9778f..f764021ba 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import itertools @@ -23,7 +24,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)' _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ @@ -34,7 +35,7 @@ class 
XHamsterIE(InfoExtractor): ''' % _DOMAINS _TESTS = [{ 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'md5': '98b4687efb1ffd331c4197854dc09e8f', + 'md5': '34e1ab926db5dc2750fed9e1f34304bb', 'info_dict': { 'id': '1509445', 'display_id': 'femaleagent-shy-beauty-takes-the-bait', @@ -43,6 +44,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1350194821, 'upload_date': '20121014', 'uploader': 'Ruseful2011', + 'uploader_id': 'ruseful2011', 'duration': 893, 'age_limit': 18, }, @@ -72,6 +74,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1454948101, 'upload_date': '20160208', 'uploader': 'parejafree', + 'uploader_id': 'parejafree', 'duration': 72, 'age_limit': 18, }, @@ -117,6 +120,9 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', 'only_matching': True, + }, { + 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', + 'only_matching': True, }] def _real_extract(self, url): @@ -245,6 +251,7 @@ class XHamsterIE(InfoExtractor): else: categories = None + uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL'])) return { 'id': video_id, 'display_id': display_id, @@ -253,6 +260,8 @@ class XHamsterIE(InfoExtractor): 'timestamp': int_or_none(video.get('created')), 'uploader': try_get( video, lambda x: x['author']['name'], compat_str), + 'uploader_url': uploader_url, + 'uploader_id': uploader_url.split('/')[-1] if uploader_url else None, 'thumbnail': video.get('thumbURL'), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), @@ -261,7 +270,7 @@ class XHamsterIE(InfoExtractor): 'dislike_count': int_or_none(try_get( video, lambda x: x['rating']['dislikes'], int)), 'comment_count': int_or_none(video.get('views')), - 'age_limit': age_limit, + 'age_limit': age_limit if age_limit is not None else 18, 'categories': categories, 'formats': formats, } @@ -352,6 +361,7 @@ class 
XHamsterIE(InfoExtractor): 'description': description, 'upload_date': upload_date, 'uploader': uploader, + 'uploader_id': uploader.lower() if uploader else None, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, @@ -420,6 +430,9 @@ class XHamsterUserIE(InfoExtractor): 'id': 'firatkaan', }, 'playlist_mincount': 1, + }, { + 'url': 'https://xhday.com/users/mobhunter', + 'only_matching': True, }] def _entries(self, user_id): From 11665dd2367a2eefd1ad090828f987fef11226e4 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 15 Jun 2022 18:26:54 +0100 Subject: [PATCH 1358/1705] [test] Fix linter for 3aa94d7945dfaa0e04acf2700ffe0e43b00db498 --- test/test_download.py | 1 + test/test_subtitles.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 0951a171a..6a6673bc2 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -88,6 +88,7 @@ class TestDownload(unittest.TestCase): # Dynamically generate tests + def generator(test_case, tname): def test_template(self): diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 4cbc69ccd..1197721ff 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -117,7 +117,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): # el, eo, en, eu, et, es, ru, rw, ro, bn, be, bg, uk, jv, # bs, ja, or, xh, co, ca, cy, cs, ps, pt, pa, vi, pl, hy, # hr, ht, hu, hmn, hi, ha, mg, uz, ml, mn, mi, mk, ur, - # mt, ms, mr, ug, ta, my, af, sw, is, am, + # mt, ms, mr, ug, ta, my, af, sw, is, am, # *it*, iw, sv, ar, # su, zu, az, id, ig, nl, no, ne, ny, fr, ku, fy, fa, fi, # ka, kk, sr, sq, ko, kn, km, st, sk, si, so, sn, sm, sl, From 8a158a936c8b002ef536e9e2b778ded02c09c0fa Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 14 Jun 2022 19:45:34 +0100 Subject: [PATCH 1359/1705] [NHK] Use new API URL --- youtube_dl/extractor/nhk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 46a800e7e..f43d91cd5 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -8,7 +8,7 @@ from ..utils import urljoin class NhkBaseIE(InfoExtractor): - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' _TYPE_REGEX = r'/(?P<type>video|audio)/' From a03b9775d544b06a5b4f2aa630214c7c22fc2229 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 26 Jun 2022 14:18:33 +0100 Subject: [PATCH 1360/1705] [Mediaset] Support player version number in URL pattern Ref: https://github.com/yt-dlp/yt-dlp/issues/4141 --- youtube_dl/extractor/mediaset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 2c16fc9e2..20048c6ab 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -24,7 +24,7 @@ class MediasetIE(ThePlatformBaseIE): (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| - player/index\.html\?.*?\bprogramGuid= + player(?:/v\d+)?/index\.html\?.*?\bprogramGuid= ) )(?P<id>[0-9A-Z]{16,}) ''' @@ -73,6 +73,10 @@ class MediasetIE(ThePlatformBaseIE): # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, + }, { + # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/) + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/', 
+ 'only_matching': True, }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, From 090acd58c1d810fbef1bac08d70bbfad9c0a7504 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 3 Jul 2022 20:05:21 +0100 Subject: [PATCH 1361/1705] [options] Improve be35e53 (--match-/reject-title parameter value) Resolves #31064. --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6521ad881..f6621ef91 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -270,11 +270,11 @@ def parseOpts(overrideArguments=None): selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', - help='Download only matching titles (case-insensitive regex or sub-string)') + help='Download only matching titles (case-insensitive regex or alphanumeric sub-string)') selection.add_option( '--reject-title', dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (case-insensitive regex or sub-string)') + help='Skip download for matching titles (case-insensitive regex or alphanumeric sub-string)') selection.add_option( '--max-downloads', dest='max_downloads', metavar='NUMBER', type=int, default=None, From 5f5c127ece74e52aa5b49b6d2941cc0f848d3c36 Mon Sep 17 00:00:00 2001 From: Kyraminol Endyeran <kyraminari@gmail.com> Date: Tue, 12 Jul 2022 01:35:40 +0200 Subject: [PATCH 1362/1705] [VVVVID] Support video/dash types (#31060) Resolves #31030. 
--- youtube_dl/extractor/vvvvid.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index bc196f8a0..6a0d4e8f0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -64,6 +64,18 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/dash' + 'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi', + 'info_dict': { + 'id': '693786', + 'ext': 'mp4', + 'title': 'Nanachi', + }, + 'params': { + 'skip_download': True, + 'format': 'mp4', + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -205,6 +217,9 @@ class VVVVIDIE(InfoExtractor): }) is_youtube = True break + elif video_type == 'video/dash': + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) From adb5294177265ba35b45746dbb600965076ed150 Mon Sep 17 00:00:00 2001 From: Wes <morganw@gmail.com> Date: Fri, 29 Jul 2022 20:10:00 -0500 Subject: [PATCH 1363/1705] [aenetworks] Update _THEPLATFORM_KEY and _THEPLATFORM_SECRET (#29749) Fixes ytdl-org/youtube-dl#29300 --- youtube_dl/extractor/aenetworks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index e55c03fd7..2a1f08e39 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -20,8 +20,8 @@ class AENetworksBaseIE(ThePlatformIE): (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/''' - _THEPLATFORM_KEY = 'crazyjava' - _THEPLATFORM_SECRET = 's3cr3t' + _THEPLATFORM_KEY = '43jXaGRQud' + _THEPLATFORM_SECRET = 'S10BPXHMlb' _DOMAIN_MAP = { 'history.com': ('HISTORY', 'history'), 'aetv.com': 
('AETV', 'aetv'), From deee741fb145360576ceae9d69b1b43db082c404 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 9 Aug 2022 21:05:00 +0100 Subject: [PATCH 1364/1705] [test, etc] Improve download test logs; also clean up some new flake8 issues (#31153) * [test] Identify testcase errors better * [test] Identify download errors better * [extractor/minds] Linter * [extractor/aes] Linter --- test/test_download.py | 7 +++++-- youtube_dl/aes.py | 2 +- youtube_dl/extractor/minds.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 6a6673bc2..19936969f 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -33,6 +33,7 @@ from youtube_dl.compat import ( from youtube_dl.utils import ( DownloadError, ExtractorError, + error_to_compat_str, format_bytes, UnavailableVideoError, ) @@ -108,7 +109,7 @@ def generator(test_case, tname): for tc in test_cases: info_dict = tc.get('info_dict', {}) if not (info_dict.get('id') and info_dict.get('ext')): - raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') + raise Exception('Test definition (%s) requires both \'id\' and \'ext\' keys present to define the output file' % (tname, )) if 'skip' in test_case: print_skipping(test_case['skip']) @@ -161,7 +162,9 @@ def generator(test_case, tname): except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): - raise + msg = getattr(err, 'msg', error_to_compat_str(err)) + err.msg = '%s (%s)' % (msg, tname, ) + raise err if try_num == RETRIES: report_warning('%s failed due to network errors, skipping...' 
% tname) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 461bb6d41..d0de2d93f 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -303,7 +303,7 @@ def xor(data1, data2): def rijndael_mul(a, b): - if(a == 0 or b == 0): + if (a == 0 or b == 0): return 0 return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 8e9f0f825..e8fd582aa 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -78,7 +78,7 @@ class MindsIE(MindsBaseIE): else: return self.url_result(entity['perma_url']) else: - assert(entity['subtype'] == 'video') + assert (entity['subtype'] == 'video') video_id = entity_id # 1080p and webm formats available only on the sources array video = self._call_api( From e6a836d54ca1d3cd02f3ee45ef707a46f23e8291 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 10 Aug 2022 15:37:59 +0100 Subject: [PATCH 1365/1705] [core] Make `--max-downloads ...` stop immediately on reaching the limit Based on and closes #26638. 
--- youtube_dl/YoutubeDL.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3895b408f..e77b8d50c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1779,10 +1779,9 @@ class YoutubeDL(object): assert info_dict.get('_type', 'video') == 'video' - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + max_downloads = int_or_none(self.params.get('max_downloads')) or float('inf') + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] @@ -2062,6 +2061,9 @@ class YoutubeDL(object): self.report_error('postprocessing: %s' % str(err)) return self.record_download_archive(info_dict) + # avoid possible nugatory search for further items (PR #26638) + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() def download(self, url_list): """Download a given list of URLs.""" From d231b56717c73ee597d2e077d11b69ed48a1b02d Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 14 Aug 2022 18:45:45 +0100 Subject: [PATCH 1366/1705] [jsinterp] Overhaul JSInterp to handle new YT players 4c3f79c5, 324f67b9 (#31170) * back-port from yt-dlp 8f53dc44a0cc1c2d98c35740b9293462c080f5d0, thanks pukkandan * also support void, improve <</>> precedence, improve expressions in comma-list * add more tests --- test/test_jsinterp.py | 49 ++- test/test_utils.py | 3 + test/test_youtube_signature.py | 13 + youtube_dl/compat.py | 54 ++- youtube_dl/jsinterp.py | 581 ++++++++++++++++++++------------- youtube_dl/utils.py | 47 ++- 6 files changed, 500 insertions(+), 247 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index acdabffb1..c6c931743 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -19,6 +19,9 @@ class 
TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) + jsi = JSInterpreter('function x3(){42}') + self.assertEqual(jsi.call_function('x3'), None) + jsi = JSInterpreter('var x5 = function(){return 42;}') self.assertEqual(jsi.call_function('x5'), 42) @@ -51,8 +54,11 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return 11 >> 2;}') self.assertEqual(jsi.call_function('f'), 2) + jsi = JSInterpreter('function f(){return []? 2+3: 4;}') + self.assertEqual(jsi.call_function('f'), 5) + def test_array_access(self): - jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2] = 7; return x;}') + jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) def test_parens(self): @@ -62,6 +68,10 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') self.assertEqual(jsi.call_function('f'), 9) + def test_quotes(self): + jsi = JSInterpreter(r'function f(){return "a\"\\("}') + self.assertEqual(jsi.call_function('f'), r'a"\(') + def test_assignments(self): jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), 31) @@ -104,18 +114,29 @@ class TestJSInterpreter(unittest.TestCase): }''') self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + def test_builtins(self): + jsi = JSInterpreter(''' + function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + jsi = JSInterpreter(''' + function x(dt) { return new Date(dt) - 0; } + ''') + self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } - function y(a) { return x() + a; } + function y(a) { return x() + (a?a:0); } function 
z() { return y(3); } ''') self.assertEqual(jsi.call_function('z'), 5) + self.assertEqual(jsi.call_function('y'), 2) def test_for_loop(self): # function x() { a=0; for (i=0; i-10; i++) {a++} a } jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i = i + 1) {a++} a } + function x() { a=0; for (i=0; i-10; i++) {a++} return a } ''') self.assertEqual(jsi.call_function('x'), 10) @@ -156,19 +177,19 @@ class TestJSInterpreter(unittest.TestCase): def test_for_loop_continue(self): jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } ''') self.assertEqual(jsi.call_function('x'), 0) def test_for_loop_break(self): jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + function x() { a=0; for (i=0; i-10; i++) { break; a++ } return a } ''') self.assertEqual(jsi.call_function('x'), 0) def test_literal_list(self): jsi = JSInterpreter(''' - function x() { [1, 2, "asdf", [5, 6, 7]][3] } + function x() { return [1, 2, "asdf", [5, 6, 7]][3] } ''') self.assertEqual(jsi.call_function('x'), [5, 6, 7]) @@ -177,6 +198,22 @@ class TestJSInterpreter(unittest.TestCase): function x() { a=5; a -= 1, a+=3; return a } ''') self.assertEqual(jsi.call_function('x'), 7) + jsi = JSInterpreter(''' + function x() { a=5; return (a -= 1, a+=3, a); } + ''') + self.assertEqual(jsi.call_function('x'), 7) + + def test_void(self): + jsi = JSInterpreter(''' + function x() { return void 42; } + ''') + self.assertEqual(jsi.call_function('x'), None) + + def test_return_function(self): + jsi = JSInterpreter(''' + function x() { return [1, function(){return 1}][1] } + ''') + self.assertEqual(jsi.call_function('x')([]), 1) if __name__ == '__main__': diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e..f1a748dde 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -370,6 +370,9 @@ class TestUtil(unittest.TestCase): 
self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) + self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) + self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index fc5e9828e..6e955e0f0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -90,12 +90,25 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js', 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw', ), + ( + 'https://www.youtube.com/s/player/5dd88d1d/player-plasma-ias-phone-en_US.vflset/base.js', + 'kSxKFLeqzv_ZyHSAt', 'n8gS8oRlHOxPFA', + ), + ( + 'https://www.youtube.com/s/player/324f67b9/player_ias.vflset/en_US/base.js', + 'xdftNy7dh9QGnhW', '22qLGxrmX8F1rA', + ), + ( + 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', + 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', + ), ] class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( + ('https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', '4c3f79c5'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9f5f85dae..6d2c31a61 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2985,7 +2985,6 @@ except ImportError: 
except ImportError: compat_filter = filter - try: from future_builtins import zip as compat_zip except ImportError: # not 2.6+ or is 3.x @@ -2995,6 +2994,57 @@ except ImportError: # not 2.6+ or is 3.x compat_zip = zip +# method renamed between Py2/3 +try: + from itertools import zip_longest as compat_itertools_zip_longest +except ImportError: + from itertools import izip_longest as compat_itertools_zip_longest + + +# new class in collections +try: + from collections import ChainMap as compat_collections_chain_map +except ImportError: + # Py < 3.3 + class compat_collections_chain_map(compat_collections_abc.MutableMapping): + + maps = [{}] + + def __init__(self, *maps): + self.maps = list(maps) or [{}] + + def __getitem__(self, k): + for m in self.maps: + if k in m: + return m[k] + raise KeyError(k) + + def __setitem__(self, k, v): + self.maps[0].__setitem__(k, v) + return + + def __delitem__(self, k): + if k in self.maps[0]: + del self.maps[0][k] + return + raise KeyError(k) + + def __iter__(self): + return itertools.chain(*reversed(self.maps)) + + def __len__(self): + return len(iter(self)) + + def new_child(self, m=None, **kwargs): + m = m or {} + m.update(kwargs) + return compat_collections_chain_map(m, *self.maps) + + @property + def parents(self): + return compat_collections_chain_map(*(self.maps[1:])) + + if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): if isinstance(s, compat_str): @@ -3031,6 +3081,7 @@ __all__ = [ 'compat_basestring', 'compat_chr', 'compat_collections_abc', + 'compat_collections_chain_map', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', @@ -3051,6 +3102,7 @@ __all__ = [ 'compat_input', 'compat_integer_types', 'compat_itertools_count', + 'compat_itertools_zip_longest', 'compat_kwargs', 'compat_map', 'compat_numeric_types', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 8eaa911cd..c60a9b3c2 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,42 +1,87 @@ from 
__future__ import unicode_literals +import itertools import json +import math import operator import re from .utils import ( + NO_DEFAULT, ExtractorError, + js_to_json, remove_quotes, + unified_timestamp, ) from .compat import ( - compat_collections_abc, + compat_collections_chain_map as ChainMap, + compat_itertools_zip_longest as zip_longest, compat_str, ) -MutableMapping = compat_collections_abc.MutableMapping +_NAME_RE = r'[a-zA-Z_$][\w$]*' -class Nonlocal: - pass +# (op, definition) in order of binding priority, tightest first +# avoid dict to maintain order +# definition None => Defined in JSInterpreter._operator +_DOT_OPERATORS = ( + ('.', None), + # TODO: ('?.', None), +) - -_OPERATORS = [ +_OPERATORS = ( ('|', operator.or_), ('^', operator.xor), ('&', operator.and_), ('>>', operator.rshift), ('<<', operator.lshift), - ('-', operator.sub), ('+', operator.add), - ('%', operator.mod), - ('/', operator.truediv), + ('-', operator.sub), ('*', operator.mul), -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) + ('/', operator.truediv), + ('%', operator.mod), +) -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +_COMP_OPERATORS = ( + ('===', operator.is_), + ('==', operator.eq), + ('!==', operator.is_not), + ('!=', operator.ne), + ('<=', operator.le), + ('>=', operator.ge), + ('<', operator.lt), + ('>', operator.gt), +) + +_LOG_OPERATORS = ( + ('&', operator.and_), + ('|', operator.or_), + ('^', operator.xor), +) + +_SC_OPERATORS = ( + ('?', None), + ('||', None), + ('&&', None), + # TODO: ('??', None), +) + +_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) +_QUOTES = '\'"' + + +def _ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, ''): + return if_false + try: + if math.isnan(cndn): # NB: NaN cannot be checked by 
membership + return if_false + except TypeError: + pass + return if_true class JS_Break(ExtractorError): @@ -49,70 +94,77 @@ class JS_Continue(ExtractorError): ExtractorError.__init__(self, 'Invalid continue') -class LocalNameSpace(MutableMapping): - def __init__(self, *stack): - self.stack = tuple(stack) - - def __getitem__(self, key): - for scope in self.stack: - if key in scope: - return scope[key] - raise KeyError(key) - +class LocalNameSpace(ChainMap): def __setitem__(self, key, value): - for scope in self.stack: + for scope in self.maps: if key in scope: scope[key] = value - break - else: - self.stack[0][key] = value - return value + return + self.maps[0][key] = value def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') - def __iter__(self): - for scope in self.stack: - for scope_item in iter(scope): - yield scope_item - - def __len__(self, key): - return len(iter(self)) - def __repr__(self): - return 'LocalNameSpace%s' % (self.stack, ) + return 'LocalNameSpace%s' % (self.maps, ) class JSInterpreter(object): + __named_object_counter = 0 + def __init__(self, code, objects=None): - if objects is None: - objects = {} - self.code = code - self._functions = {} - self._objects = objects - self.__named_object_counter = 0 + self.code, self._functions = code, {} + self._objects = {} if objects is None else objects + + class Exception(ExtractorError): + def __init__(self, msg, *args, **kwargs): + expr = kwargs.pop('expr', None) + if expr is not None: + msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) + super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) def _named_object(self, namespace, obj): self.__named_object_counter += 1 - name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, ) + name = '__youtube_dl_jsinterp_obj%d' % (self.__named_object_counter, ) namespace[name] = obj return name @staticmethod - def _separate(expr, delim=',', max_split=None): + def _separate(expr, delim=',', max_split=None, 
skip_delims=None): if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} - start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 + start, splits, pos, skipping, delim_len = 0, 0, 0, 0, len(delim) - 1 + in_quote, escaping = None, False for idx, char in enumerate(expr): - if char in _MATCHING_PARENS: - counters[_MATCHING_PARENS[char]] += 1 - elif char in counters: - counters[char] -= 1 - if char != delim[pos] or any(counters.values()): - pos = 0 + if not in_quote: + if char in _MATCHING_PARENS: + counters[_MATCHING_PARENS[char]] += 1 + elif char in counters: + counters[char] -= 1 + if not escaping: + if char in _QUOTES and in_quote in (char, None): + in_quote = None if in_quote else char + else: + escaping = in_quote and char == '\\' + else: + escaping = False + + if char != delim[pos] or any(counters.values()) or in_quote: + pos = skipping = 0 continue - elif pos != delim_len: + elif skipping > 0: + skipping -= 1 + continue + elif pos == 0 and skip_delims: + here = expr[idx:] + for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]: + if here.startswith(s) and s: + skipping = len(s) - 1 + break + if skipping > 0: + continue + if pos < delim_len: pos += 1 continue yield expr[start: idx - delim_len] @@ -122,61 +174,108 @@ class JSInterpreter(object): break yield expr[start:] - @staticmethod - def _separate_at_paren(expr, delim): - separated = list(JSInterpreter._separate(expr, delim, 1)) + @classmethod + def _separate_at_paren(cls, expr, delim): + separated = list(cls._separate(expr, delim, 1)) + if len(separated) < 2: - raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr)) + raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() + @staticmethod + def _all_operators(): + return itertools.chain( + _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS) + + def _operator(self, op, left_val, right_expr, expr, 
local_vars, allow_recursion): + if op in ('||', '&&'): + if (op == '&&') ^ _ternary(left_val): + return left_val # short circuiting + elif op == '?': + right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) + + right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) + opfunc = op and next((v for k, v in self._all_operators() if k == op), None) + if not opfunc: + return right_val + + try: + return opfunc(left_val, right_val) + except Exception as e: + raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e) + + def _index(self, obj, idx): + if idx == 'length': + return len(obj) + try: + return obj[int(idx)] if isinstance(obj, list) else obj[idx] + except Exception as e: + raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) + + def _dump(self, obj, namespace): + try: + return json.dumps(obj) + except TypeError: + return self._named_object(namespace, obj) + def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') + raise self.Exception('Recursion limit reached') + allow_recursion -= 1 - sub_statements = list(self._separate(stmt, ';')) - stmt = (sub_statements or ['']).pop() + should_return = False + sub_statements = list(self._separate(stmt, ';')) or [''] + expr = stmt = sub_statements.pop().strip() for sub_stmt in sub_statements: - ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) - if should_abort: - return ret + ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion) + if should_return: + return ret, should_return - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] - else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True + m = 
re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|$)', stmt) + if m: + expr = stmt[len(m.group(0)):].strip() + should_return = not m.group('var') + if not expr: + return None, should_return + + if expr[0] in _QUOTES: + inner, outer = self._separate(expr, expr[0], 1) + inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) + if not outer: + return inner, should_return + expr = self._named_object(local_vars, inner) + outer + + if expr.startswith('new '): + obj = expr[4:] + if obj.startswith('Date('): + left, right = self._separate_at_paren(obj[4:], ')') + left = self.interpret_expression(left, local_vars, allow_recursion) + expr = unified_timestamp(left, False) + if not expr: + raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) + expr = self._dump(int(expr * 1000), local_vars) + right else: - # Try interpreting it as an expression - expr = stmt + raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr) - v = self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort - - def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() - if expr == '': # Empty expression - return None + if expr.startswith('void '): + left = self.interpret_expression(expr[5:], local_vars, allow_recursion) + return None, should_return if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') - inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: - return inner + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('('): inner, outer = self._separate_at_paren(expr, ')') - inner = self.interpret_expression(inner, local_vars, allow_recursion) - if not outer: - return inner + inner, should_abort = 
self.interpret_statement(inner, local_vars, allow_recursion) + if not outer or should_abort: + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('['): inner, outer = self._separate_at_paren(expr, ']') @@ -185,57 +284,53 @@ class JSInterpreter(object): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'try\s*', expr) - if m: + m = re.match(r'(?P<try>try|finally)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) + md = m.groupdict() if m else {} + if md.get('try'): if expr[m.end()] == '{': try_expr, expr = self._separate_at_paren(expr[m.end():], '}') else: try_expr, expr = expr[m.end() - 1:], '' - ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) if should_abort: - return ret - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + return ret, True + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return - m = re.match(r'(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) - md = m.groupdict() if m else {} - if md.get('catch'): + elif md.get('catch'): # We ignore the catch block _, expr = self._separate_at_paren(expr, '}') - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return elif md.get('for'): - def raise_constructor_error(c): - raise ExtractorError( - 'Premature return in the initialization of a for loop in {0!r}'.format(c)) - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining, '}') else: - m = re.match(r'switch\s*\(', remaining) # FIXME - if m: - switch_val, 
remaining = self._separate_at_paren(remaining[m.end() - 1:], ')') + switch_m = re.match(r'switch\s*\(', remaining) # FIXME + if switch_m: + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')') body, expr = self._separate_at_paren(remaining, '}') body = 'switch(%s){%s}' % (switch_val, body) else: body, expr = remaining, '' start, cndn, increment = self._separate(constructor, ';') - if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: - raise_constructor_error(constructor) + self.interpret_expression(start, local_vars, allow_recursion) while True: - if not self.interpret_expression(cndn, local_vars, allow_recursion): + if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: - ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: - return ret + return ret, True except JS_Break: break except JS_Continue: pass - if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: - raise_constructor_error(constructor) - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + self.interpret_expression(increment, local_vars, allow_recursion) + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return elif md.get('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') @@ -245,7 +340,7 @@ class JSInterpreter(object): for default in (False, True): matched = False for item in items: - case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + case, stmt = (i.strip() for i in self._separate(item, ':', 1)) if default: matched = matched or case == 'default' elif not matched: @@ -254,24 +349,28 @@ class JSInterpreter(object): if not matched: continue try: - ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + 
ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion) if should_abort: return ret except JS_Break: break if matched: break - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return # Comma separated statements sub_expressions = list(self._separate(expr)) - expr = sub_expressions.pop().strip() if sub_expressions else '' - for sub_expr in sub_expressions: - self.interpret_expression(sub_expr, local_vars, allow_recursion) + if len(sub_expressions) > 1: + for sub_expr in sub_expressions: + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + return ret, False for m in re.finditer(r'''(?x) - (?P<pre_sign>\+\+|--)(?P<var1>%(_NAME_RE)s)| - (?P<var2>%(_NAME_RE)s)(?P<post_sign>\+\+|--)''' % globals(), expr): + (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| + (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)'''.format(**globals()), expr): var = m.group('var1') or m.group('var2') start, end = m.span() sign = m.group('pre_sign') or m.group('post_sign') @@ -279,85 +378,87 @@ class JSInterpreter(object): local_vars[var] += 1 if sign[0] == '+' else -1 if m.group('pre_sign'): ret = local_vars[var] - expr = expr[:start] + json.dumps(ret) + expr[end:] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(r'''(?x) - (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? 
- \s*%s - (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) - if not m: - continue - right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) + if not expr: + return None, should_return - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) - if not isinstance(idx, int): - raise ExtractorError('List indices must be integers: %s' % (idx, )) - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val - else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val + m = re.match(r'''(?x) + (?P<assign> + (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<op>{_OPERATOR_RE})? + =(?P<expr>.*)$ + )|(?P<return> + (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ + )|(?P<indexing> + (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?:\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + )|(?P<function> + (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ + )'''.format(**globals()), expr) + md = m.groupdict() if m else {} + if md.get('assign'): + left_val = local_vars.get(m.group('out')) - if expr.isdigit(): - return int(expr) + if not m.group('index'): + local_vars[m.group('out')] = self._operator( + m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) + return local_vars[m.group('out')], should_return + elif left_val is None: + raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) - if expr == 'break': + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, (int, float)): + raise self.Exception('List index %s must be integer' % (idx, ), expr=expr) + idx = int(idx) + left_val[idx] = self._operator( + m.group('op'), left_val[idx], m.group('expr'), expr, local_vars, allow_recursion) + return left_val[idx], should_return + + elif expr.isdigit(): + 
return int(expr), should_return + + elif expr == 'break': raise JS_Break() elif expr == 'continue': raise JS_Continue() - var_m = re.match( - r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] + elif md.get('return'): + return local_vars[m.group('name')], should_return try: - return json.loads(expr) + ret = json.loads(js_to_json(expr)) # strict=True) + if not md.get('attribute'): + return ret, should_return except ValueError: pass - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: + if md.get('indexing'): val = local_vars[m.group('in')] idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) - return val[idx] + return self._index(val, idx), should_return - def raise_expr_error(where, op, exp): - raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp)) - - for op, opfunc in _OPERATORS: - separated = list(self._separate(expr, op)) + for op, _ in self._all_operators(): + # hackety: </> have higher priority than <</>>, but don't confuse them + skip_delim = (op + op) if op in ('<', '>') else None + separated = list(self._separate(expr, op, skip_delims=skip_delim)) if len(separated) < 2: continue - right_val = separated.pop() - left_val = op.join(separated) - left_val, should_abort = self.interpret_statement( - left_val, local_vars, allow_recursion - 1) - if should_abort: - raise_expr_error('left-side', op, expr) - right_val, should_abort = self.interpret_statement( - right_val, local_vars, allow_recursion - 1) - if should_abort: - raise_expr_error('right-side', op, expr) - return opfunc(left_val or 0, right_val) - m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE, - expr) - if m: + right_expr = separated.pop() + while op == '-' and len(separated) > 1 and not separated[-1].strip(): + right_expr = '-' + right_expr + separated.pop() + left_val = self.interpret_expression(op.join(separated), 
local_vars, allow_recursion) + return self._operator(op, 0 if left_val is None else left_val, + right_expr, expr, local_vars, allow_recursion), should_return + + if md.get('attribute'): variable = m.group('var') - nl = Nonlocal() - - nl.member = remove_quotes(m.group('member') or m.group('member2')) + member = m.group('member') + if not member: + member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] if arg_str.startswith('('): arg_str, remaining = self._separate_at_paren(arg_str, ')') @@ -367,25 +468,24 @@ class JSInterpreter(object): def assertion(cndn, msg): """ assert, but without risk of getting optimized out """ if not cndn: - raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr)) + raise ExtractorError('{member} {msg}'.format(**locals()), expr=expr) def eval_method(): - # nonlocal member - member = nl.member - if variable == 'String': - obj = compat_str - elif variable in local_vars: - obj = local_vars[variable] - else: + if (variable, member) == ('console', 'debug'): + return + types = { + 'String': compat_str, + 'Math': float, + } + obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) + if obj is NO_DEFAULT: if variable not in self._objects: self._objects[variable] = self.extract_object(variable) obj = self._objects[variable] + # Member access if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] + return self._index(obj, member) # Function call argvals = [ @@ -396,12 +496,17 @@ class JSInterpreter(object): if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) - raise ExtractorError('Unsupported string method %s' % (member, )) + raise self.Exception('Unsupported string method ' + member, expr=expr) + elif obj == float: + if member == 'pow': + assertion(len(argvals) == 2, 'takes two arguments') + return argvals[0] ** argvals[1] + raise self.Exception('Unsupported Math method ' 
+ member, expr=expr) if member == 'split': assertion(argvals, 'takes one or more arguments') - assertion(argvals == [''], 'with arguments is not implemented') - return list(obj) + assertion(len(argvals) == 1, 'with limit argument is not implemented') + return obj.split(argvals[0]) if argvals[0] else list(obj) elif member == 'join': assertion(isinstance(obj, list), 'must be applied on a list') assertion(len(argvals) == 1, 'takes exactly one argument') @@ -447,7 +552,7 @@ class JSInterpreter(object): assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') f, this = (argvals + [''])[:2] - return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] elif member == 'indexOf': assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') @@ -457,32 +562,35 @@ class JSInterpreter(object): except ValueError: return -1 - if isinstance(obj, list): - member = int(member) - nl.member = member - return obj[member](argvals) + idx = int(member) if isinstance(obj, list) else member + return obj[idx](argvals, allow_recursion=allow_recursion) if remaining: - return self.interpret_expression( + ret, should_abort = self.interpret_statement( self._named_object(local_vars, eval_method()) + remaining, local_vars, allow_recursion) + return ret, should_return or should_abort else: - return eval_method() + return eval_method(), should_return - m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') - argvals = tuple([ - int(v) if v.isdigit() else local_vars[v] - for v in self._separate(m.group('args'))]) + elif md.get('function'): + fname = m.group('fname') + argvals = [self.interpret_expression(v, local_vars, allow_recursion) + for v in self._separate(m.group('args'))] if fname in local_vars: - return local_vars[fname](argvals) + 
return local_vars[fname](argvals, allow_recursion=allow_recursion), should_return elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) + return self._functions[fname](argvals, allow_recursion=allow_recursion), should_return - if expr: - raise ExtractorError('Unsupported JS expression %r' % expr) + raise self.Exception( + 'Unsupported JS expression ' + (expr[:40] if expr != stmt else ''), expr=stmt) + + def interpret_expression(self, expr, local_vars, allow_recursion): + ret, should_return = self.interpret_statement(expr, local_vars, allow_recursion) + if should_return: + raise self.Exception('Cannot return from an expression', expr) + return ret def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -494,15 +602,17 @@ class JSInterpreter(object): }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) + if not obj_m: + raise self.Exception('Could not find object ' + objname) fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} - ''' % _FUNC_NAME_RE, + (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} + ''' % (_FUNC_NAME_RE, _NAME_RE), fields) for f in fields_m: - argnames = f.group('args').split(',') + argnames = self.build_arglist(f.group('args')) obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) return obj @@ -510,15 +620,19 @@ class JSInterpreter(object): def extract_function_code(self, funcname): """ @returns argnames, code """ func_m = re.search( - r'''(?x) - (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s* + r'''(?xs) + (?: + function\s+%(name)s| + [{;,]\s*%(name)s\s*=\s*function| + (?:var|const|let)\s+%(name)s\s*=\s*function + )\s* \((?P<args>[^)]*)\)\s* - (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % 
{'f_n': re.escape(funcname), }, + (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: - raise ExtractorError('Could not find JS function %r' % funcname) - return func_m.group('args').split(','), code + raise self.Exception('Could not find JS function "{funcname}"'.format(**locals())) + return self.build_arglist(func_m.group('args')), code def extract_function(self, funcname): return self.extract_function_from_code(*self.extract_function_code(funcname)) @@ -534,7 +648,7 @@ class JSInterpreter(object): name = self._named_object( local_vars, self.extract_function_from_code( - [x.strip() for x in mobj.group('args').split(',')], + self.build_arglist(mobj.group('args')), body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) @@ -542,17 +656,22 @@ class JSInterpreter(object): def call_function(self, funcname, *args): return self.extract_function(funcname)(args) + @classmethod + def build_arglist(cls, arg_text): + if not arg_text: + return [] + return list(filter(None, (x.strip() or None for x in cls._separate(arg_text)))) + def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] - local_vars = global_stack.pop(0) + argnames = tuple(argnames) - def resf(args, **kwargs): - local_vars.update(dict(zip(argnames, args))) - local_vars.update(kwargs) - var_stack = LocalNameSpace(local_vars, *global_stack) - for stmt in self._separate(code.replace('\n', ''), ';'): - ret, should_abort = self.interpret_statement(stmt, var_stack) - if should_abort: - break - return ret + def resf(args, kwargs={}, allow_recursion=100): + global_stack[0].update( + zip_longest(argnames, args, fillvalue=None)) + global_stack[0].update(kwargs) + var_stack = LocalNameSpace(*global_stack) + ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, 
allow_recursion - 1) + if should_abort: + return ret return resf diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4e00317f1..a5f584ec5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1696,6 +1696,17 @@ MONTH_NAMES = { 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], } +# Timezone names for RFC2822 obs-zone +# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 +TIMEZONE_NAMES = { + 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, + 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) + 'EST': -5, 'EDT': -4, # Eastern + 'CST': -6, 'CDT': -5, # Central + 'MST': -7, 'MDT': -6, # Mountain + 'PST': -8, 'PDT': -7 # Pacific +} + KNOWN_EXTENSIONS = ( 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', 'flv', 'f4v', 'f4a', 'f4b', @@ -1735,12 +1746,17 @@ DATE_FORMATS = ( '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', + '%Y.%m.%d.', '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y%m%d%H%M', + '%Y%m%d%H%M%S', + '%Y%m%d', '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', + '%Y-%m-%d %H:%M:%S:%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1753,6 +1769,7 @@ DATE_FORMATS = ( '%b %d %Y at %H:%M:%S', '%B %d %Y at %H:%M', '%B %d %Y at %H:%M:%S', + '%H:%M %d-%b-%Y', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -1763,6 +1780,7 @@ DATE_FORMATS_DAY_FIRST.extend([ '%d/%m/%Y', '%d/%m/%y', '%d/%m/%Y %H:%M:%S', + '%d-%m-%Y %H:%M', ]) DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) @@ -2966,10 +2984,22 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): def extract_timezone(date_str): m = re.search( - r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) + r'''(?x) + ^.{8,}? # >=8 char non-TZ prefix, if present + (?P<tz>Z| # just the UTC Z, or + (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or + (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits + [ ]? 
# optional space + (?P<sign>\+|-) # +/- + (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm + $) + ''', date_str) if not m: - timezone = datetime.timedelta() + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) + if timezone is not None: + date_str = date_str[:-len(m.group('tz'))] + timezone = datetime.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): @@ -3037,7 +3067,8 @@ def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = re.sub(r'[,|]', '', date_str) + date_str = re.sub(r'\s+', ' ', re.sub( + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -3063,7 +3094,7 @@ def unified_timestamp(date_str, day_first=True): pass timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 + return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() def determine_ext(url, default_ext='unknown_video'): @@ -3673,13 +3704,11 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) - if v == '': - v = None - if v is None: + if v in (None, ''): return default try: return int(v) * invscale // scale - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): return default From e52e8b8111cf7ca27daef184bacd926865e951b1 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 15 Aug 2022 16:45:04 +0100 Subject: [PATCH 1367/1705] [postprocessor] Don't replace existing value with null metadata parsed from title --- youtube_dl/postprocessor/metadatafromtitle.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/postprocessor/metadatafromtitle.py 
b/youtube_dl/postprocessor/metadatafromtitle.py index f5c14d974..6cd5bb70f 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -40,6 +40,8 @@ class MetadataFromTitlePP(PostProcessor): % self._titleformat) return [], info for attribute, value in match.groupdict().items(): + if value is None: + continue info[attribute] = value self._downloader.to_screen( '[fromtitle] parsed %s: %s' From b0a60ce2032172aeaaf27fe3866ab72768f10cb2 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 17 Aug 2022 14:22:02 +0100 Subject: [PATCH 1368/1705] [jsinterp] Improve JS language support (#31175) * operator ?? * operator ?. * operator ** * accurate operator functions * `undefined` handling * object literals {a: 1, "b": expr} * more tests for weird JS comparisons: see https://github.com/ytdl-org/youtube-dl/issues/31173#issuecomment-1217854397. --- test/test_jsinterp.py | 114 ++++++++++++++++++++ test/test_youtube_signature.py | 4 + youtube_dl/jsinterp.py | 189 ++++++++++++++++++++++++++------- 3 files changed, 267 insertions(+), 40 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c6c931743..328941e09 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -8,7 +8,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import math + from youtube_dl.jsinterp import JSInterpreter +undefined = JSInterpreter.undefined class TestJSInterpreter(unittest.TestCase): @@ -48,6 +51,9 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return 1 << 5;}') self.assertEqual(jsi.call_function('f'), 32) + jsi = JSInterpreter('function f(){return 2 ** 5}') + self.assertEqual(jsi.call_function('f'), 32) + jsi = JSInterpreter('function f(){return 19 & 21;}') self.assertEqual(jsi.call_function('f'), 17) @@ -57,6 +63,15 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return []? 
2+3: 4;}') self.assertEqual(jsi.call_function('f'), 5) + jsi = JSInterpreter('function f(){return 1 == 2}') + self.assertEqual(jsi.call_function('f'), False) + + jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') + self.assertEqual(jsi.call_function('f'), 2) + + jsi = JSInterpreter('function f(){return 0 ?? 42;}') + self.assertEqual(jsi.call_function('f'), 0) + def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) @@ -203,6 +218,11 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 7) + jsi = JSInterpreter(''' + function x() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) } + ''') + self.assertEqual(jsi.call_function('x'), 5) + def test_void(self): jsi = JSInterpreter(''' function x() { return void 42; } @@ -215,6 +235,100 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x')([]), 1) + def test_null(self): + jsi = JSInterpreter(''' + function x() { return null; } + ''') + self.assertIs(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { return [null > 0, null < 0, null == 0, null === 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [null >= 0, null <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True]) + + def test_undefined(self): + jsi = JSInterpreter(''' + function x() { return undefined === undefined; } + ''') + self.assertTrue(jsi.call_function('x')) + + jsi = JSInterpreter(''' + function x() { return undefined; } + ''') + self.assertIs(jsi.call_function('x'), undefined) + + jsi = JSInterpreter(''' + function x() { let v; return v; } + ''') + self.assertIs(jsi.call_function('x'), undefined) + + jsi = JSInterpreter(''' + function x() { return [undefined === undefined, undefined == undefined, undefined < 
undefined, undefined > undefined]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === 0, undefined == 0, undefined < 0, undefined > 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined >= 0, undefined <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined > null, undefined < null, undefined == null, undefined === null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, True, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === null, undefined == null, undefined < null, undefined > null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, True, False, False]) + + jsi = JSInterpreter(''' + function x() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } + ''') + for y in jsi.call_function('x'): + self.assertTrue(math.isnan(y)) + + jsi = JSInterpreter(''' + function x() { let v; return v**0; } + ''') + self.assertEqual(jsi.call_function('x'), 1) + + jsi = JSInterpreter(''' + function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, undefined, undefined]) + + jsi = JSInterpreter('function x(){return undefined ?? 
42; }') + self.assertEqual(jsi.call_function('x'), 42) + + def test_object(self): + jsi = JSInterpreter(''' + function x() { return {}; } + ''') + self.assertEqual(jsi.call_function('x'), {}) + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } + ''') + self.assertEqual(jsi.call_function('x'), [42, 0]) + jsi = JSInterpreter(''' + function x() { let a; return a?.qq; } + ''') + self.assertIs(jsi.call_function('x'), undefined) + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } + ''') + self.assertIs(jsi.call_function('x'), undefined) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 6e955e0f0..4d756dad3 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -102,6 +102,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', ), + ( + 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', + 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c60a9b3c2..8e119d08a 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -7,7 +7,6 @@ import operator import re from .utils import ( - NO_DEFAULT, ExtractorError, js_to_json, remove_quotes, @@ -21,6 +20,70 @@ from .compat import ( _NAME_RE = r'[a-zA-Z_$][\w$]*' +_UNDEFINED = object() + + +def _js_bit_op(op): + + def wrapped(a, b): + def zeroise(x): + return 0 if x in (None, _UNDEFINED) else x + return op(zeroise(a), zeroise(b)) + + return wrapped + + +def _js_arith_op(op): + + def wrapped(a, b): + if _UNDEFINED in (a, b): + return float('nan') + return op(a or 0, b or 0) + + return wrapped + + +def _js_div(a, b): + if _UNDEFINED in (a, b) or not (a and b): + return float('nan') + return float('inf') if not b else operator.truediv(a or 0, b) + + +def _js_mod(a, b): + if 
_UNDEFINED in (a, b) or not b: + return float('nan') + return (a or 0) % b + + +def _js_exp(a, b): + if not b: + # even 0 ** 0 !! + return 1 + if _UNDEFINED in (a, b): + return float('nan') + return (a or 0) ** b + + +def _js_eq_op(op): + + def wrapped(a, b): + if set((a, b)) <= set((None, _UNDEFINED)): + return op(a, a) + return op(a, b) + + return wrapped + + +def _js_comp_op(op): + + def wrapped(a, b): + if _UNDEFINED in (a, b): + return False + return op(a or 0, b or 0) + + return wrapped + + # (op, definition) in order of binding priority, tightest first # avoid dict to maintain order # definition None => Defined in JSInterpreter._operator @@ -30,40 +93,38 @@ _DOT_OPERATORS = ( ) _OPERATORS = ( - ('|', operator.or_), - ('^', operator.xor), - ('&', operator.and_), - ('>>', operator.rshift), - ('<<', operator.lshift), - ('+', operator.add), - ('-', operator.sub), - ('*', operator.mul), - ('/', operator.truediv), - ('%', operator.mod), + ('>>', _js_bit_op(operator.rshift)), + ('<<', _js_bit_op(operator.lshift)), + ('+', _js_arith_op(operator.add)), + ('-', _js_arith_op(operator.sub)), + ('*', _js_arith_op(operator.mul)), + ('/', _js_div), + ('%', _js_mod), + ('**', _js_exp), ) _COMP_OPERATORS = ( ('===', operator.is_), - ('==', operator.eq), + ('==', _js_eq_op(operator.eq)), ('!==', operator.is_not), - ('!=', operator.ne), - ('<=', operator.le), - ('>=', operator.ge), - ('<', operator.lt), - ('>', operator.gt), + ('!=', _js_eq_op(operator.ne)), + ('<=', _js_comp_op(operator.le)), + ('>=', _js_comp_op(operator.ge)), + ('<', _js_comp_op(operator.lt)), + ('>', _js_comp_op(operator.gt)), ) _LOG_OPERATORS = ( - ('&', operator.and_), - ('|', operator.or_), - ('^', operator.xor), + ('|', _js_bit_op(operator.or_)), + ('^', _js_bit_op(operator.xor)), + ('&', _js_bit_op(operator.and_)), ) _SC_OPERATORS = ( ('?', None), + ('??', None), ('||', None), ('&&', None), - # TODO: ('??', None), ) _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) 
@@ -74,7 +135,7 @@ _QUOTES = '\'"' def _ternary(cndn, if_true=True, if_false=False): """Simulate JS's ternary operator (cndn?if_true:if_false)""" - if cndn in (False, None, 0, ''): + if cndn in (False, None, 0, '', _UNDEFINED): return if_false try: if math.isnan(cndn): # NB: NaN cannot be checked by membership @@ -95,6 +156,12 @@ class JS_Continue(ExtractorError): class LocalNameSpace(ChainMap): + def __getitem__(self, key): + try: + return super(LocalNameSpace, self).__getitem__(key) + except KeyError: + return _UNDEFINED + def __setitem__(self, key, value): for scope in self.maps: if key in scope: @@ -105,6 +172,13 @@ class LocalNameSpace(ChainMap): def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') + def __contains__(self, key): + try: + super(LocalNameSpace, self).__getitem__(key) + return True + except KeyError: + return False + def __repr__(self): return 'LocalNameSpace%s' % (self.maps, ) @@ -112,6 +186,8 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 + undefined = _UNDEFINED + def __init__(self, code, objects=None): self.code, self._functions = code, {} self._objects = {} if objects is None else objects @@ -185,12 +261,16 @@ class JSInterpreter(object): @staticmethod def _all_operators(): return itertools.chain( + # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS) def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): if op in ('||', '&&'): if (op == '&&') ^ _ternary(left_val): return left_val # short circuiting + elif op == '??': + if left_val not in (None, self.undefined): + return left_val elif op == '?': right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) @@ -204,12 +284,14 @@ class JSInterpreter(object): except Exception as e: raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, 
cause=e) - def _index(self, obj, idx): + def _index(self, obj, idx, allow_undefined=False): if idx == 'length': return len(obj) try: return obj[int(idx)] if isinstance(obj, list) else obj[idx] except Exception as e: + if allow_undefined: + return self.undefined raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): @@ -249,8 +331,8 @@ class JSInterpreter(object): obj = expr[4:] if obj.startswith('Date('): left, right = self._separate_at_paren(obj[4:], ')') - left = self.interpret_expression(left, local_vars, allow_recursion) - expr = unified_timestamp(left, False) + expr = unified_timestamp( + self.interpret_expression(left, local_vars, allow_recursion), False) if not expr: raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) expr = self._dump(int(expr * 1000), local_vars) + right @@ -263,6 +345,14 @@ class JSInterpreter(object): if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') + # try for object expression + sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] + if all(len(sub_expr) == 2 for sub_expr in sub_expressions): + return dict( + (key_expr if re.match(_NAME_RE, key_expr) else key_expr, + self.interpret_expression(val_expr, local_vars, allow_recursion)) + for key_expr, val_expr in sub_expressions), should_return + # or statement list inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return @@ -387,13 +477,13 @@ class JSInterpreter(object): (?P<assign> (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* (?P<op>{_OPERATOR_RE})? 
- =(?P<expr>.*)$ + =(?!=)(?P<expr>.*)$ )|(?P<return> (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ )|(?P<indexing> (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ )|(?P<attribute> - (?P<var>{_NAME_RE})(?:\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* )|(?P<function> (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ )'''.format(**globals()), expr) @@ -405,7 +495,7 @@ class JSInterpreter(object): local_vars[m.group('out')] = self._operator( m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) return local_vars[m.group('out')], should_return - elif left_val is None: + elif left_val in (None, self.undefined): raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) @@ -424,6 +514,9 @@ class JSInterpreter(object): elif expr == 'continue': raise JS_Continue() + elif expr == 'undefined': + return self.undefined, should_return + elif md.get('return'): return local_vars[m.group('name')], should_return @@ -441,7 +534,9 @@ class JSInterpreter(object): for op, _ in self._all_operators(): # hackety: </> have higher priority than <</>>, but don't confuse them - skip_delim = (op + op) if op in ('<', '>') else None + skip_delim = (op + op) if op in '<>*?' 
else None + if op == '?': + skip_delim = (skip_delim, '?.') separated = list(self._separate(expr, op, skip_delims=skip_delim)) if len(separated) < 2: continue @@ -451,12 +546,10 @@ class JSInterpreter(object): right_expr = '-' + right_expr separated.pop() left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) - return self._operator(op, 0 if left_val is None else left_val, - right_expr, expr, local_vars, allow_recursion), should_return + return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return if md.get('attribute'): - variable = m.group('var') - member = m.group('member') + variable, member, nullish = m.group('var', 'member', 'nullish') if not member: member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] @@ -477,15 +570,24 @@ class JSInterpreter(object): 'String': compat_str, 'Math': float, } - obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) - if obj is NO_DEFAULT: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + obj = local_vars.get(variable) + if obj in (self.undefined, None): + obj = types.get(variable, self.undefined) + if obj is self.undefined: + try: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + except self.Exception: + if not nullish: + raise + + if nullish and obj is self.undefined: + return self.undefined # Member access if arg_str is None: - return self._index(obj, member) + return self._index(obj, member, nullish) # Function call argvals = [ @@ -660,7 +762,14 @@ class JSInterpreter(object): def build_arglist(cls, arg_text): if not arg_text: return [] - return list(filter(None, (x.strip() or None for x in cls._separate(arg_text)))) + + def valid_arg(y): + y = y.strip() + if not y: + raise cls.Exception('Missing arg in "%s"' % (arg_text, )) + return y + + 
return [valid_arg(x) for x in cls._separate(arg_text)] def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] From 538ec65ba7634bb9ad9f8eb4ce72713c673969dc Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 19 Aug 2022 11:45:04 +0100 Subject: [PATCH 1369/1705] [jsinterp] Handle regexp literals and throw/catch execution (#31182) * based on https://github.com/yt-dlp/yt-dlp/commit/f6ca640b122239d5ab215f8c2564efb7ac3e8c65, thanks pukkandan * adds parse support for regexp flags --- test/test_jsinterp.py | 21 +++++ test/test_youtube_signature.py | 4 + youtube_dl/jsinterp.py | 136 +++++++++++++++++++++++++++------ 3 files changed, 139 insertions(+), 22 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 328941e09..faddf00d5 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -9,6 +9,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math +import re from youtube_dl.jsinterp import JSInterpreter undefined = JSInterpreter.undefined @@ -316,19 +317,39 @@ class TestJSInterpreter(unittest.TestCase): function x() { return {}; } ''') self.assertEqual(jsi.call_function('x'), {}) + jsi = JSInterpreter(''' function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } ''') self.assertEqual(jsi.call_function('x'), [42, 0]) + jsi = JSInterpreter(''' function x() { let a; return a?.qq; } ''') self.assertIs(jsi.call_function('x'), undefined) + jsi = JSInterpreter(''' function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } ''') self.assertIs(jsi.call_function('x'), undefined) + def test_regex(self): + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; } + ''') + self.assertIs(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; return a; } + ''') + # Pythons disagree on the type of a pattern + self.assertTrue(isinstance(jsi.call_function('x'), type(re.compile('')))) 
+ + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/i; return a; } + ''') + self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4d756dad3..43e22388d 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -106,6 +106,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', ), + ( + 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', + 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 8e119d08a..48c27a1c0 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -7,6 +7,7 @@ import operator import re from .utils import ( + error_to_compat_str, ExtractorError, js_to_json, remove_quotes, @@ -130,7 +131,7 @@ _SC_OPERATORS = ( _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) -_QUOTES = '\'"' +_QUOTES = '\'"/' def _ternary(cndn, if_true=True, if_false=False): @@ -155,6 +156,12 @@ class JS_Continue(ExtractorError): ExtractorError.__init__(self, 'Invalid continue') +class JS_Throw(ExtractorError): + def __init__(self, e): + self.error = e + ExtractorError.__init__(self, 'Uncaught exception ' + error_to_compat_str(e)) + + class LocalNameSpace(ChainMap): def __getitem__(self, key): try: @@ -172,6 +179,17 @@ class LocalNameSpace(ChainMap): def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') + # except + def pop(self, key, *args): + try: + off = self.__getitem__(key) + super(LocalNameSpace, self).__delitem__(key) + return off + except KeyError: + if len(args) > 0: + return args[0] + raise + def __contains__(self, key): try: super(LocalNameSpace, self).__getitem__(key) @@ -188,9 +206,29 @@ class 
JSInterpreter(object): undefined = _UNDEFINED + RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + _EXC_NAME = '__youtube_dl_exception__' + _OBJ_NAME = '__youtube_dl_jsinterp_obj' + + OP_CHARS = None + def __init__(self, code, objects=None): self.code, self._functions = code, {} self._objects = {} if objects is None else objects + if type(self).OP_CHARS is None: + type(self).OP_CHARS = self.OP_CHARS = self.__op_chars() class Exception(ExtractorError): def __init__(self, msg, *args, **kwargs): @@ -199,32 +237,64 @@ class JSInterpreter(object): msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) + @classmethod + def __op_chars(cls): + op_chars = set(';,') + for op in cls._all_operators(): + for c in op[0]: + op_chars.add(c) + return op_chars + def _named_object(self, namespace, obj): self.__named_object_counter += 1 - name = '__youtube_dl_jsinterp_obj%d' % (self.__named_object_counter, ) + name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter) namespace[name] = obj return name - @staticmethod - def _separate(expr, delim=',', max_split=None, skip_delims=None): + @classmethod + def _regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls.RE_FLAGS: + break + flags |= cls.RE_FLAGS[ch] + return flags, expr[idx:] if idx > 0 else expr + + @classmethod + def _separate(cls, expr, 
delim=',', max_split=None, skip_delims=None): if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} - start, splits, pos, skipping, delim_len = 0, 0, 0, 0, len(delim) - 1 - in_quote, escaping = None, False + start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 + in_quote, escaping, skipping = None, False, 0 + after_op, in_regex_char_group, skip_re = True, False, 0 + for idx, char in enumerate(expr): + if skip_re > 0: + skip_re -= 1 + continue if not in_quote: if char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 elif char in counters: counters[char] -= 1 - if not escaping: - if char in _QUOTES and in_quote in (char, None): - in_quote = None if in_quote else char - else: - escaping = in_quote and char == '\\' - else: - escaping = False + if not escaping and char in _QUOTES and in_quote in (char, None): + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + if in_quote is None and char == '/' and delim != '/': + # regexp flags + n_idx = idx + 1 + while n_idx < len(expr) and expr[n_idx] in cls.RE_FLAGS: + n_idx += 1 + skip_re = n_idx - idx - 1 + if skip_re > 0: + continue + elif in_quote == '/' and char in '[]': + in_regex_char_group = char == '[' + escaping = not escaping and in_quote and char == '\\' + after_op = not in_quote and char in cls.OP_CHARS or (char == ' ' and after_op) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -313,16 +383,23 @@ class JSInterpreter(object): if should_return: return ret, should_return - m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|$)', stmt) + m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt) if m: expr = stmt[len(m.group(0)):].strip() + if m.group('throw'): + raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion)) should_return = not m.group('var') if not expr: return None, should_return if expr[0] in _QUOTES: inner, outer = 
self._separate(expr, expr[0], 1) - inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) + if expr[0] == '/': + flags, _ = self._regex_flags(outer) + inner, outer = inner.replace('"', r'\"'), '' + inner = re.compile(js_to_json(inner + expr[0]), flags=flags) # , strict=True)) + else: + inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: return inner, should_return expr = self._named_object(local_vars, inner) + outer @@ -374,22 +451,37 @@ class JSInterpreter(object): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'(?P<try>try|finally)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) + m = re.match(r'''(?x) + (?P<try>try|finally)\s*| + (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\(|'''.format(**globals()), expr) md = m.groupdict() if m else {} if md.get('try'): if expr[m.end()] == '{': try_expr, expr = self._separate_at_paren(expr[m.end():], '}') else: try_expr, expr = expr[m.end() - 1:], '' - ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) - if should_abort: - return ret, True + try: + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + except JS_Throw as e: + local_vars[self._EXC_NAME] = e.error + except Exception as e: + # XXX: This works for now, but makes debugging future issues very hard + local_vars[self._EXC_NAME] = e ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return elif md.get('catch'): - # We ignore the catch block - _, expr = self._separate_at_paren(expr, '}') + catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') + if self._EXC_NAME in local_vars: + catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)}) + ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion) + if 
should_abort: + return ret, True + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return @@ -503,7 +595,7 @@ class JSInterpreter(object): raise self.Exception('List index %s must be integer' % (idx, ), expr=expr) idx = int(idx) left_val[idx] = self._operator( - m.group('op'), left_val[idx], m.group('expr'), expr, local_vars, allow_recursion) + m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) return left_val[idx], should_return elif expr.isdigit(): From 46b8ae2f520c17aaa756082676788c6287b6809e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 19 Aug 2022 15:34:33 +0100 Subject: [PATCH 1370/1705] [jsinterp] Clean up and pull yt-dlp style * add compat_re_Pattern * improve compat_collections_chain_map * use class JS_Undefined * remove unused code --- test/test_jsinterp.py | 20 +++--- test/test_youtube_signature.py | 3 +- youtube_dl/compat.py | 21 +++++- youtube_dl/jsinterp.py | 123 ++++++++++++--------------------- 4 files changed, 77 insertions(+), 90 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index faddf00d5..96786a84c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -11,8 +11,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math import re -from youtube_dl.jsinterp import JSInterpreter -undefined = JSInterpreter.undefined +from youtube_dl.compat import compat_re_Pattern + +from youtube_dl.jsinterp import JS_Undefined, JSInterpreter class TestJSInterpreter(unittest.TestCase): @@ -261,12 +262,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { return undefined; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { let v; return v; } ''') - self.assertIs(jsi.call_function('x'), undefined) + 
self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } @@ -307,7 +308,7 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } ''') - self.assertEqual(jsi.call_function('x'), [False, False, undefined, undefined]) + self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) jsi = JSInterpreter('function x(){return undefined ?? 42; }') self.assertEqual(jsi.call_function('x'), 42) @@ -326,12 +327,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a; return a?.qq; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) jsi = JSInterpreter(''' function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } ''') - self.assertIs(jsi.call_function('x'), undefined) + self.assertIs(jsi.call_function('x'), JS_Undefined) def test_regex(self): jsi = JSInterpreter(''' @@ -342,13 +343,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; return a; } ''') - # Pythons disagree on the type of a pattern - self.assertTrue(isinstance(jsi.call_function('x'), type(re.compile('')))) + self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } ''') - self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) if __name__ == '__main__': diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 43e22388d..327d4c40d 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -12,10 +12,11 @@ import io import re import string +from youtube_dl.compat import compat_str, compat_urlretrieve + from test.helper import 
FakeYDL from youtube_dl.extractor import YoutubeIE from youtube_dl.jsinterp import JSInterpreter -from youtube_dl.compat import compat_str, compat_urlretrieve _SIG_TESTS = [ ( diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6d2c31a61..3002109ca 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3023,18 +3023,34 @@ except ImportError: self.maps[0].__setitem__(k, v) return - def __delitem__(self, k): + def __contains__(self, k): + return any((k in m) for m in self.maps) + + def __delitem(self, k): if k in self.maps[0]: del self.maps[0][k] return raise KeyError(k) + def __delitem__(self, k): + self.__delitem(k) + def __iter__(self): return itertools.chain(*reversed(self.maps)) def __len__(self): return len(iter(self)) + # to match Py3, don't del directly + def pop(self, k, *args): + if self.__contains__(k): + off = self.__getitem__(k) + self.__delitem(k) + return off + elif len(args) > 0: + return args[0] + raise KeyError(k) + def new_child(self, m=None, **kwargs): m = m or {} m.update(kwargs) @@ -3044,6 +3060,8 @@ except ImportError: def parents(self): return compat_collections_chain_map(*(self.maps[1:])) +# Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) 
+compat_re_Pattern = type(re.compile('')) if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): @@ -3110,6 +3128,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_re_Pattern', 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 48c27a1c0..6719d0dfd 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -19,16 +19,12 @@ from .compat import ( compat_str, ) -_NAME_RE = r'[a-zA-Z_$][\w$]*' - -_UNDEFINED = object() - def _js_bit_op(op): def wrapped(a, b): def zeroise(x): - return 0 if x in (None, _UNDEFINED) else x + return 0 if x in (None, JS_Undefined) else x return op(zeroise(a), zeroise(b)) return wrapped @@ -37,7 +33,7 @@ def _js_bit_op(op): def _js_arith_op(op): def wrapped(a, b): - if _UNDEFINED in (a, b): + if JS_Undefined in (a, b): return float('nan') return op(a or 0, b or 0) @@ -45,22 +41,21 @@ def _js_arith_op(op): def _js_div(a, b): - if _UNDEFINED in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a and b): return float('nan') return float('inf') if not b else operator.truediv(a or 0, b) def _js_mod(a, b): - if _UNDEFINED in (a, b) or not b: + if JS_Undefined in (a, b) or not b: return float('nan') return (a or 0) % b def _js_exp(a, b): if not b: - # even 0 ** 0 !! - return 1 - if _UNDEFINED in (a, b): + return 1 # even 0 ** 0 !! 
+ elif JS_Undefined in (a, b): return float('nan') return (a or 0) ** b @@ -68,7 +63,7 @@ def _js_exp(a, b): def _js_eq_op(op): def wrapped(a, b): - if set((a, b)) <= set((None, _UNDEFINED)): + if set((a, b)) <= set((None, JS_Undefined)): return op(a, a) return op(a, b) @@ -78,21 +73,28 @@ def _js_eq_op(op): def _js_comp_op(op): def wrapped(a, b): - if _UNDEFINED in (a, b): + if JS_Undefined in (a, b): return False return op(a or 0, b or 0) return wrapped +def _js_ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, '', JS_Undefined): + return if_false + try: + if math.isnan(cndn): # NB: NaN cannot be checked by membership + return if_false + except TypeError: + pass + return if_true + + # (op, definition) in order of binding priority, tightest first # avoid dict to maintain order # definition None => Defined in JSInterpreter._operator -_DOT_OPERATORS = ( - ('.', None), - # TODO: ('?.', None), -) - _OPERATORS = ( ('>>', _js_bit_op(operator.rshift)), ('<<', _js_bit_op(operator.lshift)), @@ -130,20 +132,13 @@ _SC_OPERATORS = ( _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) +_NAME_RE = r'[a-zA-Z_$][\w$]*' _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _QUOTES = '\'"/' -def _ternary(cndn, if_true=True, if_false=False): - """Simulate JS's ternary operator (cndn?if_true:if_false)""" - if cndn in (False, None, 0, '', _UNDEFINED): - return if_false - try: - if math.isnan(cndn): # NB: NaN cannot be checked by membership - return if_false - except TypeError: - pass - return if_true +class JS_Undefined(object): + pass class JS_Break(ExtractorError): @@ -167,7 +162,7 @@ class LocalNameSpace(ChainMap): try: return super(LocalNameSpace, self).__getitem__(key) except KeyError: - return _UNDEFINED + return JS_Undefined def __setitem__(self, key, value): for scope in self.maps: @@ -179,24 +174,6 @@ class LocalNameSpace(ChainMap): def __delitem__(self, 
key): raise NotImplementedError('Deleting is not supported') - # except - def pop(self, key, *args): - try: - off = self.__getitem__(key) - super(LocalNameSpace, self).__delitem__(key) - return off - except KeyError: - if len(args) > 0: - return args[0] - raise - - def __contains__(self, key): - try: - super(LocalNameSpace, self).__getitem__(key) - return True - except KeyError: - return False - def __repr__(self): return 'LocalNameSpace%s' % (self.maps, ) @@ -204,9 +181,7 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 - undefined = _UNDEFINED - - RE_FLAGS = { + _RE_FLAGS = { # special knowledge: Python's re flags are bitmask values, current max 128 # invent new bitmask values well above that for literal parsing # TODO: new pattern class to execute matches with these flags @@ -257,10 +232,10 @@ class JSInterpreter(object): if not expr: return flags, expr for idx, ch in enumerate(expr): - if ch not in cls.RE_FLAGS: + if ch not in cls._RE_FLAGS: break - flags |= cls.RE_FLAGS[ch] - return flags, expr[idx:] if idx > 0 else expr + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] @classmethod def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): @@ -283,14 +258,6 @@ class JSInterpreter(object): if not escaping and char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': in_quote = None if in_quote and not in_regex_char_group else char - if in_quote is None and char == '/' and delim != '/': - # regexp flags - n_idx = idx + 1 - while n_idx < len(expr) and expr[n_idx] in cls.RE_FLAGS: - n_idx += 1 - skip_re = n_idx - idx - 1 - if skip_re > 0: - continue elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' @@ -336,13 +303,13 @@ class JSInterpreter(object): def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): if op in ('||', '&&'): - if (op == '&&') ^ _ternary(left_val): + if 
(op == '&&') ^ _js_ternary(left_val): return left_val # short circuiting elif op == '??': - if left_val not in (None, self.undefined): + if left_val not in (None, JS_Undefined): return left_val elif op == '?': - right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) + right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) opfunc = op and next((v for k, v in self._all_operators() if k == op), None) @@ -361,7 +328,7 @@ class JSInterpreter(object): return obj[int(idx)] if isinstance(obj, list) else obj[idx] except Exception as e: if allow_undefined: - return self.undefined + return JS_Undefined raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): @@ -395,9 +362,8 @@ class JSInterpreter(object): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - flags, _ = self._regex_flags(outer) - inner, outer = inner.replace('"', r'\"'), '' - inner = re.compile(js_to_json(inner + expr[0]), flags=flags) # , strict=True)) + flags, outer = self._regex_flags(outer) + inner = re.compile(inner[1:], flags=flags) # , strict=True)) else: inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: @@ -422,7 +388,7 @@ class JSInterpreter(object): if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') - # try for object expression + # try for object expression (Map) sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] if all(len(sub_expr) == 2 for sub_expr in sub_expressions): return dict( @@ -455,7 +421,8 @@ class JSInterpreter(object): (?P<try>try|finally)\s*| (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| (?P<switch>switch)\s*\(| - (?P<for>for)\s*\(|'''.format(**globals()), expr) + (?P<for>for)\s*\(| + '''.format(**globals()), expr) md = m.groupdict() if m else {} if 
md.get('try'): if expr[m.end()] == '{': @@ -500,7 +467,7 @@ class JSInterpreter(object): start, cndn, increment = self._separate(constructor, ';') self.interpret_expression(start, local_vars, allow_recursion) while True: - if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): + if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) @@ -587,7 +554,7 @@ class JSInterpreter(object): local_vars[m.group('out')] = self._operator( m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) return local_vars[m.group('out')], should_return - elif left_val in (None, self.undefined): + elif left_val in (None, JS_Undefined): raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) @@ -607,7 +574,7 @@ class JSInterpreter(object): raise JS_Continue() elif expr == 'undefined': - return self.undefined, should_return + return JS_Undefined, should_return elif md.get('return'): return local_vars[m.group('name')], should_return @@ -663,9 +630,9 @@ class JSInterpreter(object): 'Math': float, } obj = local_vars.get(variable) - if obj in (self.undefined, None): - obj = types.get(variable, self.undefined) - if obj is self.undefined: + if obj in (JS_Undefined, None): + obj = types.get(variable, JS_Undefined) + if obj is JS_Undefined: try: if variable not in self._objects: self._objects[variable] = self.extract_object(variable) @@ -674,8 +641,8 @@ class JSInterpreter(object): if not nullish: raise - if nullish and obj is self.undefined: - return self.undefined + if nullish and obj is JS_Undefined: + return JS_Undefined # Member access if arg_str is None: From fd3f3bebd0699f4b782a24a503093c965c4f4f5e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 19 Aug 2022 19:11:08 +0100 Subject: [PATCH 1371/1705] 
[uktvplay] Support domain without .uktv --- youtube_dl/extractor/uktvplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py index f28fd514d..9ef9638cd 100644 --- a/youtube_dl/extractor/uktvplay.py +++ b/youtube_dl/extractor/uktvplay.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' + _VALID_URL = r'https?://uktvplay\.(?:uktv\.)?co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', 'info_dict': { From a8d5316aaf3dc740aad486b8c394b2f3e70f5a58 Mon Sep 17 00:00:00 2001 From: gudata <gudata@users.noreply.github.com> Date: Fri, 19 Aug 2022 23:00:21 +0300 Subject: [PATCH 1372/1705] [infoq] Avoid crash if the page has no `mp3Form` * proposed fix for issue #31131, aligns with yt-dlp Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/infoq.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 0a70a1fb4..60b02b699 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +from ..utils import ( + ExtractorError, +) from ..compat import ( compat_b64decode, @@ -90,7 +93,11 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage, video_id): - fields = self._form_hidden_inputs('mp3Form', webpage) + try: + fields = self._form_hidden_inputs('mp3Form', webpage) + except ExtractorError: + fields = {} + http_audio_url = fields.get('filename') if not http_audio_url: return [] From 556862bc911bb54435b7b0b01451789b884b0390 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 21 Aug 2022 00:19:19 +0100 Subject: [PATCH 
1373/1705] [utils] Ensure RFC3986 encoding result is unicode --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5f584ec5..fea38ed32 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3970,7 +3970,8 @@ def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str): s = s.encode('utf-8') - return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + # ensure unicode: after quoting, it can always be converted + return compat_str(compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")) def escape_url(url): From 66e58dccc29de65cc95ee97915987d785b2b4b31 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sun, 21 Aug 2022 00:21:02 +0100 Subject: [PATCH 1374/1705] [core] Avoid processing empty format list after removing bad formats * also ensure compat encoding of error strings --- youtube_dl/YoutubeDL.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e77b8d50c..8e8546596 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -721,7 +721,7 @@ class YoutubeDL(object): filename = encodeFilename(filename, True).decode(preferredencoding()) return sanitize_path(filename) except ValueError as err: - self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') + self.report_error('Error in output template: ' + error_to_compat_str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict, incomplete): @@ -1570,9 +1570,6 @@ class YoutubeDL(object): else: formats = info_dict['formats'] - if not formats: - raise ExtractorError('No video formats found!') - def is_wellformed(f): url = f.get('url') if not url: @@ -1585,7 +1582,10 @@ class YoutubeDL(object): return True # Filter out malformed formats for 
better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + raise ExtractorError('No video formats found!') formats_dict = {} @@ -2058,7 +2058,7 @@ class YoutubeDL(object): try: self.post_process(filename, info_dict) except (PostProcessingError) as err: - self.report_error('postprocessing: %s' % str(err)) + self.report_error('postprocessing: %s' % error_to_compat_str(err)) return self.record_download_archive(info_dict) # avoid possible nugatory search for further items (PR #26638) From 573b13410e5c2f939676116e2700ec8efd9cf97b Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 25 Aug 2022 12:14:59 +0100 Subject: [PATCH 1375/1705] [YouTube] Improve error check for n-sig processing --- youtube_dl/extractor/youtube.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 91a3b6058..3d12e2e4a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1500,7 +1500,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return lambda s: jsi.extract_function_from_code(*func_code)([s]) def _n_descramble(self, n_param, player_url, video_id): - """Compute the response to YT's "n" parameter challenge + """Compute the response to YT's "n" parameter challenge, + or None Args: n_param -- challenge string that is the value of the @@ -1518,7 +1519,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_id not in self._player_cache: self._player_cache[player_id] = self._extract_n_function(video_id, player_url) func = self._player_cache[player_id] - self._player_cache[sig_id] = func(n_param) + ret = func(n_param) + if ret.startswith('enhanced_except_'): + raise ExtractorError('Unhandled exception in decode') + self._player_cache[sig_id] = ret if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] [%s] %s' % 
(self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) return self._player_cache[sig_id] @@ -1539,10 +1543,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue n_param = n_param[-1] n_response = self._n_descramble(n_param, player_url, video_id) - if n_response: - qs['n'] = [n_response] - fmt['url'] = compat_urlparse.urlunparse( - parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + if n_response is None: + # give up if descrambling failed + break + qs['n'] = [n_response] + fmt['url'] = compat_urlparse.urlunparse( + parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( From d619dd712f63aab1964f8fdde9ceea514a5e581d Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 25 Aug 2022 12:16:10 +0100 Subject: [PATCH 1376/1705] [jsinterp] Fix bug in operator precedence * from https://github.com/yt-dlp/yt-dlp/commit/164b03c4864b0d44cfee5e7702f7c2317164a6cf * added tests --- test/test_jsinterp.py | 25 +++++++++++++++++++++++++ test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 7 ++++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 96786a84c..0a97bdbc4 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -192,6 +192,31 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 10) + def test_catch(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + + @unittest.expectedFailure + def test_finally(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + + 
def test_nested_try(self): + jsi = JSInterpreter(''' + function x() {try { + try{throw 10} finally {throw 42} + } catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + def test_for_loop_continue(self): jsi = JSInterpreter(''' function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 327d4c40d..4bb0a30b0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -111,6 +111,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', ), + ( + 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', + '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 6719d0dfd..a8456ec1c 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -5,6 +5,7 @@ import json import math import operator import re +from collections import Counter from .utils import ( error_to_compat_str, @@ -108,8 +109,8 @@ _OPERATORS = ( _COMP_OPERATORS = ( ('===', operator.is_), - ('==', _js_eq_op(operator.eq)), ('!==', operator.is_not), + ('==', _js_eq_op(operator.eq)), ('!=', _js_eq_op(operator.ne)), ('<=', _js_comp_op(operator.le)), ('>=', _js_comp_op(operator.ge)), @@ -241,7 +242,9 @@ class JSInterpreter(object): def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): if not expr: return + # collections.Counter() is ~10% slower counters = {k: 0 for k in _MATCHING_PARENS.values()} + # counters = Counter() start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, skipping = None, False, 0 after_op, in_regex_char_group, skip_re = True, False, 0 @@ -442,6 +445,7 @@ class JSInterpreter(object): return ret, should_abort or should_return elif md.get('catch'): + catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') if self._EXC_NAME in local_vars: catch_vars = 
local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)}) @@ -450,6 +454,7 @@ class JSInterpreter(object): return ret, True ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return elif md.get('for'): From 4c6fba37650d60acbd32a9f2d6e2468a730d0f1c Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 26 Aug 2022 08:17:54 +0100 Subject: [PATCH 1377/1705] [jsinterp] Improve try/catch/finally support --- test/test_jsinterp.py | 14 ++++++- youtube_dl/jsinterp.py | 88 +++++++++++++++++++++++------------------- 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 0a97bdbc4..fb4882d00 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -74,6 +74,9 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return 0 ?? 42;}') self.assertEqual(jsi.call_function('f'), 0) + jsi = JSInterpreter('function f(){return "life, the universe and everything" < 42;}') + self.assertFalse(jsi.call_function('f')) + def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) @@ -198,7 +201,6 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 5) - @unittest.expectedFailure def test_finally(self): jsi = JSInterpreter(''' function x() { try{throw 10} finally {return 42} } @@ -212,7 +214,7 @@ class TestJSInterpreter(unittest.TestCase): def test_nested_try(self): jsi = JSInterpreter(''' function x() {try { - try{throw 10} finally {throw 42} + try{throw 10} finally {throw 42} } catch(e){return 5} } ''') self.assertEqual(jsi.call_function('x'), 5) @@ -229,6 +231,14 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 0) + def test_for_loop_try(self): + jsi = JSInterpreter(''' + function x() { + for (i=0; i-10; 
i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; + return 42 } + ''') + self.assertEqual(jsi.call_function('x'), 42) + def test_literal_list(self): jsi = JSInterpreter(''' function x() { return [1, 2, "asdf", [5, 6, 7]][3] } diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a8456ec1c..08726e478 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -5,7 +5,6 @@ import json import math import operator import re -from collections import Counter from .utils import ( error_to_compat_str, @@ -15,6 +14,7 @@ from .utils import ( unified_timestamp, ) from .compat import ( + compat_basestring, compat_collections_chain_map as ChainMap, compat_itertools_zip_longest as zip_longest, compat_str, @@ -76,6 +76,10 @@ def _js_comp_op(op): def wrapped(a, b): if JS_Undefined in (a, b): return False + if isinstance(a, compat_basestring): + b = compat_str(b or 0) + elif isinstance(b, compat_basestring): + a = compat_str(a or 0) return op(a or 0, b or 0) return wrapped @@ -195,7 +199,6 @@ class JSInterpreter(object): 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string } - _EXC_NAME = '__youtube_dl_exception__' _OBJ_NAME = '__youtube_dl_jsinterp_obj' OP_CHARS = None @@ -242,9 +245,8 @@ class JSInterpreter(object): def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): if not expr: return - # collections.Counter() is ~10% slower + # collections.Counter() is ~10% slower in both 2.7 and 3.9 counters = {k: 0 for k in _MATCHING_PARENS.values()} - # counters = Counter() start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, skipping = None, False, 0 after_op, in_regex_char_group, skip_re = True, False, 0 @@ -291,7 +293,9 @@ class JSInterpreter(object): yield expr[start:] @classmethod - def _separate_at_paren(cls, expr, delim): + def _separate_at_paren(cls, expr, delim=None): + if delim is None: + delim = expr and _MATCHING_PARENS[expr[0]] separated = 
list(cls._separate(expr, delim, 1)) if len(separated) < 2: @@ -376,7 +380,7 @@ class JSInterpreter(object): if expr.startswith('new '): obj = expr[4:] if obj.startswith('Date('): - left, right = self._separate_at_paren(obj[4:], ')') + left, right = self._separate_at_paren(obj[4:]) expr = unified_timestamp( self.interpret_expression(left, local_vars, allow_recursion), False) if not expr: @@ -390,7 +394,7 @@ class JSInterpreter(object): return None, should_return if expr.startswith('{'): - inner, outer = self._separate_at_paren(expr, '}') + inner, outer = self._separate_at_paren(expr) # try for object expression (Map) sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] if all(len(sub_expr) == 2 for sub_expr in sub_expressions): @@ -406,7 +410,7 @@ class JSInterpreter(object): expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - inner, outer = self._separate_at_paren(expr, ')') + inner, outer = self._separate_at_paren(expr) inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return @@ -414,57 +418,63 @@ class JSInterpreter(object): expr = self._dump(inner, local_vars) + outer if expr.startswith('['): - inner, outer = self._separate_at_paren(expr, ']') + inner, outer = self._separate_at_paren(expr) name = self._named_object(local_vars, [ self.interpret_expression(item, local_vars, allow_recursion) for item in self._separate(inner)]) expr = name + outer m = re.match(r'''(?x) - (?P<try>try|finally)\s*| - (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| - (?P<switch>switch)\s*\(| - (?P<for>for)\s*\(| - '''.format(**globals()), expr) + (?P<try>try)\s*\{| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\( + ''', expr) md = m.groupdict() if m else {} if md.get('try'): - if expr[m.end()] == '{': - try_expr, expr = self._separate_at_paren(expr[m.end():], '}') - else: - try_expr, expr = expr[m.end() - 1:], '' + 
try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + err = None try: ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) if should_abort: return ret, True - except JS_Throw as e: - local_vars[self._EXC_NAME] = e.error except Exception as e: # XXX: This works for now, but makes debugging future issues very hard - local_vars[self._EXC_NAME] = e - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) - return ret, should_abort or should_return + err = e - elif md.get('catch'): + pending = (None, False) + m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if err: + catch_vars = {} + if m.group('err'): + catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err + catch_vars = local_vars.new_child(m=catch_vars) + err = None + pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion) - catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') - if self._EXC_NAME in local_vars: - catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)}) - ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion) + m = re.match(r'finally\s*\{', expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) if should_abort: return ret, True - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + ret, should_abort = pending + if should_abort: + return ret, True - return ret, should_abort or should_return + if err: + raise err elif md.get('for'): - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): - body, expr = self._separate_at_paren(remaining, '}') + body, expr = 
self._separate_at_paren(remaining) else: switch_m = re.match(r'switch\s*\(', remaining) # FIXME if switch_m: - switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')') + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') body = 'switch(%s){%s}' % (switch_val, body) else: @@ -483,11 +493,9 @@ class JSInterpreter(object): except JS_Continue: pass self.interpret_expression(increment, local_vars, allow_recursion) - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) - return ret, should_abort or should_return elif md.get('switch'): - switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') items = body.replace('default:', 'case default:').split('case ')[1:] @@ -510,6 +518,8 @@ class JSInterpreter(object): break if matched: break + + if md: ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return @@ -618,7 +628,7 @@ class JSInterpreter(object): member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] if arg_str.startswith('('): - arg_str, remaining = self._separate_at_paren(arg_str, ')') + arg_str, remaining = self._separate_at_paren(arg_str) else: arg_str, remaining = None, arg_str @@ -795,7 +805,7 @@ class JSInterpreter(object): \((?P<args>[^)]*)\)\s* (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match + code, _ = self._separate_at_paren(func_m.group('code')) # refine the match if func_m is None: raise self.Exception('Could not find JS function "{funcname}"'.format(**locals())) return 
self.build_arglist(func_m.group('args')), code @@ -810,7 +820,7 @@ class JSInterpreter(object): if mobj is None: break start, body_start = mobj.span() - body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + body, remaining = self._separate_at_paren(code[body_start - 1:]) name = self._named_object( local_vars, self.extract_function_from_code( From 0f6422590e44e99e9b81cf2367666efe89fae3aa Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 26 Aug 2022 10:17:56 +0100 Subject: [PATCH 1378/1705] [compat] Replace deficient ChainMap class in Py3.3 and earlier --- youtube_dl/compat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 3002109ca..366a93924 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3004,8 +3004,11 @@ except ImportError: # new class in collections try: from collections import ChainMap as compat_collections_chain_map + # Py3.3's ChainMap is deficient + if sys.version_info <= (3, 3): + raise ImportError except ImportError: - # Py < 3.3 + # Py <= 3.3 class compat_collections_chain_map(compat_collections_abc.MutableMapping): maps = [{}] @@ -3060,6 +3063,7 @@ except ImportError: def parents(self): return compat_collections_chain_map(*(self.maps[1:])) + # Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) 
compat_re_Pattern = type(re.compile('')) From ed5c44e7b74ac77f87ca5ed6cb5e964a0c6a0678 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 26 Aug 2022 12:22:01 +0100 Subject: [PATCH 1379/1705] [compat] Replace deficient ChainMap class in Py3.3 and earlier * fix version check --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 366a93924..eca6d63de 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3005,7 +3005,7 @@ except ImportError: try: from collections import ChainMap as compat_collections_chain_map # Py3.3's ChainMap is deficient - if sys.version_info <= (3, 3): + if sys.version_info < (3, 4): raise ImportError except ImportError: # Py <= 3.3 From 4050e10a4c3445c5399239567eb074acb2f65c18 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 29 Aug 2022 13:02:17 +0100 Subject: [PATCH 1380/1705] [options] Document that postprocessing is not forced by --postprocessor-args Resolves #30307 --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index f6621ef91..f6d2b0898 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -801,7 +801,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--postprocessor-args', dest='postprocessor_args', metavar='ARGS', - help='Give these arguments to the postprocessor') + help='Give these arguments to the postprocessor (if postprocessing is required)') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, From 55c823634db890a328ffc23588fcd6f35d9b3ddf Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 31 Aug 2022 23:22:48 +0100 Subject: [PATCH 1381/1705] [jsinterp] Handle new YT players 113ca41c, c57c113c * add NaN * allow any white-space character for `after_op` * align with yt-dlp f26af78a8ac11d9d617ed31ea5282cfaa5bcbcfa (charcodeAt 
and bitwise overflow) * allow escaping in regex, fixing player c57c113c --- test/test_jsinterp.py | 21 ++++++++++++++++ test/test_youtube_signature.py | 16 ++++++++++++ youtube_dl/jsinterp.py | 46 +++++++++++++++++++++------------- 3 files changed, 65 insertions(+), 18 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index fb4882d00..5121c8cf8 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -135,6 +135,11 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) def test_builtins(self): + jsi = JSInterpreter(''' + function x() { return NaN } + ''') + self.assertTrue(math.isnan(jsi.call_function('x'))) + jsi = JSInterpreter(''' function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } ''') @@ -385,6 +390,22 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + def test_char_code_at(self): + jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') + self.assertEqual(jsi.call_function('x', 0), 116) + self.assertEqual(jsi.call_function('x', 1), 101) + self.assertEqual(jsi.call_function('x', 2), 115) + self.assertEqual(jsi.call_function('x', 3), 116) + self.assertEqual(jsi.call_function('x', 4), None) + self.assertEqual(jsi.call_function('x', 'not_a_number'), 116) + + def test_bitwise_operators_overflow(self): + jsi = JSInterpreter('function x(){return -524999584 << 5}') + self.assertEqual(jsi.call_function('x'), 379882496) + + jsi = JSInterpreter('function x(){return 1236566549 << 5}') + self.assertEqual(jsi.call_function('x'), 915423904) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4bb0a30b0..ec914a871 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -111,10 +111,26 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', 
'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', ), + ( + 'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js', + '5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw', + ), ( 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', ), + ( + 'https://www.youtube.com/s/player/c2199353/player_ias.vflset/en_US/base.js', + '5EHDMgYLV6HPGk_Mu-kk', 'AD5rgS85EkrE7', + ), + ( + 'https://www.youtube.com/s/player/113ca41c/player_ias.vflset/en_US/base.js', + 'cgYl-tlYkhjT7A', 'hI7BBr2zUgcmMg', + ), + ( + 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', + '-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 08726e478..d13329396 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -23,10 +23,11 @@ from .compat import ( def _js_bit_op(op): + def zeroise(x): + return 0 if x in (None, JS_Undefined) else x + def wrapped(a, b): - def zeroise(x): - return 0 if x in (None, JS_Undefined) else x - return op(zeroise(a), zeroise(b)) + return op(zeroise(a), zeroise(b)) & 0xffffffff return wrapped @@ -44,7 +45,7 @@ def _js_arith_op(op): def _js_div(a, b): if JS_Undefined in (a, b) or not (a and b): return float('nan') - return float('inf') if not b else operator.truediv(a or 0, b) + return operator.truediv(a or 0, b) if b else float('inf') def _js_mod(a, b): @@ -260,13 +261,14 @@ class JSInterpreter(object): counters[_MATCHING_PARENS[char]] += 1 elif char in counters: counters[char] -= 1 - if not escaping and char in _QUOTES and in_quote in (char, None): - if in_quote or after_op or char != '/': - in_quote = None if in_quote and not in_regex_char_group else char - elif in_quote == '/' and char in '[]': - in_regex_char_group = char == '[' + if not escaping: + if char in _QUOTES and in_quote in (char, None): + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + elif in_quote == 
'/' and char in '[]': + in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and char in cls.OP_CHARS or (char == ' ' and after_op) + after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -590,6 +592,8 @@ class JSInterpreter(object): elif expr == 'undefined': return JS_Undefined, should_return + elif expr == 'NaN': + return float('NaN'), should_return elif md.get('return'): return local_vars[m.group('name')], should_return @@ -635,7 +639,8 @@ class JSInterpreter(object): def assertion(cndn, msg): """ assert, but without risk of getting optimized out """ if not cndn: - raise ExtractorError('{member} {msg}'.format(**locals()), expr=expr) + memb = member + raise self.Exception('{member} {msg}'.format(**locals()), expr=expr) def eval_method(): if (variable, member) == ('console', 'debug'): @@ -737,6 +742,13 @@ class JSInterpreter(object): return obj.index(idx, start) except ValueError: return -1 + elif member == 'charCodeAt': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + # assertion(len(argvals) == 1, 'takes exactly one argument') # but not enforced + idx = argvals[0] if isinstance(argvals[0], int) else 0 + if idx >= len(obj): + return None + return ord(obj[idx]) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) @@ -820,12 +832,10 @@ class JSInterpreter(object): if mobj is None: break start, body_start = mobj.span() - body, remaining = self._separate_at_paren(code[body_start - 1:]) - name = self._named_object( - local_vars, - self.extract_function_from_code( - self.build_arglist(mobj.group('args')), - body, local_vars, *global_stack)) + body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + name = self._named_object(local_vars, self.extract_function_from_code( + [x.strip() for x in 
mobj.group('args').split(',')], + body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) @@ -854,7 +864,7 @@ class JSInterpreter(object): zip_longest(argnames, args, fillvalue=None)) global_stack[0].update(kwargs) var_stack = LocalNameSpace(*global_stack) - ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, allow_recursion - 1) + ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) if should_abort: return ret return resf From 218c423bc042674a8834ffc09520a94fbbe7b138 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 1 Sep 2022 13:28:30 +0100 Subject: [PATCH 1382/1705] [cache] Add cache validation by program version, based on yt-dlp --- test/test_cache.py | 16 ++++++++++++++-- youtube_dl/cache.py | 28 +++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/test/test_cache.py b/test/test_cache.py index a16160142..931074aa1 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -3,17 +3,18 @@ from __future__ import unicode_literals -import shutil - # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import shutil from test.helper import FakeYDL from youtube_dl.cache import Cache +from youtube_dl.utils import version_tuple +from youtube_dl.version import __version__ def _is_empty(d): @@ -54,6 +55,17 @@ class TestCache(unittest.TestCase): self.assertFalse(os.path.exists(self.test_dir)) self.assertEqual(c.load('test_cache', 'k.'), None) + def test_cache_validation(self): + ydl = FakeYDL({ + 'cachedir': self.test_dir, + }) + c = Cache(ydl) + obj = {'x': 1, 'y': ['ä', '\\a', True]} + c.store('test_cache', 'k.', obj) + self.assertEqual(c.load('test_cache', 'k.', min_ver='1970.01.01'), obj) + new_version = '.'.join(('%d' % ((v + 1) if i == 0 else v, )) for i, v in 
enumerate(version_tuple(__version__))) + self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 7bdade1bd..4822439d0 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -10,12 +10,21 @@ import traceback from .compat import compat_getenv from .utils import ( + error_to_compat_str, expand_path, + is_outdated_version, + try_get, write_json_file, ) +from .version import __version__ class Cache(object): + + _YTDL_DIR = 'youtube-dl' + _VERSION_KEY = _YTDL_DIR + '_version' + _DEFAULT_VERSION = '2021.12.17' + def __init__(self, ydl): self._ydl = ydl @@ -23,7 +32,7 @@ class Cache(object): res = self._ydl.params.get('cachedir') if res is None: cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') - res = os.path.join(cache_root, 'youtube-dl') + res = os.path.join(cache_root, self._YTDL_DIR) return expand_path(res) def _get_cache_fn(self, section, key, dtype): @@ -50,13 +59,22 @@ class Cache(object): except OSError as ose: if ose.errno != errno.EEXIST: raise - write_json_file(data, fn) + write_json_file({self._VERSION_KEY: __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() self._ydl.report_warning( 'Writing cache to %r failed: %s' % (fn, tb)) - def load(self, section, key, dtype='json', default=None): + def _validate(self, data, min_ver): + version = try_get(data, lambda x: x[self._VERSION_KEY]) + if not version: # Backward compatibility + data, version = {'data': data}, self._DEFAULT_VERSION + if not is_outdated_version(version, min_ver or '0', assume_new=False): + return data['data'] + self._ydl.to_screen( + 'Discarding old cache from version {version} (needs {min_ver})'.format(**locals())) + + def load(self, section, key, dtype='json', default=None, min_ver=None): assert dtype in ('json',) if not self.enabled: @@ -66,12 +84,12 @@ class Cache(object): try: try: with io.open(cache_fn, 'r', encoding='utf-8') as cachef: 
- return json.load(cachef) + return self._validate(json.load(cachef), min_ver) except ValueError: try: file_size = os.path.getsize(cache_fn) except (OSError, IOError) as oe: - file_size = str(oe) + file_size = error_to_compat_str(oe) self._ydl.report_warning( 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) except IOError: From 7009bb9f3182449ae8cc05cc28b768b63030a485 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 2 Sep 2022 20:41:39 +0530 Subject: [PATCH 1383/1705] [jsinterp] Workaround operator associativity issue * temporary fix for player 5a3b6271 [1] 1. https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1235384480 --- test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index ec914a871..4e678cae0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -131,6 +131,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', '-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', ), + ( + 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', + 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index d13329396..99dd98435 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -107,8 +107,8 @@ _OPERATORS = ( ('+', _js_arith_op(operator.add)), ('-', _js_arith_op(operator.sub)), ('*', _js_arith_op(operator.mul)), - ('/', _js_div), ('%', _js_mod), + ('/', _js_div), ('**', _js_exp), ) From 9493ffdb8b690732e995422621bad3ed6c9041f5 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 4 Oct 2022 00:42:15 +0100 Subject: [PATCH 1384/1705] [test] Use windows-2019 for tests (At least for now) resolves #31249 --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 90bd63c32..a609f3704 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,12 +15,12 @@ jobs: run-tests-ext: [sh] include: # python 3.2 is only available on windows via setup-python - - os: windows-latest + - os: windows-2019 python-version: 3.2 python-impl: cpython ytdl-test-set: core run-tests-ext: bat - - os: windows-latest + - os: windows-2019 python-version: 3.2 python-impl: cpython ytdl-test-set: download From d35557a75d943865e40410d51bfcc18276e98532 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 23 Sep 2022 12:10:35 +1200 Subject: [PATCH 1385/1705] [Telegraaf] Use mobile GraphQL API endpoint Workaround for Cloudflare 403 Fixes https://github.com/yt-dlp/yt-dlp/issues/5000 Authored by: coletdjnz --- youtube_dl/extractor/telegraaf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 2dc020537..5174898f2 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -34,7 +34,9 @@ class TelegraafIE(InfoExtractor): article_id = self._match_id(url) video_id = self._download_json( - 'https://www.telegraaf.nl/graphql', article_id, query={ + 'https://app.telegraaf.nl/graphql', article_id, + headers={'User-Agent': 'De Telegraaf/6.8.11 (Android 11; en_US)'}, + query={ 'query': '''{ article(uid: %s) { videos { From 22127b271c8f3e9266840bc5a2fb994d6248e369 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 10 Oct 2022 17:41:40 +0000 Subject: [PATCH 1386/1705] [NRK] Remove explicit Accept-Encoding header that invites Brotli Fixes #31285 --- youtube_dl/extractor/nrk.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6d01a25c3..5a62b50fc 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -60,8 +60,7 @@ class 
NRKBaseIE(InfoExtractor): return self._download_json( urljoin('https://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query, - headers={'Accept-Encoding': 'gzip, deflate, br'}) + fatal=fatal, query=query) class NRKIE(NRKBaseIE): From 1b1442887e67b63545453e10816904e2b4c561c1 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 10 Oct 2022 19:26:32 +0100 Subject: [PATCH 1387/1705] [manyvids] Improve extraction (#31172) * extract all formats from page * extract description, uploader, views, likes * downrate previews * fix tests * use txt_or_none() --- youtube_dl/extractor/manyvids.py | 113 +++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index e8d7163e4..6805102ba 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -1,11 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( determine_ext, + extract_attributes, int_or_none, str_to_int, + url_or_none, urlencode_postdata, ) @@ -20,17 +25,20 @@ class ManyVidsIE(InfoExtractor): 'id': '133957', 'ext': 'mp4', 'title': 'everthing about me (Preview)', + 'uploader': 'ellyxxix', 'view_count': int, 'like_count': int, }, }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', - 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'md5': 'bb47bab0e0802c2a60c24ef079dfe60f', 'info_dict': { 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', + 'description': 'md5:ec5901d41808b3746fed90face161612', + 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, }, @@ -41,15 +49,43 @@ class ManyVidsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'video 
URL', group='url') + info = self._search_regex( + r'''(<div\b[^>]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', + webpage, 'meta details', default='') + info = extract_attributes(info) - title = self._html_search_regex( - (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', - r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) + player = self._search_regex( + r'''(<div\b[^>]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', + webpage, 'player details', default='') + player = extract_attributes(player) + + video_urls_and_ids = ( + (info.get('data-meta-video'), 'video'), + (player.get('data-video-transcoded'), 'transcoded'), + (player.get('data-video-filepath'), 'filepath'), + (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), + ) + + def txt_or_none(s, default=None): + return (s.strip() or default) if isinstance(s, compat_str) else default + + uploader = txt_or_none(info.get('data-meta-author')) + + def mung_title(s): + if uploader: + s = re.sub(r'^\s*%s\s+[|-]' % (re.escape(uploader), ), '', s) + return txt_or_none(s) + + title = ( + mung_title(info.get('data-meta-title')) + or self._html_search_regex( + (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', + r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) + or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True)) + + title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title if any(p in webpage for p in ('preview_videos', '_preview.mp4')): title += ' (Preview)' @@ -70,23 +106,56 @@ class ManyVidsIE(InfoExtractor): 'X-Requested-With': 'XMLHttpRequest' }) - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - formats = [{'url': video_url}] + formats = [] + for v_url, fmt in video_urls_and_ids: + v_url = url_or_none(v_url) + if 
not v_url: + continue + if determine_ext(v_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls')) + else: + formats.append({ + 'url': v_url, + 'format_id': fmt, + }) - like_count = int_or_none(self._search_regex( - r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = str_to_int(self._html_search_regex( - r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, - 'view count', default=None)) + self._remove_duplicate_formats(formats) + + for f in formats: + if f.get('height') is None: + f['height'] = int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) + if '/preview/' in f['url']: + f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) + f['preference'] = -10 + if 'transcoded' in f['format_id']: + f['preference'] = f.get('preference', -1) - 1 + + self._sort_formats(formats) + + def get_likes(): + likes = self._search_regex( + r'''(<a\b[^>]*\bdata-id\s*=\s*(['"])%s\2[^>]*>)''' % (video_id, ), + webpage, 'likes', default='') + likes = extract_attributes(likes) + return int_or_none(likes.get('data-likes')) + + def get_views(): + return str_to_int(self._html_search_regex( + r'''(?s)<span\b[^>]*\bclass\s*=["']views-wrapper\b[^>]+>.+?<span\b[^>]+>\s*(\d[\d,.]*)\s*</span>''', + webpage, 'view count', default=None)) return { 'id': video_id, 'title': title, - 'view_count': view_count, - 'like_count': like_count, 'formats': formats, + 'description': txt_or_none(info.get('data-meta-description')), + 'uploader': txt_or_none(info.get('data-meta-author')), + 'thumbnail': ( + url_or_none(info.get('data-meta-image')) + or url_or_none(player.get('data-video-screenshot'))), + 'view_count': get_views(), + 'like_count': get_likes(), } From 82e4eca711a128138ed0b84ddb4321e403d56340 Mon Sep 17 00:00:00 2001 From: Xiyue <113869642+xiyue077@users.noreply.github.com> Date: Tue, 11 Oct 2022 09:52:48 +1100 Subject: 
[PATCH 1388/1705] [motherless] Fixed the broken uploader_id in the extractor (#31243) * Fixed the broken uploader_id in the extractor. * Make uploader_id RE looser * Fix uploader_id in test Motherless_3 * Fix group pagination * # coding: utf-8 Co-authored-by: Andy Xuming <xuminic@gmail.com> Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/motherless.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index ef1e081f2..35d2b46ed 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import datetime @@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor): 'title': 'a/ Hot Teens', 'categories': list, 'upload_date': '20210104', - 'uploader_id': 'yonbiw', + 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, @@ -127,7 +128,7 @@ class MotherlessIE(InfoExtractor): comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( - r'"thumb-member-username">\s+<a href="/m/([^"]+)"', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''', webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage, default=None) @@ -169,7 +170,7 @@ class MotherlessGroupIE(InfoExtractor): 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' 'any kind!' 
}, - 'playlist_mincount': 9, + 'playlist_mincount': 0, }] @classmethod @@ -208,9 +209,9 @@ class MotherlessGroupIE(InfoExtractor): r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) description = self._html_search_meta( 'description', webpage, fatal=False) - page_count = self._int(self._search_regex( - r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', - webpage, 'page_count'), 'page_count') + page_count = str_to_int(self._search_regex( + r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', + webpage, 'page_count', default='1')) PAGE_SIZE = 80 def _get_page(idx): From 2ced5a79128f53faad94dc494d05925eb957c414 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 9 Aug 2022 19:34:34 +0100 Subject: [PATCH 1389/1705] [test] Implement string "lambda x: condition(x)" as an expected value Semantics equivalent to `assert condition(got)` --- test/helper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index e62aab11e..c6a2f0667 100644 --- a/test/helper.py +++ b/test/helper.py @@ -128,6 +128,12 @@ def expect_value(self, got, expected, field): self.assertTrue( contains_str in got, 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, compat_str) and re.match(r'^lambda \w+:', expected): + fn = eval(expected) + suite = expected.split(':', 1)[1].strip() + self.assertTrue( + fn(got), + 'Expected field %s to meet condition %s, but value %r failed ' % (field, suite, got)) elif isinstance(expected, type): self.assertTrue( isinstance(got, expected), @@ -137,7 +143,7 @@ def expect_value(self, got, expected, field): elif isinstance(expected, list) and isinstance(got, list): self.assertEqual( len(expected), len(got), - 'Expect a list of length %d, but got a list of length %d for field %s' % ( + 'Expected a list of length %d, but got a list of length %d for field %s' % ( len(expected), len(got), field)) for index, (item_got, item_expected) 
in enumerate(zip(got, expected)): type_got = type(item_got) From c282e5f8d723763ba88c521221e4535f46453949 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 9 Aug 2022 19:37:58 +0100 Subject: [PATCH 1390/1705] [ZDF] Overhaul ZDF extractors * pull some yt-dlp changes into ZDFBaseIE._extract_format() * add test cases from yt-dlp to ZDFIE * fix crash in ZDFIE._extract_mobile() when object had no `formitaeten` * improve title extraction in ZDFChannelIE (remove trailing station ident) * avoid extracting non-video playlist items (fixes #31149) --- youtube_dl/extractor/zdf.py | 169 +++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 3d39bb33a..fcc63ef52 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -8,13 +8,14 @@ from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, + extract_attributes, float_or_none, int_or_none, merge_dicts, NO_DEFAULT, - orderedSet, parse_codecs, qualities, + str_or_none, try_get, unified_timestamp, update_url_query, @@ -57,28 +58,39 @@ class ZDFBaseIE(InfoExtractor): format_urls.add(format_url) mime_type = meta.get('mimeType') ext = determine_ext(format_url) + + join_nonempty = lambda s, l: s.join(filter(None, l)) + meta_map = lambda t: map(lambda x: str_or_none(meta.get(x)), t) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + new_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8_native', fatal=False) elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + new_formats = self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, 
f4m_id='hds', fatal=False) else: f = parse_codecs(meta.get('mimeCodec')) + if not f: + data = meta.get('type', '').split('_') + if try_get(data, lambda x: x[2]) == ext: + f = dict(zip(('vcodec', 'acodec'), data[1])) + format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) + format_id.extend(join_nonempty('-', meta_map(('type', 'quality')))) f.update({ 'url': format_url, 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), - 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) - formats.append(f) + new_formats = [f] + + formats.extend(merge_dicts(f, { + 'format_note': join_nonempty(',', meta_map(('quality', 'class'))), + 'language': meta.get('language'), + 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + }) for f in new_formats) def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( @@ -107,6 +119,7 @@ class ZDFBaseIE(InfoExtractor): 'type': f.get('type'), 'mimeType': f.get('mimeType'), 'quality': quality.get('quality'), + 'class': track.get('class'), 'language': track.get('language'), }) self._sort_formats(formats) @@ -171,6 +184,20 @@ class ZDFIE(ZDFBaseIE): 'duration': 2615, 'timestamp': 1465021200, 'upload_date': '20160604', + 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', + }, + }, { + 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', + 'md5': '1b93bdec7d02fc0b703c5e7687461628', + 'info_dict': { + 'ext': 'mp4', + 'id': 'video_funk_1770473', + 'duration': 1278, + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'title': 'Alles ist verzaubert', + 'timestamp': 1635520560, + 
'upload_date': '20211029', + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799', }, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche @@ -204,6 +231,19 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1641355200, 'upload_date': '20220105', }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -265,15 +305,16 @@ class ZDFIE(ZDFBaseIE): 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, video_id) - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -320,9 +361,9 @@ class ZDFChannelIE(ZDFBaseIE): 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', + 'title': 'das aktuelle sportstudio', }, - 'playlist_mincount': 23, + 'playlist_mincount': 18, }, { 'url': 
'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { @@ -330,6 +371,14 @@ class ZDFChannelIE(ZDFBaseIE): 'title': 'planet e.', }, 'playlist_mincount': 50, + }, { + 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', + 'info_dict': { + 'id': 'aktenzeichen-xy-ungeloest', + 'title': 'Aktenzeichen XY... ungelöst', + 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + }, + 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, @@ -339,60 +388,36 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + def _og_search_title(self, webpage, fatal=False): + title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal) + return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _real_extract(self, url): channel_id = self._match_id(url) webpage = self._download_webpage(url, channel_id) - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] + matches = re.finditer( + r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL, + webpage) - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) + if self._downloader.params.get('noplaylist', False): + entry = next( + (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), + None) + self.to_screen('Downloading just the main video because of --no-playlist') + if entry: + return entry + else: + self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, )) - r""" - player = self._extract_player(webpage, channel_id) + def check_video(m): + v_ref = self._search_regex( + 
r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ), + webpage, 'check id', default='') + v_ref = extract_attributes(v_ref) + return v_ref.get('data-target-video-type') != 'novideo' - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ + return self.playlist_from_matches( + (m.group('url') for m in matches if check_video(m)), + channel_id, self._og_search_title(webpage, fatal=False)) From 6e2626f092c63a5fa22a31df409610b5deaf3968 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 11 Oct 2022 05:58:10 +0100 Subject: [PATCH 1391/1705] [JSInterp] Improve separation logic Based on https://github.com/yt-dlp/yt-dlp/commit/0468a3b3253957bfbeb98b4a7c71542ff80e9e06 --- youtube_dl/jsinterp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 99dd98435..530a705b4 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -214,7 +214,7 @@ class JSInterpreter(object): def __init__(self, msg, *args, **kwargs): expr = kwargs.pop('expr', None) if expr is not None: - msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) + msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) @classmethod @@ -268,7 +268,7 @@ class JSInterpreter(object): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) + after_op = not in_quote and (char in cls.OP_CHARS or char == '[' or (char.isspace() and after_op)) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -301,7 +301,7 @@ class JSInterpreter(object): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) + raise cls.Exception('No terminating paren {delim} in {expr:.100}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @staticmethod From c94a459a248352fd97dccc79ed6604a558459bfd Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 11 Oct 2022 12:18:12 +0000 Subject: [PATCH 1392/1705] [utils] Sanitize look-alike Unicode glyphs in non-ID filename fields when --restrict-filenames Implements https://github.com/ytdl-org/youtube-dl/issues/31216#issuecomment-1236102822, which has a test. 
--- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fea38ed32..23a65a81c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -33,6 +33,7 @@ import sys import tempfile import time import traceback +import unicodedata import xml.etree.ElementTree import zlib @@ -2118,6 +2119,9 @@ def sanitize_filename(s, restricted=False, is_id=False): return '_' return char + # Replace look-alike Unicode glyphs + if restricted and not is_id: + s = unicodedata.normalize('NFKC', s) # Handle timestamps s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) result = ''.join(map(replace_insane, s)) From 11b284c81fe2988813c817918536fc3a5630870a Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 11 Oct 2022 12:36:44 +0000 Subject: [PATCH 1393/1705] [Common:JWPlayer] Fix x1000 scaling error See https://github.com/yt-dlp/yt-dlp/issues/5106#issuecomment-1264625161 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 797c35fd5..1f33a1e06 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2844,7 +2844,7 @@ class InfoExtractor(object): 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'ext': ext, } if source_url.startswith('rtmp'): From c91cbf60729af93c4677864aa6c8b74b576146ca Mon Sep 17 00:00:00 2001 From: Xie Yanbo <xieyanbo@gmail.com> Date: Tue, 11 Oct 2022 20:55:09 +0800 Subject: [PATCH 1394/1705] [netease] Get netease music download url through player api (#31235) * remove unplayable song from test * compatible with python 2 * using standard User_Agent, fix imports * use hash instead of long description * fix lint * fix hash --- test/test_aes.py | 9 +- youtube_dl/aes.py | 37 +++++++- 
youtube_dl/extractor/neteasemusic.py | 121 +++++++++++++++++++-------- 3 files changed, 128 insertions(+), 39 deletions(-) diff --git a/test/test_aes.py b/test/test_aes.py index cc89fb6ab..0f181466b 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text, aes_ecb_encrypt from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes import base64 @@ -58,6 +58,13 @@ class TestAES(unittest.TestCase): decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) + def test_ecb_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key)) + self.assertEqual( + encrypted, + b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index d0de2d93f..a94a41079 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -8,6 +8,18 @@ from .utils import bytes_to_intlist, intlist_to_bytes BLOCK_SIZE_BYTES = 16 +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padding data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + def aes_ctr_decrypt(data, key, counter): """ Decrypt with aes in counter mode @@ -76,8 +88,7 @@ def aes_cbc_encrypt(data, key, iv): previous_cipher_block = iv for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length + block = 
pkcs7_padding(block) mixed_block = xor(block, previous_cipher_block) encrypted_block = aes_encrypt(mixed_block, expanded_key) @@ -88,6 +99,28 @@ def aes_cbc_encrypt(data, key, iv): return encrypted_data +def aes_ecb_encrypt(data, key): + """ + Encrypt with aes in ECB mode. Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block = pkcs7_padding(block) + + encrypted_block = aes_encrypt(block, expanded_key) + encrypted_data += encrypted_block + + return encrypted_data + + def key_expansion(data): """ Generate key schedule diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 978a05841..fad22a2cd 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -1,20 +1,31 @@ # coding: utf-8 from __future__ import unicode_literals -from hashlib import md5 from base64 import b64encode +from binascii import hexlify from datetime import datetime +from hashlib import md5 +from random import randint +import json import re +import time from .common import InfoExtractor +from ..aes import aes_ecb_encrypt, pkcs7_padding from ..compat import ( compat_urllib_parse_urlencode, compat_str, compat_itertools_count, ) from ..utils import ( - sanitized_Request, + ExtractorError, + bytes_to_intlist, float_or_none, + int_or_none, + intlist_to_bytes, + sanitized_Request, + std_headers, + try_get, ) @@ -35,32 +46,85 @@ class NetEaseMusicBaseIE(InfoExtractor): result = b64encode(m.digest()).decode('ascii') return result.replace('/', '_').replace('+', '-') + @classmethod + def make_player_api_request_data_and_headers(cls, song_id, bitrate): + KEY = b'e82ckenh8dichen8' + URL = '/api/song/enhance/player/url' + now = 
int(time.time() * 1000) + rand = randint(0, 1000) + cookie = { + 'osver': None, + 'deviceId': None, + 'appver': '8.0.0', + 'versioncode': '140', + 'mobilename': None, + 'buildver': '1623435496', + 'resolution': '1920x1080', + '__csrf': '', + 'os': 'pc', + 'channel': None, + 'requestId': '{0}_{1:04}'.format(now, rand), + } + request_text = json.dumps( + {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, + separators=(',', ':')) + message = 'nobody{0}use{1}md5forencrypt'.format( + URL, request_text).encode('latin1') + msg_digest = md5(message).hexdigest() + + data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( + URL, request_text, msg_digest) + data = pkcs7_padding(bytes_to_intlist(data)) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) + encrypted_params = hexlify(encrypted).decode('ascii').upper() + + cookie = '; '.join( + ['{0}={1}'.format(k, v if v is not None else 'undefined') + for [k, v] in cookie.items()]) + + headers = { + 'User-Agent': std_headers['User-Agent'], + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': 'https://music.163.com', + 'Cookie': cookie, + } + return ('params={0}'.format(encrypted_params), headers) + + def _call_player_api(self, song_id, bitrate): + url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' + data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) + try: + return self._download_json( + url, song_id, data=data.encode('ascii'), headers=headers) + except ExtractorError as e: + if type(e.cause) in (ValueError, TypeError): + # JSON load failure + raise + except Exception: + pass + return {} + def extract_formats(self, info): formats = [] + song_id = info['id'] for song_format in self._FORMATS: details = info.get(song_format) if not details: continue - song_file_path = '/%s/%s.%s' % ( - self._encrypt(details['dfsId']), details['dfsId'], details['extension']) - # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse 
proxy-like feature - # from NetEase's CDN provider that can be used if m5.music.126.net does not - # work, especially for users outside of Mainland China - # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880 - for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net', - 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'): - song_url = host + song_file_path + bitrate = int_or_none(details.get('bitrate')) or 999000 + data = self._call_player_api(song_id, bitrate) + for song in try_get(data, lambda x: x['data'], list) or []: + song_url = try_get(song, lambda x: x['url']) if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, 'ext': details.get('extension'), - 'abr': float_or_none(details.get('bitrate'), scale=1000), + 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': details.get('size'), - 'asr': details.get('sr') + 'filesize': int_or_none(song.get('size')), + 'asr': int_or_none(details.get('sr')), }) - break return formats @classmethod @@ -79,30 +143,16 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', + 'md5': '3e909614ce09b1ccef4a3eb205441190', 'info_dict': { 'id': '32102397', 'ext': 'mp3', - 'title': 'Bad Blood (feat. 
Kendrick Lamar)', + 'title': 'Bad Blood', 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150517', - 'timestamp': 1431878400, - 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'No lyrics translation.', - 'url': 'http://music.163.com/#/song?id=29822014', - 'info_dict': { - 'id': '29822014', - 'ext': 'mp3', - 'title': '听见下雨的声音', - 'creator': '周杰伦', - 'upload_date': '20141225', - 'timestamp': 1419523200, - 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', - }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'No lyrics.', 'url': 'http://music.163.com/song?id=17241424', @@ -112,9 +162,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', + 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Has translated name.', 'url': 'http://music.163.com/#/song?id=22735043', @@ -128,7 +178,6 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': 1264608000, 'alt_title': '说出愿望吧(Genie)', }, - 'skip': 'Blocked outside Mainland China', }] def _process_lyrics(self, lyrics_info): From 7bbd5b13d4c6cfc3e24f56413ff1a1eace8472b8 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 12 Oct 2022 01:09:55 +0100 Subject: [PATCH 1395/1705] [Motherless] Pull from yt-dlp, etc * use username field * loosen regexes * warn on page count 0 in group * avoid reloading group page 1 Closes #29626 --- youtube_dl/extractor/motherless.py | 33 +++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 35d2b46ed..d352cb180 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ 
-126,9 +126,10 @@ class MotherlessIE(InfoExtractor): kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - comment_count = webpage.count('class="media-comment-contents"') + comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( - r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''', + (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage, default=None) @@ -171,6 +172,17 @@ class MotherlessGroupIE(InfoExtractor): 'any kind!' }, 'playlist_mincount': 0, + 'expected_warnings': [ + 'This group has no videos.', + ] + }, { + 'url': 'https://motherless.com/g/beautiful_cock', + 'info_dict': { + 'id': 'beautiful_cock', + 'title': 'Beautiful Cock', + 'description': 'Group for lovely cocks yours, mine, a friends anything human', + }, + 'playlist_mincount': 2500, }] @classmethod @@ -211,14 +223,21 @@ class MotherlessGroupIE(InfoExtractor): 'description', webpage, fatal=False) page_count = str_to_int(self._search_regex( r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', - webpage, 'page_count', default='1')) + webpage, 'page_count', default=0)) + if not page_count: + message = self._search_regex( + r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', + webpage, 'error_msg', default=None) or 'This group has no videos.' 
+ self.report_warning(message, group_id) + page_count = 1 PAGE_SIZE = 80 def _get_page(idx): - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) + if idx > 0: + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) for entry in self._extract_entries(webpage, url): yield entry From 7135277fec497bd7649c31087aba52daa7897484 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 13 Oct 2022 01:59:01 +0000 Subject: [PATCH 1396/1705] [ManyVids] Support new single-page app structure See https://github.com/yt-dlp/yt-dlp/issues/5210#issuecomment-1276919962. --- youtube_dl/extractor/manyvids.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index 6805102ba..608a02a8d 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -47,7 +47,12 @@ class ManyVidsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, ) + try: + webpage = self._download_webpage(real_url, video_id) + except: + # probably useless fallback + webpage = self._download_webpage(url, video_id) info = self._search_regex( r'''(<div\b[^>]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', @@ -98,7 +103,8 @@ class ManyVidsIE(InfoExtractor): # Sets some cookies self._download_webpage( 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, fatal=False, data=urlencode_postdata({ + video_id, note='Setting format cookies', fatal=False, + data=urlencode_postdata({ 'mvtoken': mv_token, 'vid': video_id, }), headers={ From ee8560d01eec511587f8207c3d84219ec620a9a6 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 13 Oct 
2022 02:42:49 +0000 Subject: [PATCH 1397/1705] [ManyVids] Support new single-page app structure --- youtube_dl/extractor/manyvids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index 608a02a8d..75978cfd6 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -50,7 +50,7 @@ class ManyVidsIE(InfoExtractor): real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, ) try: webpage = self._download_webpage(real_url, video_id) - except: + except Exception: # probably useless fallback webpage = self._download_webpage(url, video_id) From 447edc48e63f5f21797ea0d9ee84e37ed1547035 Mon Sep 17 00:00:00 2001 From: ache <ache@ache.one> Date: Tue, 18 Oct 2022 15:06:27 +0000 Subject: [PATCH 1398/1705] Fix ADN extractor (#31275) * Rename Anime Digital Network to Animation Digital Network, animationdigitalnetwork.fr * Update the test to an available video * Update the decoding key of subtitles * Keep the support of old URLs * Add a test to match the old URL * Reduce redundancy of the URL name * Fix md5 ^^" * Fix undefined _BASE * Process HTTP error text (eg geo-block) correctly and uniformly in Py3, Py2 * Skip test for CI since geo-blocked Signed-off-by: ache <ache@ache.one> Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/adn.py | 57 +++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index a55ebbcbd..5ff419f19 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -31,30 +31,34 @@ from ..utils import ( class ADNIE(InfoExtractor): - IE_DESC = 'Anime Digital Network' - _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': 
'0319c99885ff5547565cacb4f3f9348d', + IE_DESC = 'Animation Digital Network' + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { - 'id': '7778', + 'id': '9841', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', - 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', - 'series': 'Blue Exorcist - Kyôto Saga', - 'duration': 1467, - 'release_date': '20170106', + 'title': 'Fruits Basket - Episode 1', + 'description': 'md5:14be2f72c3c96809b0ca424b0097d336', + 'series': 'Fruits Basket', + 'duration': 1437, + 'release_date': '20190405', 'comment_count': int, 'average_rating': float, - 'season_number': 2, - 'episode': 'Début des hostilités', + 'season_number': 1, + 'episode': 'À ce soir !', 'episode_number': 1, - } - } + }, + 'skip': 'Only available in region (FR, ...)', + }, { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'only_matching': True, + }] - _NETRC_MACHINE = 'animedigitalnetwork' - _BASE_URL = 'http://animedigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = 'https://gw.api.' 
+ _BASE + '/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} _LOGIN_ERR_MESSAGE = 'Unable to log in' @@ -82,14 +86,14 @@ class ADNIE(InfoExtractor): if subtitle_location: enc_subtitles = self._download_webpage( subtitle_location, video_id, 'Downloading subtitles data', - fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) + fatal=False, headers={'Origin': 'https://' + self._BASE}) if not enc_subtitles: return None - # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), + bytes_to_intlist(binascii.unhexlify(self._K + '7fac1178830cfe0c')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -138,9 +142,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' if not username: return try: + url = self._API_BASE_URL + 'authentication/login' access_token = (self._download_json( - self._API_BASE_URL + 'authentication/login', None, - 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + url, None, 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -153,7 +157,8 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = None if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json( - e.cause.read().decode(), None, fatal=False) or {} + self._webpage_read_content(e.cause, url, username), + username, fatal=False) or {} message = resp.get('message') or resp.get('code') self.report_warning(message or self._LOGIN_ERR_MESSAGE) @@ -211,7 +216,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' # This usually goes away with a different random 
pkcs1pad, so retry continue - error = self._parse_json(e.cause.read(), video_id) + error = self._parse_json( + self._webpage_read_content(e.cause, links_url, video_id), + video_id, fatal=False) or {} message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) From 0faa45d6c08f518b73d20e341944ea7292f9f2b2 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 20 Oct 2022 11:06:44 +0000 Subject: [PATCH 1399/1705] [BongaCams] Support new .net domain Resolves #31262. --- youtube_dl/extractor/bongacams.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bongacams.py b/youtube_dl/extractor/bongacams.py index 180542fbc..016999d55 100644 --- a/youtube_dl/extractor/bongacams.py +++ b/youtube_dl/extractor/bongacams.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -12,13 +13,28 @@ from ..utils import ( class BongaCamsIE(InfoExtractor): - _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://de.bongacams.com/azumi-8', 'only_matching': True, }, { 'url': 'https://cn.bongacams.com/azumi-8', 'only_matching': True, + }, { + 'url': 'https://de.bongacams.net/claireashton', + 'info_dict': { + 'id': 'claireashton', + 'ext': 'mp4', + 'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'age_limit': 18, + 'uploader_id': 'ClaireAshton', + 'uploader': 'ClaireAshton', + 'like_count': int, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): From 502cefa41f1d24057b6158748b2072dc911af682 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Thu, 27 Oct 2022 14:33:00 +0000 Subject: [PATCH 1400/1705] [Vimeo] Update variable name in hydration JSON pattern Fixes #31311 --- 
youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a66912502..853b38402 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -663,7 +663,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if '//player.vimeo.com/video/' in url: config = self._parse_json(self._search_regex( - r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) From d25cf62086443d86a633b8176b5c7e79f4cc569e Mon Sep 17 00:00:00 2001 From: Xie Yanbo <xieyanbo@gmail.com> Date: Sun, 30 Oct 2022 19:46:46 +0800 Subject: [PATCH 1401/1705] [netease] Impove error handling (#31303) * add warnings for users outside of China * skip empty song urls Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/neteasemusic.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index fad22a2cd..2bbfc7858 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -20,6 +20,7 @@ from ..compat import ( from ..utils import ( ExtractorError, bytes_to_intlist, + error_to_compat_str, float_or_none, int_or_none, intlist_to_bytes, @@ -94,17 +95,23 @@ class NetEaseMusicBaseIE(InfoExtractor): url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) try: - return self._download_json( + msg = 'empty result' + result = self._download_json( url, song_id, data=data.encode('ascii'), headers=headers) + if result: + return result except ExtractorError as e: if type(e.cause) in (ValueError, TypeError): # JSON load failure raise - except Exception: - pass 
+ except Exception as e: + msg = error_to_compat_str(e) + self.report_warning('%s API call (%s) failed: %s' % ( + song_id, bitrate, msg)) return {} def extract_formats(self, info): + err = 0 formats = [] song_id = info['id'] for song_format in self._FORMATS: @@ -116,6 +123,8 @@ class NetEaseMusicBaseIE(InfoExtractor): data = self._call_player_api(song_id, bitrate) for song in try_get(data, lambda x: x['data'], list) or []: song_url = try_get(song, lambda x: x['url']) + if not song_url: + continue if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, @@ -125,6 +134,19 @@ class NetEaseMusicBaseIE(InfoExtractor): 'filesize': int_or_none(song.get('size')), 'asr': int_or_none(details.get('sr')), }) + elif err == 0: + err = try_get(song, lambda x: x['code'], int) + + if not formats: + msg = 'No media links found' + if err != 0 and (err < 200 or err >= 400): + raise ExtractorError( + '%s (site code %d)' % (msg, err, ), expected=True) + else: + self.raise_geo_restricted( + msg + ': probably this video is not available from your location due to geo restriction.', + countries=['CN']) + return formats @classmethod From ce5d36486ea95b8961c639d118bad262c8d7a067 Mon Sep 17 00:00:00 2001 From: Xie Yanbo <xieyanbo@gmail.com> Date: Sun, 30 Oct 2022 19:48:44 +0800 Subject: [PATCH 1402/1705] [netease] Support urls shared from mobile app (#31304) Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/neteasemusic.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 2bbfc7858..5e5c6271b 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -162,7 +162,7 @@ class NetEaseMusicBaseIE(InfoExtractor): class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' IE_DESC = '网易云音乐' - _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' + _VALID_URL = 
r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/song?id=32102397', 'md5': '3e909614ce09b1ccef4a3eb205441190', @@ -200,6 +200,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': 1264608000, 'alt_title': '说出愿望吧(Genie)', }, + }, { + 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', + 'md5': '95826c73ea50b1c288b22180ec9e754d', + 'info_dict': { + 'id': '95670', + 'ext': 'mp3', + 'title': '国际歌', + 'creator': '马备', + 'upload_date': '19911130', + 'timestamp': 691516800, + 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + }, }] def _process_lyrics(self, lyrics_info): From a19855f0f50fe7a6eb05a1d8fee554897e4dbdda Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 31 Oct 2022 21:18:36 +0000 Subject: [PATCH 1403/1705] [compat] Add Python 2 Unicode casefold using a trivial wrapper around icu/CaseFolding.txt --- youtube_dl/casefold.py | 1643 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1643 insertions(+) create mode 100644 youtube_dl/casefold.py diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py new file mode 100644 index 000000000..546269a3c --- /dev/null +++ b/youtube_dl/casefold.py @@ -0,0 +1,1643 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .compat import compat_str + +# CaseFolding-15.0.0.txt +# Date: 2022-02-02, 23:35:35 GMT +# © 2022 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ +# +# Case Folding Properties +# +# This file is a supplement to the UnicodeData file. +# It provides a case folding mapping generated from the Unicode Character Database. 
+# If all characters are mapped according to the full mapping below, then +# case differences (according to UnicodeData.txt and SpecialCasing.txt) +# are eliminated. +# +# The data supports both implementations that require simple case foldings +# (where string lengths don't change), and implementations that allow full case folding +# (where string lengths may grow). Note that where they can be supported, the +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# +# All code points not listed in this file map to themselves. +# +# NOTE: case folding does not preserve normalization formats! +# +# For information on case folding, including how to have case folding +# preserve normalization formats, see Section 3.13 Default Case Algorithms in +# The Unicode Standard. +# +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# <code>; <status>; <mapping>; # <name> +# +# The status field is: +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# Note that the Turkic mappings do not maintain canonical equivalence without additional processing. +# See the discussions of case mapping in the Unicode Standard for more information. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. 
+# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) +# +# ================================================================= + +# Property: Case_Folding + +# All code points not explicitly listed for Case_Folding +# have the value C for the status field, and the code point itself for the mapping field. + +# ================================================================= +_map_str = ''' +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 
00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH 
CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 
0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN 
CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; 
C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE +023B; C; 
023C; # LATIN CAPITAL LETTER C WITH STROKE +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE +0244; C; 0289; # LATIN CAPITAL LETTER U BAR +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE +0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0370; C; 0371; # GREEK CAPITAL LETTER HETA +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F; C; 03F3; # GREEK CAPITAL LETTER YOT +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL 
LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN +03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL +03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; 
C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL 
LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # 
CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # 
CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE +052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE +052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; 
# ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON +10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN +10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS +10AB; C; 2D0B; # GEORGIAN 
CAPITAL LETTER MAN +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN +10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # 
CYRILLIC SMALL LETTER UNBLENDED UK +1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN +1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN +1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN +1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON +1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN +1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN +1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN +1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN +1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN +1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN +1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS +1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN +1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR +1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON +1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR +1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR +1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE +1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN +1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR +1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN +1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR +1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR +1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN +1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR +1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN +1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN +1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN +1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL +1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL +1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR +1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN +1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN +1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE +1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE +1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE +1CB3; C; 10F3; # GEORGIAN MTAVRULI 
CAPITAL LETTER WE +1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR +1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE +1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI +1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN +1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI +1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN +1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN +1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN +1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 
1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW 
+1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH 
BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 1EF1; # LATIN CAPITAL 
LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; 
C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH 
PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND 
YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL 
LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER 
ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # 
GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2132; C; 214E; # TURNED CAPITAL F +2160; C; 2170; # ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN 
NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU +2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE +2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE 
+2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI +2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS +2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62; C; 026B; # LATIN CAPITAL 
LETTER L WITH MIDDLE TILDE +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO +2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA +2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8; C; 2CB9; 
# COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA +A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A 
+A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O +A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE +A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O +A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726; C; A727; # LATIN CAPITAL LETTER HENG +A728; C; A729; # LATIN CAPITAL LETTER TZ +A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO +A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732; C; A733; # LATIN CAPITAL LETTER AA +A734; C; A735; # LATIN CAPITAL LETTER AO +A736; C; A737; # LATIN CAPITAL LETTER AU +A738; C; A739; # LATIN CAPITAL LETTER AV +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C; C; A73D; # LATIN CAPITAL LETTER AY +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740; C; A741; # LATIN 
CAPITAL LETTER K WITH STROKE +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP +A74E; C; A74F; # LATIN CAPITAL LETTER OO +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760; C; A761; # LATIN CAPITAL LETTER VY +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768; C; A769; # LATIN CAPITAL LETTER VEND +A76A; C; A76B; # LATIN CAPITAL LETTER ET +A76C; C; A76D; # LATIN CAPITAL LETTER IS +A76E; C; A76F; # LATIN CAPITAL LETTER CON +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G +A780; C; A781; # LATIN CAPITAL LETTER TURNED L +A782; C; A783; # LATIN CAPITAL LETTER INSULAR R +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR +A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH +A798; C; A799; # LATIN CAPITAL 
LETTER F WITH STROKE +A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE +A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE +A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK +A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E +A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G +A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K +A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE +A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A +A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I +A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U +A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O +A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W +A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK +A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK +A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK +A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY +A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY +A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S +A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S +A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE 
SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE +AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU 
+ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # 
FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z +10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL 
LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H +10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG +10426; C; 1044E; # DESERET CAPITAL LETTER OI +10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # 
OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10570; C; 10597; # VITHKUQI CAPITAL LETTER A +10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE +10572; C; 10599; # VITHKUQI CAPITAL LETTER BE +10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE +10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE +10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE +10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE +10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI +10578; C; 1059F; # VITHKUQI CAPITAL LETTER E +10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE +1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA +1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA +1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA +1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I +1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE +10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE +10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA +10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA +10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA +10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME +10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE +10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE +10587; C; 105AE; # VITHKUQI CAPITAL LETTER O +10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE +10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA +1058A; C; 105B1; # VITHKUQI 
CAPITAL LETTER RE +1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE +1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE +1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE +1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE +10590; C; 105B7; # VITHKUQI CAPITAL LETTER U +10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE +10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE +10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y +10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA +10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL 
LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG 
CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT +118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY +16E58; C; 16E78; # 
MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +''' +_map = dict( + (unichr(int(from_, 16)), ''.join((unichr(int(v, 16)) for v in to_.split(' 
')))) + for from_, type_, to_, _ in ( + l.split('; ', 3) for l in _map_str.splitlines() if l) + if type_ in ('C', 'F')) +del _map_str + +def casefold(s): + assert isinstance(s, compat_str) + return ''.join((_map.get(c, c) for c in s)) + +__all__ = [ + casefold +] From f102e3dc4efe27e6c8697d6d117f05d1bb6d1b91 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Mon, 31 Oct 2022 21:27:14 +0000 Subject: [PATCH 1404/1705] [compat] Add compat_casefold and compat_re_Match, for traverse_obj() port --- youtube_dl/compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index eca6d63de..4b5e1f6ed 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2375,8 +2375,10 @@ except ImportError: try: compat_str = unicode # Python 2 + from .casefold import casefold as compat_casefold except NameError: compat_str = str + compat_casefold = lambda s: s.casefold() try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes @@ -3066,6 +3068,9 @@ except ImportError: # Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) 
compat_re_Pattern = type(re.compile('')) +# and on the type of a match +compat_re_Match = type(re.match('a', 'a')) + if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): @@ -3101,6 +3106,7 @@ __all__ = [ 'compat_Struct', 'compat_b64decode', 'compat_basestring', + 'compat_casefold', 'compat_chr', 'compat_collections_abc', 'compat_collections_chain_map', @@ -3132,6 +3138,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_re_Match', 'compat_re_Pattern', 'compat_realpath', 'compat_setenv', From b7c25959f0f76aad4ee24e254f82e6c5cca2c1ff Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 1 Nov 2022 12:40:23 +0000 Subject: [PATCH 1405/1705] [compat] Unify unicode/str compat and move up --- youtube_dl/compat.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4b5e1f6ed..28942a8c1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,19 @@ import subprocess import sys import xml.etree.ElementTree +# deal with critical unicode/str things first +try: + # Python 2 + compat_str, compat_basestring, compat_chr = ( + unicode, basestring, unichr + ) + from .casefold import casefold as compat_casefold +except NameError: + compat_str, compat_basestring, compat_chr = ( + str, str, chr + ) + compat_casefold = lambda s: s.casefold() + try: import collections.abc as compat_collections_abc except ImportError: @@ -2373,13 +2386,6 @@ try: except ImportError: import BaseHTTPServer as compat_http_server -try: - compat_str = unicode # Python 2 - from .casefold import casefold as compat_casefold -except NameError: - compat_str = str - compat_casefold = lambda s: s.casefold() - try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -2510,22 +2516,11 @@ except ImportError: # Python < 3.4 return 
compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) -try: - compat_basestring = basestring # Python 2 -except NameError: - compat_basestring = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - try: from xml.etree.ElementTree import ParseError as compat_xml_parse_error except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error - etree = xml.etree.ElementTree From a874871801b8b05d06e8ffe52bed94fdfc26611e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 1 Nov 2022 19:25:59 +0000 Subject: [PATCH 1406/1705] [compat] Reformat casefold.py for easier updating --- youtube_dl/casefold.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py index 546269a3c..7e91c3811 100644 --- a/youtube_dl/casefold.py +++ b/youtube_dl/casefold.py @@ -1,8 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -from .compat import compat_str +from .compat import ( + compat_str, + compat_chr, +) +# Below is included the text of icu/CaseFolding.txt retrieved from +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt +# In case newly foldable Unicode characters are defined, paste the new version +# of the text inside the ''' marks. +# The text is expected to have only blank lines andlines with 1st character #, +# all ignored, and fold definitions like this: +# `from_hex_code; space_separated_to_hex_code_list; comment` + +_map_str = ''' # CaseFolding-15.0.0.txt # Date: 2022-02-02, 23:35:35 GMT # © 2022 Unicode®, Inc. @@ -65,7 +77,6 @@ from .compat import compat_str # have the value C for the status field, and the code point itself for the mapping field. 
# ================================================================= -_map_str = ''' 0041; C; 0061; # LATIN CAPITAL LETTER A 0042; C; 0062; # LATIN CAPITAL LETTER B 0043; C; 0063; # LATIN CAPITAL LETTER C @@ -1627,17 +1638,22 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO 1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA ''' + +_parse_unichr = lambda s: compat_chr(int(s, 16)) + _map = dict( - (unichr(int(from_, 16)), ''.join((unichr(int(v, 16)) for v in to_.split(' ')))) + (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' ')))) for from_, type_, to_, _ in ( - l.split('; ', 3) for l in _map_str.splitlines() if l) + l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#') if type_ in ('C', 'F')) del _map_str + def casefold(s): assert isinstance(s, compat_str) return ''.join((_map.get(c, c) for c in s)) + __all__ = [ casefold ] From 65ccb0dd4eb52cced7d0e11af021c09dbe2aed4a Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 1 Nov 2022 21:33:39 +0000 Subject: [PATCH 1407/1705] [compat] Add test for compat_casefold() --- test/test_compat.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index 86ff389fd..05995372a 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -118,9 +118,21 @@ class TestCompat(unittest.TestCase): <smil xmlns="http://www.w3.org/2001/SMIL20/Language"></smil>''' compat_etree_fromstring(xml) - def test_struct_unpack(self): + def test_compat_struct_unpack(self): self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,)) + def test_compat_casefold(self): + if hasattr(compat_str, 'casefold'): + # don't bother to test str.casefold() (again) + return + # thanks https://bugs.python.org/file24232/casefolding.patch + self.assertEqual(compat_casefold('hello'), 'hello') + self.assertEqual(compat_casefold('hELlo'), 'hello') + self.assertEqual(compat_casefold('ß'), 'ss') + 
self.assertEqual(compat_casefold('fi'), 'fi') + self.assertEqual(compat_casefold('\u03a3'), '\u03c3') + self.assertEqual(compat_casefold('A\u0345\u03a3'), 'a\u03b9\u03c3') + if __name__ == '__main__': unittest.main() From 087ddc237132103859cc00183d8d70bd75c0e44e Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 1 Nov 2022 22:47:02 +0000 Subject: [PATCH 1408/1705] [compat] Add test for compat_casefold() --- test/test_compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_compat.py b/test/test_compat.py index 05995372a..0986cff37 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( + compat_casefold, compat_getenv, compat_setenv, compat_etree_Element, From c4b19a88169fa76c5eb665d274e7270a0fe452c4 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Wed, 2 Nov 2022 11:56:26 +0000 Subject: [PATCH 1409/1705] [compat] Work around in case folding for narrow Python build Resolves #31324. 
--- youtube_dl/casefold.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py index 7e91c3811..748c2d491 100644 --- a/youtube_dl/casefold.py +++ b/youtube_dl/casefold.py @@ -1639,7 +1639,15 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA ''' -_parse_unichr = lambda s: compat_chr(int(s, 16)) + +def _parse_unichr(s): + s = int(s, 16) + try: + return compat_chr(s) + except ValueError: + # work around "unichr() arg not in range(0x10000) (narrow Python build)" + return ('\\U%08x' % s).decode('unicode-escape') + _map = dict( (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' ')))) From 27ed77aabba8c9eb08d66f34092b1bfcc22c482e Mon Sep 17 00:00:00 2001 From: Andrei Lebedev <lebdron@gmail.com> Date: Thu, 3 Nov 2022 11:09:37 +0100 Subject: [PATCH 1410/1705] [utils] Backport traverse_obj (etc) from yt-dlp (#31156) * Backport traverse_obj and closely related function from yt-dlp (code by pukkandan) * Backport LazyList, variadic(), try_call (code by pukkandan) * Recast using yt-dlp's newer traverse_obj() implementation and tests (code by grub4k) * Add tests for Unicode case folding support matching Py3.5+ (requires f102e3d) * Improve/add tests for variadic, try_call, join_nonempty Co-authored-by: dirkf <fieldhouse@gmx.net> --- test/test_utils.py | 323 +++++++++++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 339 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 662 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index f1a748dde..9d364c863 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -12,7 +12,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests import io +import itertools import json +import re import xml.etree.ElementTree from youtube_dl.utils import ( @@ -40,11 +42,14 @@ from youtube_dl.utils import ( get_element_by_attribute, 
get_elements_by_class, get_elements_by_attribute, + get_first, InAdvancePagedList, int_or_none, intlist_to_bytes, is_html, + join_nonempty, js_to_json, + LazyList, limit_length, merge_dicts, mimetype2ext, @@ -79,6 +84,8 @@ from youtube_dl.utils import ( strip_or_none, subtitles_filename, timeconvert, + traverse_obj, + try_call, unescapeHTML, unified_strdate, unified_timestamp, @@ -92,6 +99,7 @@ from youtube_dl.utils import ( urlencode_postdata, urshift, update_url_query, + variadic, version_tuple, xpath_with_ns, xpath_element, @@ -112,12 +120,18 @@ from youtube_dl.compat import ( compat_getenv, compat_os_name, compat_setenv, + compat_str, compat_urlparse, compat_parse_qs, ) class TestUtil(unittest.TestCase): + + # yt-dlp shim + def assertCountEqual(self, expected, got, msg='count should be the same'): + return self.assertEqual(len(tuple(expected)), len(tuple(got)), msg=msg) + def test_timeconvert(self): self.assertTrue(timeconvert('') is None) self.assertTrue(timeconvert('bougrg') is None) @@ -1478,6 +1492,315 @@ Line 1 self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + def test_LazyList(self): + it = list(range(10)) + + self.assertEqual(list(LazyList(it)), it) + self.assertEqual(LazyList(it).exhaust(), it) + self.assertEqual(LazyList(it)[5], it[5]) + + self.assertEqual(LazyList(it)[5:], it[5:]) + self.assertEqual(LazyList(it)[:5], it[:5]) + self.assertEqual(LazyList(it)[::2], it[::2]) + self.assertEqual(LazyList(it)[1::2], it[1::2]) + 
self.assertEqual(LazyList(it)[5::-1], it[5::-1]) + self.assertEqual(LazyList(it)[6:2:-2], it[6:2:-2]) + self.assertEqual(LazyList(it)[::-1], it[::-1]) + + self.assertTrue(LazyList(it)) + self.assertFalse(LazyList(range(0))) + self.assertEqual(len(LazyList(it)), len(it)) + self.assertEqual(repr(LazyList(it)), repr(it)) + self.assertEqual(compat_str(LazyList(it)), compat_str(it)) + + self.assertEqual(list(LazyList(it, reverse=True)), it[::-1]) + self.assertEqual(list(reversed(LazyList(it))[::-1]), it) + self.assertEqual(list(reversed(LazyList(it))[1:3:7]), it[::-1][1:3:7]) + + def test_LazyList_laziness(self): + + def test(ll, idx, val, cache): + self.assertEqual(ll[idx], val) + self.assertEqual(ll._cache, list(cache)) + + ll = LazyList(range(10)) + test(ll, 0, 0, range(1)) + test(ll, 5, 5, range(6)) + test(ll, -3, 7, range(10)) + + ll = LazyList(range(10), reverse=True) + test(ll, -1, 0, range(1)) + test(ll, 3, 6, range(10)) + + ll = LazyList(itertools.count()) + test(ll, 10, 10, range(11)) + ll = reversed(ll) + test(ll, -15, 14, range(15)) + + def test_try_call(self): + def total(*x, **kwargs): + return sum(x) + sum(kwargs.values()) + + self.assertEqual(try_call(None), None, + msg='not a fn should give None') + self.assertEqual(try_call(lambda: 1), 1, + msg='int fn with no expected_type should give int') + self.assertEqual(try_call(lambda: 1, expected_type=int), 1, + msg='int fn with expected_type int should give int') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with wrong expected_type should give None') + self.assertEqual(try_call(total, args=(0, 1, 0, ), expected_type=int), 1, + msg='fn should accept arglist') + self.assertEqual(try_call(total, kwargs={'a': 0, 'b': 1, 'c': 0}, expected_type=int), 1, + msg='fn should accept kwargs') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with no expected_type should give None') + self.assertEqual(try_call(lambda x: {}, total, args=(42, ), 
expected_type=int), 42, + msg='expect first int result with expected_type int') + + def test_variadic(self): + self.assertEqual(variadic(None), (None, )) + self.assertEqual(variadic('spam'), ('spam', )) + self.assertEqual(variadic('spam', allowed_types=dict), 'spam') + + def test_traverse_obj(self): + _TEST_DATA = { + 100: 100, + 1.2: 1.2, + 'str': 'str', + 'None': None, + '...': Ellipsis, + 'urls': [ + {'index': 0, 'url': 'https://www.example.com/0'}, + {'index': 1, 'url': 'https://www.example.com/1'}, + ], + 'data': ( + {'index': 2}, + {'index': 3}, + ), + 'dict': {}, + } + + # Test base functionality + self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str', + msg='allow tuple path') + self.assertEqual(traverse_obj(_TEST_DATA, ['str']), 'str', + msg='allow list path') + self.assertEqual(traverse_obj(_TEST_DATA, (value for value in ("str",))), 'str', + msg='allow iterable path') + self.assertEqual(traverse_obj(_TEST_DATA, 'str'), 'str', + msg='single items should be treated as a path') + self.assertEqual(traverse_obj(_TEST_DATA, None), _TEST_DATA) + self.assertEqual(traverse_obj(_TEST_DATA, 100), 100) + self.assertEqual(traverse_obj(_TEST_DATA, 1.2), 1.2) + + # Test Ellipsis behavior + self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis), + (item for item in _TEST_DATA.values() if item is not None), + msg='`...` should give all values except `None`') + self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(), + msg='`...` selection for dicts should select all values') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='nested `...` queries should work') + self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4), + msg='`...` query result should be flattened') + + # Test function as key + self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), + 
[_TEST_DATA['urls']], + msg='function as query key should perform a filter based on (key, value)') + self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), {'str'}, + msg='exceptions in the query function should be caught') + + # Test alternative paths + self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', + msg='multiple `paths` should be treated as alternative paths') + self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str', + msg='alternatives should exit early') + self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None, + msg='alternatives should return `default` if exhausted') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, 'fail'), 100), 100, + msg='alternatives should track their own branching return') + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', Ellipsis), ('data', Ellipsis)), list(_TEST_DATA['data']), + msg='alternatives on empty objects should search further') + + # Test branch and path nesting + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'], + msg='tuple as key should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')), ['https://www.example.com/0'], + msg='list as key should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))), ['https://www.example.com/0'], + msg='double nesting in path should be treated as paths') + self.assertEqual(traverse_obj(['0', [1, 2]], [(0, 1), 0]), [1], + msg='do not fail early on branching') + self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', ((1, ('fail', 'url')), (0, 'url')))), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='triple nesting in path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ('fail', (Ellipsis, 'url')))), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='ellipsis as branch path start gets flattened') + 
+ # Test dictionary as key + self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}), {0: 100, 1: 1.2}, + msg='dict key should result in a dict with the same keys') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}), + {0: 'https://www.example.com/0'}, + msg='dict key should allow paths') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}), + {0: ['https://www.example.com/0']}, + msg='tuple in dict path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}), + {0: ['https://www.example.com/0']}, + msg='double nesting in dict path should be treated as paths') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}), + {0: ['https://www.example.com/1', 'https://www.example.com/0']}, + msg='triple nesting in dict path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, + msg='remove `None` values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=Ellipsis), {0: Ellipsis}, + msg='do not remove `None` values if `default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}}, + msg='do not remove empty values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: {}}, + msg='do not remove empty values when dict key and a default') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {0: []}, + msg='if branch in dict key not successful, return `[]`') + + # Testing default parameter behavior + _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail'), None, + msg='default value should be `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=Ellipsis), Ellipsis, + msg='chained fails should result in default') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', 'int'), 0, + msg='should not short cirquit on 
`None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', default=1), 1, + msg='invalid dict key should result in `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', default=1), 1, + msg='`None` is a deliberate sentinel and should become `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None, + msg='`IndexError` should result in `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail'), default=1), 1, + msg='if branched but not successful return `default` if defined, not `[]`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail'), default=None), None, + msg='if branched but not successful return `default` even if `default` is `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail')), [], + msg='if branched but not successful return `[]`, not `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', Ellipsis)), [], + msg='if branched but object is empty return `[]`, not `default`') + + # Testing expected_type behavior + _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=compat_str), 'str', + msg='accept matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None, + msg='reject non matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: compat_str(x)), '0', + msg='transform type using type function') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', + expected_type=lambda _: 1 / 0), None, + msg='wrap expected_type function in try_call') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=compat_str), ['str'], + msg='eliminate items that expected_type fails on') + + # Test get_all behavior + _GET_ALL_DATA = {'key': [0, 1, 2]} + self.assertEqual(traverse_obj(_GET_ALL_DATA, ('key', Ellipsis), get_all=False), 0, + msg='if not `get_all`, return only 
first matching value') + self.assertEqual(traverse_obj(_GET_ALL_DATA, Ellipsis, get_all=False), [0, 1, 2], + msg='do not overflatten if not `get_all`') + + # Test casesense behavior + _CASESENSE_DATA = { + 'KeY': 'value0', + 0: { + 'KeY': 'value1', + 0: {'KeY': 'value2'}, + }, + # FULLWIDTH LATIN CAPITAL LETTER K + '\uff2bey': 'value3', + } + self.assertEqual(traverse_obj(_CASESENSE_DATA, 'key'), None, + msg='dict keys should be case sensitive unless `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, 'keY', + casesense=False), 'value0', + msg='allow non matching key case if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, '\uff4bey', # FULLWIDTH LATIN SMALL LETTER K + casesense=False), 'value3', + msg='allow non matching Unicode key case if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ('keY',)), + casesense=False), ['value1'], + msg='allow non matching key case in branch if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ((0, 'keY'),)), + casesense=False), ['value2'], + msg='allow non matching key case in branch path if `casesense`') + + # Test traverse_string behavior + _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)), None, + msg='do not traverse into string if not `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), + _traverse_string=True), 's', + msg='traverse into string if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), + _traverse_string=True), '.', + msg='traverse into converted data if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', Ellipsis), + _traverse_string=True), list('str'), + msg='`...` branching into string should result in list') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), + _traverse_string=True), ['s', 'r'], + msg='branching into string should result in list') + 
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x), + _traverse_string=True), list('str'), + msg='function branching into string should result in list') + + # Test is_user_input behavior + _IS_USER_INPUT_DATA = {'range8': list(range(8))} + self.assertEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3'), + _is_user_input=True), 3, + msg='allow for string indexing if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3:'), + _is_user_input=True), tuple(range(8))[3:], + msg='allow for string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':4:2'), + _is_user_input=True), tuple(range(8))[:4:2], + msg='allow step in string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':'), + _is_user_input=True), range(8), + msg='`:` should be treated as `...` if `is_user_input`') + with self.assertRaises(TypeError, msg='too many params should result in error'): + traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), _is_user_input=True) + + # Test re.Match as input obj + mobj = re.match(r'^0(12)(?P<group>3)(4)?$', '0123') + self.assertEqual(traverse_obj(mobj, Ellipsis), [x for x in mobj.groups() if x is not None], + msg='`...` on a `re.Match` should give its `groups()`') + self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 2)), ['0123', '3'], + msg='function on a `re.Match` should give groupno, value starting at 0') + self.assertEqual(traverse_obj(mobj, 'group'), '3', + msg='str key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 2), '3', + msg='int key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 'gRoUp', casesense=False), '3', + msg='str key on a `re.Match` should respect casesense') + self.assertEqual(traverse_obj(mobj, 'fail'), None, + msg='failing str key on a `re.Match` should return `default`') + 
self.assertEqual(traverse_obj(mobj, 'gRoUpS', casesense=False), None, + msg='failing str key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, 8), None, + msg='failing int key on a `re.Match` should return `default`') + + def test_get_first(self): + self.assertEqual(get_first([{'a': None}, {'a': 'spam'}], 'a'), 'spam') + + def test_join_nonempty(self): + self.assertEqual(join_nonempty('a', 'b'), 'a-b') + self.assertEqual(join_nonempty( + 'a', 'b', 'c', 'd', + from_dict={'a': 'c', 'c': [], 'b': 'd', 'd': None}), 'c-d') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 23a65a81c..e3c3ccff9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -43,6 +43,7 @@ from .compat import ( compat_HTTPError, compat_basestring, compat_chr, + compat_collections_abc, compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, @@ -1685,6 +1686,7 @@ USER_AGENTS = { NO_DEFAULT = object() +IDENTITY = lambda x: x ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', @@ -3867,6 +3869,105 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): return unrecognized +class LazyList(compat_collections_abc.Sequence): + """Lazy immutable list from an iterable + Note that slices of a LazyList are lists and not LazyList""" + + class IndexError(IndexError): + def __init__(self, cause=None): + if cause: + # reproduce `raise from` + self.__cause__ = cause + super(IndexError, self).__init__() + + def __init__(self, iterable, **kwargs): + # kwarg-only + reverse = kwargs.get('reverse', False) + _cache = kwargs.get('_cache') + + self._iterable = iter(iterable) + self._cache = [] if _cache is None else _cache + self._reversed = reverse + + def __iter__(self): + if self._reversed: + # We need to consume the entire iterable to iterate in reverse + for item in self.exhaust(): + yield item + return + for item in self._cache: + yield item + for item in 
self._iterable: + self._cache.append(item) + yield item + + def _exhaust(self): + self._cache.extend(self._iterable) + self._iterable = [] # Discard the emptied iterable to make it pickle-able + return self._cache + + def exhaust(self): + """Evaluate the entire iterable""" + return self._exhaust()[::-1 if self._reversed else 1] + + @staticmethod + def _reverse_index(x): + return None if x is None else ~x + + def __getitem__(self, idx): + if isinstance(idx, slice): + if self._reversed: + idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1)) + start, stop, step = idx.start, idx.stop, idx.step or 1 + elif isinstance(idx, int): + if self._reversed: + idx = self._reverse_index(idx) + start, stop, step = idx, idx, 0 + else: + raise TypeError('indices must be integers or slices') + if ((start or 0) < 0 or (stop or 0) < 0 + or (start is None and step < 0) + or (stop is None and step > 0)): + # We need to consume the entire iterable to be able to slice from the end + # Obviously, never use this with infinite iterables + self._exhaust() + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) + n = max(start or 0, stop or 0) - len(self._cache) + 1 + if n > 0: + self._cache.extend(itertools.islice(self._iterable, n)) + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) + + def __bool__(self): + try: + self[-1] if self._reversed else self[0] + except self.IndexError: + return False + return True + + def __len__(self): + self._exhaust() + return len(self._cache) + + def __reversed__(self): + return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache) + + def __copy__(self): + return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache) + + def __repr__(self): + # repr and str should mimic a list. 
So we exhaust the iterable + return repr(self.exhaust()) + + def __str__(self): + return repr(self.exhaust()) + + class PagedList(object): def __len__(self): # This is only useful for tests @@ -4092,6 +4193,10 @@ def multipart_encode(data, boundary=None): return out, content_type +def variadic(x, allowed_types=(compat_str, bytes, dict)): + return x if isinstance(x, compat_collections_abc.Iterable) and not isinstance(x, allowed_types) else (x,) + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: @@ -4102,6 +4207,23 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True): return d.get(key_or_keys, default) +def try_call(*funcs, **kwargs): + + # parameter defaults + expected_type = kwargs.get('expected_type') + fargs = kwargs.get('args', []) + fkwargs = kwargs.get('kwargs', {}) + + for f in funcs: + try: + val = f(*fargs, **fkwargs) + except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError): + pass + else: + if expected_type is None or isinstance(val, expected_type): + return val + + def try_get(src, getter, expected_type=None): if not isinstance(getter, (list, tuple)): getter = [getter] @@ -5835,3 +5957,220 @@ def clean_podcast_url(url): st\.fm # https://podsights.com/docs/ )/e )/''', '', url) + + +def traverse_obj(obj, *paths, **kwargs): + """ + Safely traverse nested `dict`s and `Sequence`s + + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" + + Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. + A value of None is treated as the absence of a value. + + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. 
+ + The keys in the path can be one of: + - `None`: Return the current object. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Sequence`s, `key` is the index of the value. + - `dict`: Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + Keyword arguments: + @param default Value to return if the paths do not match. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param _is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param _traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + A list is always returned if the last path branches and no `default` is given. 
+ """ + + # parameter defaults + default = kwargs.get('default', NO_DEFAULT) + expected_type = kwargs.get('expected_type') + get_all = kwargs.get('get_all', True) + casesense = kwargs.get('casesense', True) + _is_user_input = kwargs.get('_is_user_input', False) + _traverse_string = kwargs.get('_traverse_string', False) + + # instant compat + str = compat_str + + is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) + # stand-in until compat_re_Match is added + compat_re_Match = type(re.match('a', 'a')) + # stand-in until casefold.py is added + try: + ''.casefold() + compat_casefold = lambda s: s.casefold() + except AttributeError: + compat_casefold = lambda s: s.lower() + casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k + + if isinstance(expected_type, type): + type_test = lambda val: val if isinstance(val, expected_type) else None + else: + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def from_iterable(iterables): + # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F + for it in iterables: + for item in it: + yield item + + def apply_key(key, obj): + if obj is None: + return + + elif key is None: + yield obj + + elif isinstance(key, (list, tuple)): + for branch in key: + _, result = apply_path(obj, branch) + for item in result: + yield item + + elif key is Ellipsis: + result = [] + if isinstance(obj, compat_collections_abc.Mapping): + result = obj.values() + elif is_sequence(obj): + result = obj + elif isinstance(obj, compat_re_Match): + result = obj.groups() + elif _traverse_string: + result = str(obj) + for item in result: + yield item + + elif callable(key): + if is_sequence(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, compat_collections_abc.Mapping): + iter_obj = obj.items() + elif isinstance(obj, compat_re_Match): + iter_obj = enumerate(itertools.chain([obj.group()], obj.groups())) + elif _traverse_string: + iter_obj = enumerate(str(obj)) + 
else: + return + for item in (v for k, v in iter_obj if try_call(key, args=(k, v))): + yield item + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) + yield dict((k, v if v is not None else default) for k, v in iter_obj + if v is not None or default is not NO_DEFAULT) + + elif isinstance(obj, compat_collections_abc.Mapping): + yield (obj.get(key) if casesense or (key in obj) + else next((v for k, v in obj.items() if casefold(k) == key), None)) + + elif isinstance(obj, compat_re_Match): + if isinstance(key, int) or casesense: + try: + yield obj.group(key) + return + except IndexError: + pass + if not isinstance(key, str): + return + + yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + + else: + if _is_user_input: + key = (int_or_none(key) if ':' not in key + else slice(*map(int_or_none, key.split(':')))) + + if not isinstance(key, (int, slice)): + return + + if not is_sequence(obj): + if not _traverse_string: + return + obj = str(obj) + + try: + yield obj[key] + except IndexError: + pass + + def apply_path(start_obj, path): + objs = (start_obj,) + has_branched = False + + for key in variadic(path): + if _is_user_input and key == ':': + key = Ellipsis + + if not casesense and isinstance(key, str): + key = compat_casefold(key) + + if key is Ellipsis or isinstance(key, (list, tuple)) or callable(key): + has_branched = True + + key_func = functools.partial(apply_key, key) + objs = from_iterable(map(key_func, objs)) + + return has_branched, objs + + def _traverse_obj(obj, path, use_list=True): + has_branched, results = apply_path(obj, path) + results = LazyList(x for x in map(type_test, results) if x is not None) + + if get_all and has_branched: + return results.exhaust() if results or use_list else None + + return results[0] if results else None + + for index, path in enumerate(paths, 1): + use_list = default is NO_DEFAULT and index == len(paths) + result = _traverse_obj(obj, path, use_list) + 
if result is not None: + return result + + return None if default is NO_DEFAULT else default + + +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (Ellipsis,) + tuple(variadic(keys)), get_all=False, **kwargs) + + +def join_nonempty(*values, **kwargs): + + # parameter defaults + delim = kwargs.get('delim', '-') + from_dict = kwargs.get('from_dict') + + if from_dict is not None: + values = (traverse_obj(from_dict, variadic(v)) for v in values) + return delim.join(map(compat_str, filter(None, values))) From de39d1281cea499cb1adfce5ff7e0a56f1bad5fe Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 4 Nov 2022 10:13:07 +0000 Subject: [PATCH 1411/1705] [extractor/ceskatelevize] Back-port extractor from yt-dlp, etc (#30713) * back-port extractor, removing CeskaTelevizePoradyIE * follow redirect URL * support liveBroadcast and videobonusDetail in __NEXT__ data * return single video for singleton playlist * fix/add tests --- youtube_dl/extractor/ceskatelevize.py | 178 ++++++++++++++------------ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 96 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 7cb4efb74..fe677d8e8 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -12,35 +12,21 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + str_or_none, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život 
v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'title': 'Bonus 01 - En - Hyde Park Civilizace', 'description': 'English Subtittles', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 81.3, @@ -51,31 +37,111 @@ class CeskaTelevizeIE(InfoExtractor): }, }, { # live stream - 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'url': 'http://www.ceskatelevize.cz/zive/ct1/', 'info_dict': { - 'id': 402, + 'id': '102', 'ext': 'mp4', - 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r'ČT1 - živé vysílání online', + 'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Georestricted to Czech Republic', + }, { + # another + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'only_matching': True, + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + # 'skip': 'Georestricted to Czech Republic', }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Bogotart - Queer', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. 
Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): playlist_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') - webpage = self._download_webpage(url, playlist_id) + type_ = 'IDEC' + if re.search(r'(^/porady|/zive)/', parsed_url.path): + next_data = self._search_nextjs_data(webpage, playlist_id) + if '/zive/' in parsed_url.path: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) + else: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + idec = traverse_obj(next_data, ('props', 'pageProps', 
'data', 'videobonusDetail', 'bonusId'), get_all=False) + if idec: + type_ = 'bonus' + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage( + 'https://www.ceskatelevize.cz/v-api/iframe-hash/', + playlist_id, note='Getting IFRAME hash') + query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } + webpage = self._download_webpage( + 'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', + playlist_id, note='Downloading player', query=query) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + self.raise_geo_restricted(NOT_AVAILABLE_STRING) + if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): + raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) type_ = None episode_id = None @@ -100,7 +166,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +174,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +196,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if 
not playlist: continue @@ -167,7 +230,7 @@ class CeskaTelevizeIE(InfoExtractor): entries[num]['formats'].extend(formats) continue - item_id = item.get('id') or item['assetId'] + item_id = str_or_none(item.get('id') or item['assetId']) title = item['title'] duration = float_or_none(item.get('duration')) @@ -181,8 +244,6 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -200,6 +261,8 @@ class CeskaTelevizeIE(InfoExtractor): for e in entries: self._sort_formats(e['formats']) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) def _get_subtitles(self, episode_id, subs): @@ -236,54 +299,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = 
self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 751fc38b6..e36f86be4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -208,10 +208,7 @@ from .ccc import ( from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE From 47e70fff8ba3de769a31fab0b3572162094733f7 Mon Sep 17 00:00:00 2001 From: Moises Lima <mozlima@users.noreply.github.com> Date: Wed, 9 Nov 2022 17:26:30 -0300 Subject: [PATCH 1412/1705] [PeekVids, PlayVids] Add new extractor (#29765) * Merge back-port from yt-dlp * Merge features from PR #29798 * Improve metadata extraction Co-authored-by: dirkf <fieldhouse@gmx.net> Co-authored by: AXDOOMER --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/peekvids.py | 193 +++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 youtube_dl/extractor/peekvids.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e36f86be4..4d9f37424 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -909,6 +909,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peekvids import ( + PeekVidsIE, + PlayVidsIE, +) from 
.peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE diff --git a/youtube_dl/extractor/peekvids.py b/youtube_dl/extractor/peekvids.py new file mode 100644 index 000000000..c8aad564b --- /dev/null +++ b/youtube_dl/extractor/peekvids.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) + + +class PeekVidsIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?peekvids\.com/ + (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) + (?P<id>[^/?&#]*) + ''' + _TESTS = [{ + 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', + 'info_dict': { + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1642579329, + 'upload_date': '20220119', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }] + _DOMAIN = 'www.peekvids.com' + + def _get_detail(self, html): + return get_element_by_class('detail-video-block', html) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + '[%s] %s: %s' % (self.IE_NAME, video_id, 'You are suspected as a bot. 
Wait, or pass the captcha test on the site and provide --cookies.'), + expected=True) + + title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + 'https://%s/v-alt/%s' % (self._DOMAIN, video_id), video_id, + note='Downloading list of source files') + formats = [{ + 'url': f_url, + 'format_id': f_id, + 'height': int_or_none(f_id), + } for f_url, f_id in ( + (url_or_none(f_v), f_match.group(1)) + for f_v, f_match in ( + (v, re.match(r'^data-src(\d{3,})$', k)) + for k, v in srcs.items() if v) if f_match) + if f_url + ] + if not formats: + formats = [{'url': url} for url in srcs.values()] + self._sort_formats(formats) + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = self._get_detail(webpage) or '' + info['description'] = self._html_search_regex( + r'(?s)(.+?)(?:%s\s*<|<ul\b)' % (re.escape(info.get('description', '')), ), + detail, 'description', default=None) or None + info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url) + + def cat_tags(name, html): + l = self._html_search_regex( + r'(?s)<span\b[^>]*>\s*%s\s*:\s*</span>(.+?)</li>' % (re.escape(name), ), + html, name, default='') + return [x for x in re.split(r'\s+', l) if x] + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PlayVidsIE(PeekVidsIE): + _VALID_URL = 
r'https?://(?:www\.)?playvids\.com/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)' + _TESTS = [{ + 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'md5': '2f12e50213dd65f142175da633c4564c', + 'info_dict': { + 'id': '1978030', + 'display_id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1640435839, + 'upload_date': '20211225', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', + 'md5': 'e783986e596cafbf46411a174ab42ba6', + 'info_dict': { + 'id': '762385', + 'display_id': 'bKmGLe3IwjZ', + 'ext': 'mp4', + 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', + 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', + 'timestamp': 1516958544, + 'upload_date': '20180126', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 480, + 'uploader': 'Brazzers', + 'age_limit': 18, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/v/47iUho33toY', + 'md5': 'b056b5049d34b648c1e86497cf4febce', + 'info_dict': { + 'id': '700621', + 'display_id': '47iUho33toY', + 'ext': 'mp4', + 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', + 'description': None, + 'timestamp': 1507052209, + 'upload_date': '20171003', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 332, + 'uploader': 'Cacerenele', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 
'tags': list, + } + }, { + 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', + 'md5': 'efa09be9f031314b7b7e3bc6510cd0df', + 'info_dict': { + 'id': '1523518', + 'display_id': 'z3_7iwWCmqt', + 'ext': 'mp4', + 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', + 'description': None, + 'timestamp': 1607470323, + 'upload_date': '20201208', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 593, + 'uploader': 'yorours', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }] + _DOMAIN = 'www.playvids.com' + + def _get_detail(self, html): + return get_element_by_class('detail-block', html) From 604762a9f8fa21de3f7349bd612c4f34941a5d20 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Fri, 11 Nov 2022 00:49:13 +0000 Subject: [PATCH 1413/1705] [common:jwplayer] Improve jwplayer extraction and parsing (#31000) * don't crash parser if jwplayer_data is invalid (empty, or no formats) * use `label` in `sources[n]` as `format_id` * relax `jwplayer().setup(...)` RE (also rework PR #27274 enhancement) * detect more manifest formats in _parse_jwplayer_formats() (from PR #29596) * improve metadata extraction (from PR #25433) * remember URLs in a set * use parse_resolution() in format * extract filesize in format (from yt-dlp) Co-authored-by: kikuyan <kikuyan@users.noreply.github.com> Co-authored-by: martin54 <martin54@users.noreply.github.com> --- youtube_dl/extractor/common.py | 40 ++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1f33a1e06..a0a796d7b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,6 +70,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -2713,7 +2714,7 @@ class 
InfoExtractor(object): def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -2734,9 +2735,14 @@ class InfoExtractor(object): def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True) + if flat_pl is None: + # not even a dict + return [] + # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: + if flat_pl is True: jwplayer_data = {'playlist': [jwplayer_data]} entries = [] @@ -2784,6 +2790,13 @@ class InfoExtractor(object): 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... 
+ 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -2792,7 +2805,9 @@ class InfoExtractor(object): 'url': formats[0]['url'], }) else: - self._sort_formats(formats) + # avoid exception in case of only sttls + if formats: + self._sort_formats(formats) entry['formats'] = formats entries.append(entry) if len(entries) == 1: @@ -2802,7 +2817,7 @@ class InfoExtractor(object): def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -2811,14 +2826,14 @@ class InfoExtractor(object): base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -2833,20 +2848,23 @@ class InfoExtractor(object): 'ext': ext, }) else: 
+ format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } + if format_id: + a_format['format_id'] = format_id + if source_url.startswith('rtmp'): a_format['ext'] = 'flv' # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as From c2f9be3e63a000cf20e9e4ad789a4f5453d00eb7 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Sat, 12 Nov 2022 11:55:05 +0000 Subject: [PATCH 1414/1705] [generic] Add KVS player extraction --- youtube_dl/extractor/generic.py | 183 ++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a9c064105..01e406750 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -28,6 +28,7 @@ from ..utils import ( mimetype2ext, orderedSet, parse_duration, + parse_resolution, sanitized_Request, smuggle_url, unescapeHTML, @@ -2227,6 +2228,97 @@ class GenericIE(InfoExtractor): # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', 'only_matching': True, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + }, { + # KVS 
Player + 'url': 'https://www.kvs-demo.com/embed/105/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July / Embed Player', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://thisvid.com/videos/fruit-is-healthy/', + 'md5': 'f83e52f409b9139a7efee58ef926a72e', + 'info_dict': { + 'id': '7079579', + 'display_id': 'fruit-is-healthy', + 'ext': 'mp4', + 'title': 'Fruit is healthy - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg', + } + }, { + # KVS Player + 'url': 'https://thisvid.com/embed/7079579/', + 'info_dict': { + 'id': '7079579', + 'display_id': 'fruit-is-healthy', + 'ext': 'mp4', + 'title': 'Fruit is healthy - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://youix.com/video/leningrad-zoj/', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, + }, { + # KVS Player + 'url': 'https://youix.com/embed/18485', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Ленинград - ЗОЖ', + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, + }, { + # KVS Player + 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', + 'md5': '94166bdb26b4cb1fb9214319a629fc51', + 'info_dict': { + 'id': '21217', + 'display_id': 
'40-nochey-2016', + 'ext': 'mp4', + 'title': '40 ночей (2016) - BogMedia.org', + 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', + }, + }, { + # KVS Player (for sites that serve kt_player.js via non-https urls) + 'url': 'http://www.camhub.world/embed/389508', + 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32', + 'info_dict': { + 'id': '389508', + 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', + 'ext': 'mp4', + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, }, ] @@ -2332,6 +2424,87 @@ class GenericIE(InfoExtractor): 'title': title, } + def _extract_kvs(self, url, webpage, video_id): + + def getlicensetoken(license): + modlicense = license.replace('$', '').replace('0', '1') + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = compat_str(4 * abs(fronthalf - backhalf)) + + def parts(): + for o in range(0, center + 1): + for i in range(1, 5): + yield compat_str((int(license[o + i]) + int(modlicense[o])) % 10) + + return ''.join(parts()) + + def getrealurl(video_url, license_code): + if not video_url.startswith('function/0/'): + return video_url # not obfuscated + + url_path, _, url_query = video_url.partition('?') + urlparts = url_path.split('/')[2:] + license = getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + def spells(x, o): + l = (o + sum(int(n) for n in license[o:])) % 32 + for i in range(0, len(x)): + yield {l: x[o], o: x[l]}.get(i, x[i]) + + for o in range(len(newmagic) - 1, -1, -1): + newmagic = ''.join(spells(newmagic, o)) + + urlparts[5] = newmagic + urlparts[5][32:] + return '/'.join(urlparts) + '?' 
+ url_query + + flashvars = self._search_regex( + r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>', + webpage, 'flashvars') + flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' + r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', + webpage, 'display_id', fatal=False + ) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(key + '_text', key) + formats.append(merge_dicts( + parse_resolution(format_id) or parse_resolution(flashvars[key]), { + 'url': getrealurl(flashvars[key], flashvars['license_code']), + 'format_id': format_id, + 'ext': 'mp4', + })) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 + + self._sort_formats(formats) + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + def _real_extract(self, url): if url.startswith('//'): return self.url_result(self.http_scheme() + url) @@ -3389,6 +3562,16 @@ class GenericIE(InfoExtractor): info_dict['formats'] = formats return info_dict + # Look for generic KVS player (before ld+json for tests) + found = re.search( + r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)\1[^>]*>', + webpage) + if found: + self.report_extraction('KVS Player') + if found.group('maj_ver') not in ('4', '5', '6'): + 
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found.group('ver'), )) + return self._extract_kvs(url, webpage, video_id) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') From 1a4fbe8462f5e531a891aeac7db6c0bde49c5536 Mon Sep 17 00:00:00 2001 From: FraFraFra-LongD <85188920+FraFraFra-LongD@users.noreply.github.com> Date: Sun, 13 Nov 2022 14:22:04 +0100 Subject: [PATCH 1415/1705] Added ThisVid.com support (#29187) * add ThisVidIE, ThisVidMemberIE, ThisVidPlaylistIE * redirect embed to main page for more metadata * use KVS extraction newly added to GenericIE and remove duplicate tests * also add MrDeepFake etc compat to GenericIE (closes #22390) Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/generic.py | 54 ++++--- youtube_dl/extractor/thisvid.py | 218 +++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 28 deletions(-) create mode 100644 youtube_dl/extractor/thisvid.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4d9f37424..947cbe8fd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1265,6 +1265,11 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ( + ThisVidIE, + ThisVidMemberIE, + ThisVidPlaylistIE, +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 01e406750..597611157 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2252,31 +2252,7 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, { - # KVS Player - 'url': 'https://thisvid.com/videos/fruit-is-healthy/', - 'md5': 
'f83e52f409b9139a7efee58ef926a72e', - 'info_dict': { - 'id': '7079579', - 'display_id': 'fruit-is-healthy', - 'ext': 'mp4', - 'title': 'Fruit is healthy - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg', - } - }, { - # KVS Player - 'url': 'https://thisvid.com/embed/7079579/', - 'info_dict': { - 'id': '7079579', - 'display_id': 'fruit-is-healthy', - 'ext': 'mp4', - 'title': 'Fruit is healthy - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg', - }, - 'params': { - 'skip_download': True, - }, - }, { - # KVS Player + # KVS Player (tested also in thisvid.py) 'url': 'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { @@ -2306,6 +2282,7 @@ class GenericIE(InfoExtractor): 'display_id': '40-nochey-2016', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', }, }, { @@ -2319,6 +2296,18 @@ class GenericIE(InfoExtractor): 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', }, + }, { + 'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes', + 'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4', + 'info_dict': { + 'id': '5', + 'display_id': 'selena-gomez-pov-deep-fakes', + 'ext': 'mp4', + 'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes', + 'description': 'md5:17d1f84b578c9c26875ac5ef9a932354', + 'height': 720, + 'age_limit': 18, + }, }, ] @@ -2491,6 +2480,7 @@ class GenericIE(InfoExtractor): 'url': getrealurl(flashvars[key], flashvars['license_code']), 'format_id': format_id, 'ext': 'mp4', + 'http_headers': {'Referer': url}, })) if not formats[-1].get('height'): 
formats[-1]['quality'] = 1 @@ -2713,9 +2703,15 @@ class GenericIE(InfoExtractor): # but actually don't. AGE_LIMIT_MARKERS = [ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + r'>[^<]*you acknowledge you are at least (\d+) years old', ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 + for marker in AGE_LIMIT_MARKERS: + m = re.search(marker, webpage) + if not m: + continue + age_limit = max( + age_limit or 0, + int_or_none(m.groups() and m.group(1), default=18)) # video uploader is domain name video_uploader = self._search_regex( @@ -3570,7 +3566,9 @@ class GenericIE(InfoExtractor): self.report_extraction('KVS Player') if found.group('maj_ver') not in ('4', '5', '6'): self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found.group('ver'), )) - return self._extract_kvs(url, webpage, video_id) + return merge_dicts( + self._extract_kvs(url, webpage, video_id), + info_dict) # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( diff --git a/youtube_dl/extractor/thisvid.py b/youtube_dl/extractor/thisvid.py new file mode 100644 index 000000000..bc4bcb2d1 --- /dev/null +++ b/youtube_dl/extractor/thisvid.py @@ -0,0 +1,218 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, + urljoin, +) + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'description': 'md5:372353bb995883d1b65fddf507489acd', + 'thumbnail': 
r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/3533241/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type') + webpage = self._download_webpage(url, main_id) + + title = self._html_search_regex( + r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?', + webpage, 'title') + + if type_ == 'embed': + # look for more metadata + video_alt_url = url_or_none(self._search_regex( + r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ), + webpage, 'video_alt_url', default=None)) + if video_alt_url and video_alt_url != url: + webpage = self._download_webpage( + video_alt_url, main_id, + note='Redirecting embed to main page', fatal=False) or webpage + + video_holder = get_element_by_class('video-holder', webpage) or '' + if '>This video is a private video' in video_holder: + self.raise_login_required( + (clean_html(video_holder) or 'Private video').split('\n', 1)[0]) + + uploader = self._html_search_regex( + r'''(?s)]*>Added by:\s*]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*''', + webpage, 'uploader', default='') + uploader = re.split(r'''/["'][^>]*>\s*''', uploader) + if len(uploader) == 2: + # id must be non-empty, uploader could be '' + uploader_id, uploader = uploader + uploader = uploader or None + else: + uploader_id = uploader = None + + return merge_dicts({ + '_type': 'url_transparent', + 'title': title, + 'age_limit': 18, + 'uploader': uploader, + 'uploader_id': uploader_id, + }, 
self.url_result(url, ie='Generic')) + + +class ThisVidMemberIE(InfoExtractor): + _VALID_URL = r'https?://thisvid\.com/members/(?P\d+)' + _TESTS = [{ + 'url': 'https://thisvid.com/members/2140501/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Profile', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://thisvid.com/members/2140501/favourite_videos/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Favourite Videos', + }, + 'playlist_mincount': 15, + }, { + 'url': 'https://thisvid.com/members/636468/public_videos/', + 'info_dict': { + 'id': '636468', + 'title': 'Happymouth\'s Public Videos', + }, + 'playlist_mincount': 196, + }, + ] + + def _urls(self, html): + for m in re.finditer(r''']+\bhref\s*=\s*["'](?P%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html): + yield m.group('url') + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + title = re.split( + r'(?i)\s*\|\s*ThisVid\.com\s*$', + self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)]*>(.+?)]+\bhref\s*=\s*("|')(?P(?!#)(?:(?!\1).)+)''', + next_page, 'next page link', group='url', default=None)) + # in case a member page should have pagination-next with empty link, not just `else:` + if next_page is None: + # playlist page + parsed_url = compat_urlparse.urlparse(page_url) + base_path, num = parsed_url.path.rsplit('/', 1) + num = int_or_none(num) + if num is None: + base_path, num = parsed_url.path.rstrip('/'), 1 + parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, ))) + next_page = compat_urlparse.urlunparse(parsed_url) + if page_url == next_page: + next_page = None + if not next_page: + break + page_url, html = next_page, None + + return self.playlist_from_matches( + entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid') + + +class ThisVidPlaylistIE(ThisVidMemberIE): + _VALID_URL = r'https?://thisvid\.com/playlist/(?P\d+)/video/(?P[A-Za-z0-9-]+)' + 
_TESTS = [{ + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '6615', + 'title': 'Underwear Stuff', + }, + 'playlist_mincount': 200, + }, { + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '1072387', + 'ext': 'mp4', + 'title': 'Big Italian Booty 28', + 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2', + 'uploader_id': '367912', + 'uploader': 'Jcmusclefun', + 'age_limit': 18, + }, + 'params': { + 'noplaylist': True, + }, + }] + + def _get_video_url(self, pl_url): + video_id = re.match(self._VALID_URL, pl_url).group('video_id') + return urljoin(pl_url, '/videos/%s/' % (video_id, )) + + def _urls(self, html): + for m in re.finditer(r''']+\bhref\s*=\s*["'](?P%s\b)[^>]+>''' % (self._VALID_URL, ), html): + yield self._get_video_url(m.group('url')) + + def _real_extract(self, url): + pl_id = self._match_id(url) + + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just the featured video because of --no-playlist') + return self.url_result(self._get_video_url(url), 'ThisVid') + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, )) + result = super(ThisVidPlaylistIE, self)._real_extract(url) + + # rework title returned as `the title - the title` + title = result['title'] + t_len = len(title) + if t_len > 5 and t_len % 2 != 0: + t_len = t_len // 2 + if title[t_len] == '-': + title = [t.strip() for t in (title[:t_len], title[t_len + 1:])] + if title[0] and title[0] == title[1]: + result['title'] = title[0] + return result From fc2beab0e701c497a003f11fef5c0df54fba1da3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 13 Nov 2022 14:59:30 +0000 Subject: [PATCH 1416/1705] [generic] Improve KVS (etc) extraction * detect kt_player('kt_player', 'https://.../kt_player.swf?v=5... * detect age limit if 18 USC 2257 is mentioned * test with shooshtime.com Partially resolves #31332. 
--- youtube_dl/extractor/generic.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 597611157..3e8281ed3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -36,6 +36,7 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_or_none, + urljoin, xpath_attr, xpath_text, xpath_with_ns, @@ -2308,6 +2309,17 @@ class GenericIE(InfoExtractor): 'height': 720, 'age_limit': 18, }, + }, { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'height': 720, + 'age_limit': 18, + }, }, ] @@ -2477,7 +2489,7 @@ class GenericIE(InfoExtractor): format_id = flashvars.get(key + '_text', key) formats.append(merge_dicts( parse_resolution(format_id) or parse_resolution(flashvars[key]), { - 'url': getrealurl(flashvars[key], flashvars['license_code']), + 'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])), 'format_id': format_id, 'ext': 'mp4', 'http_headers': {'Referer': url}, @@ -2704,6 +2716,7 @@ class GenericIE(InfoExtractor): AGE_LIMIT_MARKERS = [ r'Proudly Labeled RTA', r'>[^<]*you acknowledge you are at least (\d+) years old', + r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b', ] for marker in AGE_LIMIT_MARKERS: m = re.search(marker, webpage) @@ -3559,13 +3572,15 @@ class GenericIE(InfoExtractor): return info_dict # Look for generic KVS player (before ld+json for tests) - found = re.search( - r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P(?P\d+)(\.\d+)+)\1[^>]*>', - webpage) + found = self._search_regex( + (r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', + # kt_player('kt_player', 
'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ... + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) if found: - self.report_extraction('KVS Player') - if found.group('maj_ver') not in ('4', '5', '6'): - self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found.group('ver'), )) + self.report_extraction('%s: KVS Player' % (video_id, )) + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, )) return merge_dicts( self._extract_kvs(url, webpage, video_id), info_dict) From 195f22f679330549882a8234e7234942893a4902 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 13 Nov 2022 15:09:29 +0000 Subject: [PATCH 1417/1705] [generic] Improve KVS (etc) extraction --- youtube_dl/extractor/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3e8281ed3..0e473e952 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3575,8 +3575,8 @@ class GenericIE(InfoExtractor): found = self._search_regex( (r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', # kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ... 
- r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', - ), webpage, 'KVS player', group='ver', default=False) + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) if found: self.report_extraction('%s: KVS Player' % (video_id, )) if found.split('.')[0] not in ('4', '5', '6'): From 14ef89a8dab4f6ba6185d6f5bf0317a705d7b842 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 1 Feb 2023 09:39:49 +0530 Subject: [PATCH 1418/1705] Support `if` statements Fix for yt-dlp/yt_dlp#6131 Closes #31509 --- test/test_jsinterp.py | 32 ++++++++++++++++++++++++++++++++ test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 21 ++++++++++++++++++--- 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 5121c8cf8..c47def737 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -158,6 +158,38 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('z'), 5) self.assertEqual(jsi.call_function('y'), 2) + def test_if(self): + jsi = JSInterpreter(''' + function x() { + let a = 9; + if (0==0) {a++} + return a + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0==0) {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + def test_for_loop(self): # function x() { a=0; for (i=0; i-10; i++) {a++} a } jsi = JSInterpreter(''' diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py 
index 4e678cae0..ac37ffa45 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -135,6 +135,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', ), + ( + 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', + 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 530a705b4..9a3b8d7f2 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -214,7 +214,7 @@ class JSInterpreter(object): def __init__(self, msg, *args, **kwargs): expr = kwargs.pop('expr', None) if expr is not None: - msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) + msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) @classmethod @@ -268,7 +268,7 @@ class JSInterpreter(object): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and (char in cls.OP_CHARS or char == '[' or (char.isspace() and after_op)) + after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -301,7 +301,7 @@ class JSInterpreter(object): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise cls.Exception('No terminating paren {delim} in {expr:.100}'.format(**locals())) + raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @staticmethod @@ -428,10 +428,25 @@ class JSInterpreter(object): m = re.match(r'''(?x) (?Ptry)\s*\{| + (?Pif)\s*\(| (?Pswitch)\s*\(| (?Pfor)\s*\( ''', expr) md = m.groupdict() if m else {} + if md.get('if'): + cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) + if_expr, expr = 
self._separate_at_paren(expr.lstrip()) + # TODO: "else if" is not handled + else_expr = None + m = re.match(r'else\s*{', expr) + if m: + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) + ret, should_abort = self.interpret_statement( + if_expr if cndn else else_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + if md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None From 295736c9cba714fb5de7d1c3dd31d86e50091cf8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 2 Feb 2023 14:28:32 +0000 Subject: [PATCH 1419/1705] [jsinterp] Improve parsing * support subset `... else if ...` * support `while` * add `RegExp` class * generalise `new` support * limited more debug strings * matching test changes --- test/test_jsinterp.py | 53 +++++++++++++- youtube_dl/jsinterp.py | 156 +++++++++++++++++++++++++++-------------- 2 files changed, 154 insertions(+), 55 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c47def737..b5962356c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -11,8 +11,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math import re -from youtube_dl.compat import compat_re_Pattern - from youtube_dl.jsinterp import JS_Undefined, JSInterpreter @@ -140,15 +138,23 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertTrue(math.isnan(jsi.call_function('x'))) + def test_Date(self): jsi = JSInterpreter(''' function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } ''') self.assertEqual(jsi.call_function('x'), 86000) + jsi = JSInterpreter(''' function x(dt) { return new Date(dt) - 0; } ''') self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + # date format m/d/y + jsi = JSInterpreter(''' + function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } + ''') + 
self.assertEqual(jsi.call_function('x'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } @@ -181,6 +187,15 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x'), 10) """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + + def test_elseif(self): jsi = JSInterpreter(''' function x() { if (0!=0) {return 1} @@ -188,6 +203,16 @@ class TestJSInterpreter(unittest.TestCase): else {return 10} }''') self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + # etc """ def test_for_loop(self): @@ -197,6 +222,13 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 10) + def test_while_loop(self): + # function x() { a=0; while (a<10) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; while (a<10) {a++} return a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + def test_switch(self): jsi = JSInterpreter(''' function x(f) { switch(f){ @@ -415,13 +447,28 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; return a; } ''') - self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) + attrs = set(('findall', 'finditer', 'flags', 'groupindex', + 'groups', 'match', 'pattern', 'scanner', + 'search', 'split', 'sub', 'subn')) + self.assertTrue(set(dir(jsi.call_function('x'))) > attrs) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + jsi = JSInterpreter(r''' + function x() { let a=[/[)\\]/]; return a[0]; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + + """ # fails + jsi = JSInterpreter(r''' + function x() 
{ let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; } + ''') + self.assertEqual(jsi.call_function('x'), 5) + """ + def test_char_code_at(self): jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('x', 0), 116) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9a3b8d7f2..1e7b342ac 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -187,19 +187,6 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 - _RE_FLAGS = { - # special knowledge: Python's re flags are bitmask values, current max 128 - # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags - 'd': 1024, # Generate indices for substring matches - 'g': 2048, # Global search - 'i': re.I, # Case-insensitive search - 'm': re.M, # Multi-line search - 's': re.S, # Allows . to match newline characters - 'u': re.U, # Treat a pattern as a sequence of unicode code points - 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string - } - _OBJ_NAME = '__youtube_dl_jsinterp_obj' OP_CHARS = None @@ -217,9 +204,48 @@ class JSInterpreter(object): msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) + class JS_RegExp(object): + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . 
to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + def __init__(self, pattern_txt, flags=''): + if isinstance(flags, compat_str): + flags, _ = self.regex_flags(flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern + # First, avoid https://github.com/python/cpython/issues/74534 + self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + for name in dir(self.__self): + # Only these? Obviously __class__, __init__. + # PyPy creates a __weakref__ attribute with value None + # that can't be setattr'd but also can't need to be copied. + if name in ('__class__', '__init__', '__weakref__'): + continue + setattr(self, name, getattr(self.__self, name)) + + @classmethod + def regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + @classmethod def __op_chars(cls): - op_chars = set(';,') + op_chars = set(';,[') for op in cls._all_operators(): for c in op[0]: op_chars.add(c) @@ -231,17 +257,6 @@ class JSInterpreter(object): namespace[name] = obj return name - @classmethod - def _regex_flags(cls, expr): - flags = 0 - if not expr: - return flags, expr - for idx, ch in enumerate(expr): - if ch not in cls._RE_FLAGS: - break - flags |= cls._RE_FLAGS[ch] - return flags, expr[idx + 1:] - @classmethod def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): if not expr: @@ -328,7 +343,7 @@ class JSInterpreter(object): try: return opfunc(left_val, right_val) except Exception as e: - raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e) + raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) def 
_index(self, obj, idx, allow_undefined=False): if idx == 'length': @@ -338,7 +353,7 @@ class JSInterpreter(object): except Exception as e: if allow_undefined: return JS_Undefined - raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) + raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): try: @@ -352,6 +367,7 @@ class JSInterpreter(object): allow_recursion -= 1 should_return = False + # fails on (eg) if (...) stmt1; else stmt2; sub_statements = list(self._separate(stmt, ';')) or [''] expr = stmt = sub_statements.pop().strip() for sub_stmt in sub_statements: @@ -371,25 +387,30 @@ class JSInterpreter(object): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - flags, outer = self._regex_flags(outer) - inner = re.compile(inner[1:], flags=flags) # , strict=True)) + flags, outer = self.JS_RegExp.regex_flags(outer) + inner = self.JS_RegExp(inner[1:], flags=flags) else: inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: return inner, should_return expr = self._named_object(local_vars, inner) + outer - if expr.startswith('new '): - obj = expr[4:] - if obj.startswith('Date('): - left, right = self._separate_at_paren(obj[4:]) - expr = unified_timestamp( - self.interpret_expression(left, local_vars, allow_recursion), False) + new_kw, _, obj = expr.partition('new ') + if not new_kw: + for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), + ('RegExp', self.JS_RegExp), + ('Error', self.Exception)): + if not obj.startswith(klass + '('): + continue + left, right = self._separate_at_paren(obj[len(klass):]) + argvals = self.interpret_iter(left, local_vars, allow_recursion) + expr = konstr(*argvals) if not expr: - raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) - expr = self._dump(int(expr * 1000), local_vars) + right + raise 
self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) + expr = self._dump(expr, local_vars) + right + break else: - raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr) + raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) if expr.startswith('void '): left = self.interpret_expression(expr[5:], local_vars, allow_recursion) @@ -430,24 +451,45 @@ class JSInterpreter(object): (?Ptry)\s*\{| (?Pif)\s*\(| (?Pswitch)\s*\(| - (?Pfor)\s*\( + (?Pfor)\s*\(| + (?Pwhile)\s*\( ''', expr) md = m.groupdict() if m else {} if md.get('if'): cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) - if_expr, expr = self._separate_at_paren(expr.lstrip()) - # TODO: "else if" is not handled + if expr.startswith('{'): + if_expr, expr = self._separate_at_paren(expr) + else: + # may lose ... else ... because of ll.368-374 + if_expr, expr = self._separate_at_paren(expr, delim=';') else_expr = None - m = re.match(r'else\s*{', expr) + m = re.match(r'else\s*(?P\{)?', expr) if m: - else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if m.group('block'): + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + else: + # handle subset ... else if (...) {...} else ... 
+ # TODO: make interpret_statement do this properly, if possible + exprs = list(self._separate(expr[m.end():], delim='}', max_split=2)) + if len(exprs) > 1: + if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]): + else_expr = exprs[0] + '}' + exprs[1] + expr = (exprs[2] + '}') if len(exprs) == 3 else None + else: + else_expr = exprs[0] + exprs.append('') + expr = '}'.join(exprs[1:]) + else: + else_expr = exprs[0] + expr = None + else_expr = else_expr.lstrip() + '}' cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) ret, should_abort = self.interpret_statement( if_expr if cndn else else_expr, local_vars, allow_recursion) if should_abort: return ret, True - if md.get('try'): + elif md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None try: @@ -484,8 +526,8 @@ class JSInterpreter(object): if err: raise err - elif md.get('for'): - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) + elif md.get('for') or md.get('while'): + init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: @@ -496,11 +538,12 @@ class JSInterpreter(object): body = 'switch(%s){%s}' % (switch_val, body) else: body, expr = remaining, '' - start, cndn, increment = self._separate(constructor, ';') - self.interpret_expression(start, local_vars, allow_recursion) - while True: - if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): - break + if md.get('for'): + start, cndn, increment = self._separate(init_or_cond, ';') + self.interpret_expression(start, local_vars, allow_recursion) + else: + cndn, increment = init_or_cond, None + while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: @@ -509,7 +552,8 @@ class JSInterpreter(object): break except 
JS_Continue: pass - self.interpret_expression(increment, local_vars, allow_recursion) + if increment: + self.interpret_expression(increment, local_vars, allow_recursion) elif md.get('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) @@ -764,6 +808,10 @@ class JSInterpreter(object): if idx >= len(obj): return None return ord(obj[idx]) + elif member == 'replace': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + assertion(len(argvals) == 2, 'takes exactly two arguments') + return re.sub(argvals[0], argvals[1], obj) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) @@ -795,6 +843,10 @@ class JSInterpreter(object): raise self.Exception('Cannot return from an expression', expr) return ret + def interpret_iter(self, list_txt, local_vars, allow_recursion): + for v in self._separate(list_txt): + yield self.interpret_expression(v, local_vars, allow_recursion) + def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} From 37cbdfa0e7c9d00d450af32dc9cdaf93cbfc4576 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Thu, 2 Feb 2023 11:58:21 -0500 Subject: [PATCH 1420/1705] [americastestkitchen] Add support for downloading entire series (#31493) Also * support new sites and URL patterns * back-port from yt-dlp Co-authored-by: dirkf --- youtube_dl/extractor/americastestkitchen.py | 115 +++++++++++++++----- 1 file changed, 88 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index be960c0f9..08d3604e9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -15,7 +15,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' + _VALID_URL = 
r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', 'series': "America's Test Kitchen", + 'season': 'Season 18', 'season_number': 18, 'episode': 'Japanese Suppers', 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', 'series': "America's Test Kitchen", + 'season': 'Season 21', 'season_number': 21, 'episode': 'Simple Chicken Dinner', 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 
'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', + 'only_matching': True, }, { 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', 'only_matching': True, @@ -94,7 +110,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 
'only_matching': True, }] def _real_extract(self, url): - show_name, season_number = re.match(self._VALID_URL, url).groups() - season_number = int(season_number) + match = re.match(self._VALID_URL, url).groupdict() + show = match.get('show2') + show_path = ('/' + show) if show else '' + show = show or match['show'] + season_number = int_or_none(match.get('season')) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + playlist_id, headers={ + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # 
always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), - 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url), + 'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]), 'title': episode.get('title'), 'description': episode.get('description'), 'timestamp': unified_timestamp(episode.get('search_document_date')), @@ -156,4 +217,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) From 297fbff23b347612a5f6002b40adba9dfad85413 Mon Sep 17 00:00:00 2001 From: Rodrigo Dias Date: Thu, 2 Feb 2023 17:10:09 +0000 Subject: [PATCH 1421/1705] [doc] Fixed typo appearing to promise an example (#31489) Resolves #31425 Co-authored-by: dirkf --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd888c731..6e07ddb1c 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ To use percent literals in an output template use `%%`. To output to stdout use The current default template is `%(title)s-%(id)s.%(ext)s`. -In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: +In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title. 
#### Output template and Windows batch files From 807e593a32a1ace8fa0be8129fc5071d86516c99 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:12:36 -0400 Subject: [PATCH 1422/1705] [cammodels] fix and improve extractor (#31453) Co-authored-by: dirkf --- youtube_dl/extractor/cammodels.py | 34 +++++++++---------------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1eb81b75e..d2e860b24 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, url_or_none, ) @@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -85,6 +63,13 @@ class CamModelsIE(InfoExtractor): 'preference': -1, }) else: + if format_id == 
'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) self._sort_formats(formats) @@ -92,6 +77,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, 'title': self._live_title(user_id), + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 From e9611a2a3603ee201d0c1ba99e8bfd8ec1e697cd Mon Sep 17 00:00:00 2001 From: Leon Etienne <40911701+Leonetienne@users.noreply.github.com> Date: Thu, 2 Feb 2023 18:13:39 +0100 Subject: [PATCH 1423/1705] [pr0gramm] implement InfoExtractor, Resolves #31433 (#31434) * [pr0gramm] implement infoextractor * [pr0gramm] remove misplaced comment, uncapture regex-group * [pr0gramm]: specify utf-8 coding * [pr0gramm]: add trailing comma to lists for maintainability * [pr0gramm]: ie only sets upload_date attribute * [pr0gramm]: add video_id to title * [pr0gramm]: more forgiving _valid_url regex * [pr0gramm]: add uploader to title, if set * Discriminate URL pattern --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/pr0gramm.py | 105 +++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 youtube_dl/extractor/pr0gramm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 947cbe8fd..cf0388ed2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1667,3 +1667,7 @@ from .zingmp3 import ( ) from .zoom import ZoomIE from .zype import ZypeIE +from .pr0gramm import ( + Pr0grammIE, + Pr0grammStaticIE, +) diff --git a/youtube_dl/extractor/pr0gramm.py b/youtube_dl/extractor/pr0gramm.py new file mode 100644 index 000000000..b68224fd5 --- /dev/null +++ b/youtube_dl/extractor/pr0gramm.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re +from ..utils import ( + merge_dicts, +) + + +class 
Pr0grammStaticIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/static/5466437 + _VALID_URL = r'https?://pr0gramm\.com/static/(?P[0-9]+)' + _TEST = { + 'url': 'https://pr0gramm.com/static/5466437', + 'md5': '52fa540d70d3edc286846f8ca85938aa', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Fetch media sources + entries = self._parse_html5_media_entries(url, webpage, video_id) + media_info = entries[0] + + # this raises if there are no formats + self._sort_formats(media_info.get('formats') or []) + + # Fetch author + uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') + + # Fetch approx upload timestamp from filename + # Have None-defaults in case the extraction fails + uploadDay = None + uploadMon = None + uploadYear = None + uploadTimestr = None + # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) + m = re.search(r'//img\.pr0gramm\.com/(?P[\d]+)/(?P[\d]+)/(?P[\d]+)/\w+\.\w{,4}', webpage) + + if (m): + # Up to a day of accuracy should suffice... + uploadDay = m.groupdict().get('day') + uploadMon = m.groupdict().get('mon') + uploadYear = m.groupdict().get('year') + uploadTimestr = uploadYear + uploadMon + uploadDay + + return merge_dicts({ + 'id': video_id, + 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), + 'uploader': uploader, + 'upload_date': uploadTimestr + }, media_info) + + +# This extractor is for the primary url (used for sharing, and appears in the +# location bar) Since this page loads the DOM via JS, yt-dl can't find any +# video information here. So let's redirect to a compatibility version of +# the site, which does contain the

    ', webpage, 'title') + lecture_series_title = self._html_search_regex( + r'(?s)]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?', webpage, 'series') + + formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': lecture_title, + 'series': lecture_series_title, + 'formats': formats, + } + + +class RbgTumCourseIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P.+)' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, ] + + def _real_extract(self, url): + course_id = self._match_id(url) + webpage = self._download_webpage(url, course_id) + + lecture_series_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') + + lecture_urls = [] + for lecture_url in re.findall(r'(?i)href="/w/(.+)(? 
Date: Fri, 10 Feb 2023 04:19:27 +0800 Subject: [PATCH 1444/1705] [feat]: Add support to external downloader aria2p (#31500) * feat: add class Aria2pFD * feat: create call_downloader function * feat: a colorful download interface to aria2pFD * feat: change value name * Apply suggestions from code review Co-authored-by: dirkf * Typo in suggestion * fix: remove unused value * fix: add not function to return value(0 is normal); add total_seconds to download.eta(timedelta object); add waiting status when hook progress * fix: remove unuse method ..utils.format_bytes * fix: be up to flake8 * fix: be up to flake8 * Apply suggestions from code review * [feat] test external downloader aria2p * [feat] test external downloader aria2p * [fix] test_external_downloader.py * Apply suggestions from code review Co-authored-by: dirkf * Apply suggestions from code review Co-authored-by: dirkf * Update test/test_external_downloader.py Co-authored-by: dirkf * Update test/test_external_downloader.py Co-authored-by: dirkf * Update youtube_dl/downloader/external.py Co-authored-by: dirkf * refactoring code and fix bugs * Apply suggestions from code review * Rename test_external_downloader.py to test_downloader_external.py --------- Co-authored-by: dirkf --- test/helper.py | 11 +++ test/test_downloader_external.py | 115 ++++++++++++++++++++++++++++++ test/test_downloader_http.py | 17 ++--- test/test_http.py | 16 ++--- youtube_dl/downloader/external.py | 58 +++++++++++++++ 5 files changed, 193 insertions(+), 24 deletions(-) create mode 100644 test/test_downloader_external.py diff --git a/test/helper.py b/test/helper.py index c6a2f0667..883b2e877 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,6 +89,17 @@ class FakeYDL(YoutubeDL): self.report_warning = types.MethodType(report_warning, self) +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + def gettestcases(include_onlymatching=False): for ie in 
youtube_dl.extractor.gen_extractors(): for tc in ie.get_testcases(include_onlymatching): diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py new file mode 100644 index 000000000..c0239502b --- /dev/null +++ b/test/test_downloader_external.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import subprocess +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import ( + FakeLogger, + http_server_port, + try_rm, +) +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.utils import encodeFilename +from youtube_dl.downloader.external import Aria2pFD +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.match(r'bytes=(\d+)-(\d+)', range_header) + if mobj: + start, end = (int(mobj.group(i)) for i in (1, 2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + 
self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False, 'unrecognised server path' + + +@unittest.skipUnless(Aria2pFD.available(), 'aria2p module not found') +class TestAria2pFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + with subprocess.Popen( + ['aria2c', '--enable-rpc'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) as process: + if not process.poll(): + filename = 'testfile.mp4' + params['logger'] = FakeLogger() + params['outtmpl'] = filename + ydl = YoutubeDL(params) + try_rm(encodeFilename(filename)) + self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + process.kill() + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({'external_downloader': 'aria2p'}) + + def test_chunked(self): + self.download_all({ + 'external_downloader': 'aria2p', + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 750472281..4e6d7a2a0 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,7 +9,11 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port, try_rm +from test.helper import ( + FakeLogger, + 
http_server_port, + try_rm, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD @@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHttpFD(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/test/test_http.py b/test/test_http.py index 3ee0a5dda..487a9bc77 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port +from test.helper import ( + FakeLogger, + http_server_port, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -52,17 +55,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index a06ab2e50..bffcd10b6 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -200,6 +200,64 @@ class Aria2cFD(ExternalFD): return cmd +class Aria2pFD(ExternalFD): + ''' Aria2pFD class + This class support to use aria2p as downloader. + (Aria2p, a command-line tool and Python library to interact with an aria2c daemon process + through JSON-RPC.) + It can help you to get download progress more easily. + To use aria2p as downloader, you need to install aria2c and aria2p, aria2p can download with pip. 
+ Then run aria2c in the background and enable with the --enable-rpc option. + ''' + try: + import aria2p + __avail = True + except ImportError: + __avail = False + + @classmethod + def available(cls): + return cls.__avail + + def _call_downloader(self, tmpfilename, info_dict): + aria2 = self.aria2p.API( + self.aria2p.Client( + host='http://localhost', + port=6800, + secret='' + ) + ) + + options = { + 'min-split-size': '1M', + 'max-connection-per-server': 4, + 'auto-file-renaming': 'false', + } + options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.') + options['out'] = os.path.basename(tmpfilename) + options['header'] = [] + for key, val in info_dict['http_headers'].items(): + options['header'].append('{0}: {1}'.format(key, val)) + download = aria2.add_uris([info_dict['url']], options) + status = { + 'status': 'downloading', + 'tmpfilename': tmpfilename, + } + started = time.time() + while download.status in ['active', 'waiting']: + download = aria2.get_download(download.gid) + status.update({ + 'downloaded_bytes': download.completed_length, + 'total_bytes': download.total_length, + 'elapsed': time.time() - started, + 'eta': download.eta.total_seconds(), + 'speed': download.download_speed, + }) + self._hook_progress(status) + time.sleep(.5) + return download.status != 'complete' + + class HttpieFD(ExternalFD): @classmethod def available(cls): From 822f19f05d0ab1a4a945a85f691f2079f7cb3bbb Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:37:45 +0100 Subject: [PATCH 1445/1705] [FileMoonIE] Add extractor for filemoon.sx (#31515) --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/filemoon.py | 43 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/filemoon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dfaef0cc3..f63a2e030 100644 --- a/youtube_dl/extractor/extractors.py +++ 
b/youtube_dl/extractor/extractors.py @@ -376,6 +376,7 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE +from .filemoon import FileMoonIE from .fifa import FifaIE from .filmon import ( FilmOnIE, diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py new file mode 100644 index 000000000..654df9b69 --- /dev/null +++ b/youtube_dl/extractor/filemoon.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + js_to_json, +) + + +class FileMoonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P\w+)' + _TEST = { + 'url': 'https://filemoon.sx/e/dw40rxrzruqz', + 'md5': '5a713742f57ac4aef29b74733e8dda01', + 'info_dict': { + 'id': 'dw40rxrzruqz', + 'title': 'dw40rxrzruqz', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + matches = re.findall(r'(?s)(eval.*?)', webpage) + packed = matches[-1] + unpacked = decode_packed_codes(packed) + jwplayer_sources = self._parse_json( + self._search_regex( + r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'), + video_id, transform_source=js_to_json) + + formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + + return { + 'id': video_id, + 'title': self._generic_title(url) or video_id, + 'formats': formats + } From de48105dd870e353af468bfb8d49b14d9894e649 Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:47:43 +0100 Subject: [PATCH 1446/1705] [KommunetvIE] Add extractor for kommunetv.no (#31516) * Add extractor for kommunetv.no * Using utils.update_url instead of regex --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kommunetv.py | 35 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/kommunetv.py diff 
--git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f63a2e030..d8428f46f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .khanacademy import ( from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .konserthusetplay import KonserthusetPlayIE from .krasview import KrasViewIE from .kth import KTHIE diff --git a/youtube_dl/extractor/kommunetv.py b/youtube_dl/extractor/kommunetv.py new file mode 100644 index 000000000..91d06a74f --- /dev/null +++ b/youtube_dl/extractor/kommunetv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } From 6f8c2635a573c84ef66c02f73b4aeff1cc36ae4e Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:54:45 +0100 Subject: [PATCH 1447/1705] [StreamsbIE] Add extractor for streamsb.com (viewsb.com) (#31517) * Add extractor for streamsb.com (viewsb.com) * make data url using 
app.js version --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/streamsb.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/streamsb.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8428f46f..3a87f9e33 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1206,6 +1206,7 @@ from .storyfire import ( from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamsb import StreamsbIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stv import STVPlayerIE diff --git a/youtube_dl/extractor/streamsb.py b/youtube_dl/extractor/streamsb.py new file mode 100644 index 000000000..bffcb3de1 --- /dev/null +++ b/youtube_dl/extractor/streamsb.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import binascii +import random +import re +import string + +from .common import InfoExtractor +from ..utils import urljoin, url_basename + + +def to_ascii_hex(str1): + return binascii.hexlify(str1.encode('utf-8')).decode('ascii') + + +def generate_random_string(length): + return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length)) + + +class StreamsbIE(InfoExtractor): + _DOMAINS = ('viewsb.com', ) + _VALID_URL = r'https://(?P%s)/(?P.+)' % '|'.join(_DOMAINS) + _TEST = { + 'url': 'https://viewsb.com/dxfvlu4qanjx', + 'md5': '488d111a63415369bf90ea83adc8a325', + 'info_dict': { + 'id': 'dxfvlu4qanjx', + 'ext': 'mp4', + 'title': 'Sintel' + } + } + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id') + webpage = self._download_webpage(url, video_id) + + iframe_rel_url = self._search_regex(r'''(?i)]+\bsrc\s*=\s*('|")(?P/.*\.html)\1''', webpage, 'iframe', group='path') + iframe_url = 
urljoin('https://' + domain, iframe_rel_url) + + iframe_data = self._download_webpage(iframe_url, video_id) + app_version = self._search_regex(r''']+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50' + + video_code = url_basename(iframe_url).rsplit('.')[0] + + length = 12 + req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb')) + ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req)) + + video_data = self._download_webpage(ereq, video_id, headers={ + 'Referer': iframe_url, + 'watchsb': 'sbstream', + }) + player_data = self._parse_json(video_data, video_id) + title = player_data['stream_data']['title'] + formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title, + } From 42b098dd79e91295376ca98f394876555481a3eb Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 14 Feb 2023 02:47:09 +0000 Subject: [PATCH 1448/1705] [InfoExtractor] Handle unquoted values in OpenGraph searches --- test/test_InfoExtractor.py | 2 ++ youtube_dl/extractor/common.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index dd69a681b..4db5c93f1 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -62,6 +62,7 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -74,6 +75,7 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, 
fatal=True) + self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value') def test_html_search_meta(self): ie = self.ie diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a0a796d7b..7244e5df6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1087,7 +1087,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' From dd9aa74beefc179f943051c4e19eecad87ab1124 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 14 Feb 2023 16:33:01 +0000 Subject: [PATCH 1449/1705] [test] Avoid name TestIE which causes a pytest warning See: https://github.com/yt-dlp/yt-dlp/commit/060ac76257a8c1f7370a8a571821c1d73377701f --- test/test_InfoExtractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4db5c93f1..6d25441db 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,13 +35,13 @@ class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler) assert False -class TestIE(InfoExtractor): +class DummyIE(InfoExtractor): pass class TestInfoExtractor(unittest.TestCase): def setUp(self): - self.ie = TestIE(FakeYDL()) + self.ie = DummyIE(FakeYDL()) def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) From 2dd6c6edd8e0fc5e45865b8e6d865e35147de772 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 17 Feb 2023 11:16:54 +0000 Subject: [PATCH 1450/1705] [YouTube] Avoid crash if uploader_id extraction fails See #31530. 
--- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ba0f5c8b6..66b0257df 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2122,7 +2122,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): microformat.get('uploadDate') or search_meta('uploadDate')), 'uploader': video_details['author'], - 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_id': self._search_regex( + r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, + 'uploader id', fatal=False) if owner_profile_url else None, 'uploader_url': owner_profile_url, 'channel_id': channel_id, 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, From 57802e632f5a741df6fd9b30a455c32632944489 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 19 Feb 2023 13:47:49 +0000 Subject: [PATCH 1451/1705] [jsinterp] Fix dict comprehension for Py2.6 Resolves #31600 --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 60fa2b1b9..a3bc42a61 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -262,7 +262,7 @@ class JSInterpreter(object): if not expr: return # collections.Counter() is ~10% slower in both 2.7 and 3.9 - counters = {k: 0 for k in _MATCHING_PARENS.values()} + counters = dict((k, 0) for k in _MATCHING_PARENS.values()) start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, skipping = None, False, 0 after_op, in_regex_char_group, skip_re = True, False, 0 From 6067451e432fb65d487a8a67bb5cff52efb9ccf4 Mon Sep 17 00:00:00 2001 From: df Date: Mon, 20 Feb 2023 01:41:46 +0000 Subject: [PATCH 1452/1705] [Vimeo] Fix e19ec52 for tween-age Pythons * a check in older Pythons in the 2.7 and earlier, 3.3, 3.4 series caused 
"sre_constants.error: nothing to repeat" * satisfy the check by avoiding nested qualifiers that can match empty string Resolves #31597 --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7f2731d83..8e1a805f6 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -286,7 +286,7 @@ class VimeoIE(VimeoBaseInfoExtractor): /(?!videos|likes)[^/?#]+/?| (?(q)|/(?P[\da-f]{10}))? ) - (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$ + (?:(?(q)[&]|(?(u)|/?)[?]).+?)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' _TESTS = [ From 1d3751c3fe50b203d3e2bff71d866c8c500f8288 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 1 Jun 2021 18:05:41 +0530 Subject: [PATCH 1453/1705] Escape URLs in `sanitized_Request`, not `sanitize_url` d2558234cf5dd12d6896eed5427b7dcdb3ab7b5a added escaping of URLs while sanitizing. However, `sanitize_url` may not always receive an actual URL. Eg: When using `youtube-dl "search query" --default-search ytsearch`, `search query` gets escaped to `search%20query` before being prefixed with `ytsearch:` which is not the intended behavior. So the escaping is moved to `sanitized_Request` instead. 
--- test/test_utils.py | 1 + youtube_dl/extractor/generic.py | 19 +++++++++++++++++++ youtube_dl/utils.py | 4 ++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9d364c863..ea2b96ed2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -250,6 +250,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar') self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') + self.assertEqual(sanitize_url('foo bar'), 'foo bar') def test_expand_path(self): def env(var): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0e473e952..b01900afa 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2320,6 +2320,25 @@ class GenericIE(InfoExtractor): 'height': 720, 'age_limit': 18, }, + }, { + # would like to use the yt-dl test video but searching for + # '"\'/\\ä↭𝕐' fails, so using an old vid from YouTube Korea + 'note': 'Test default search', + 'url': 'Shorts로 허락 필요없이 놀자! (BTS편)', + 'info_dict': { + 'id': 'usDGO4Zb-dc', + 'ext': 'mp4', + 'title': 'YouTube Shorts로 허락 필요없이 놀자! 
(BTS편)', + 'description': 'md5:96e31607eba81ab441567b5e289f4716', + 'upload_date': '20211107', + 'uploader': 'YouTube Korea', + 'location': '대한민국', + }, + 'params': { + 'default_search': 'ytsearch', + 'skip_download': True, + }, + 'expected_warnings': ['uploader id'], }, ] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4edbfa27b..761edcd49 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2176,11 +2176,11 @@ def sanitize_url(url): for mistake, fixup in COMMON_TYPOS: if re.match(mistake, url): return re.sub(mistake, fixup, url) - return escape_url(url) + return url def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) + return compat_urllib_request.Request(escape_url(sanitize_url(url)), *args, **kwargs) def expand_path(s): From e67e52a8f8fd7e76253e416da76570af8da200d0 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 24 Feb 2023 02:32:40 +0000 Subject: [PATCH 1454/1705] [test] Support test-case with volatile ID (eg live show) Signalled by regexp ID value, eg: `'id': r're:[\da-zA-Z_-]{8,}'` --- test/test_download.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_download.py b/test/test_download.py index 19936969f..d50008307 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -148,6 +148,7 @@ def generator(test_case, tname): try_rm(tc_filename) try_rm(tc_filename + '.part') try_rm(os.path.splitext(tc_filename)[0] + '.info.json') + try_rm_tcs_files() try: try_num = 1 @@ -213,7 +214,15 @@ def generator(test_case, tname): # First, check test cases' data against extracted data alone expect_info_dict(self, tc_res_dict, tc.get('info_dict', {})) # Now, check downloaded file consistency + # support test-case with volatile ID, signalled by regexp value + if tc.get('info_dict', {}).get('id', '').startswith('re:'): + test_id = tc['info_dict']['id'] + tc['info_dict']['id'] = tc_res_dict['id'] + else: + test_id = None tc_filename = get_tc_filename(tc) + if 
test_id: + tc['info_dict']['id'] = test_id if not test_case.get('params', {}).get('skip_download', False): self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) self.assertTrue(tc_filename in finished_hook_called) From f7ce98a21e15cb094c772e9082796d009c61578b Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 24 Feb 2023 02:48:37 +0000 Subject: [PATCH 1455/1705] [YouTube] Support @owner format in uploader_id etc * implement https://github.com/ytdl-org/youtube-dl/issues/31530#issuecomment-1435734719 * update affected tests * misc clean-ups --- youtube_dl/extractor/youtube.py | 319 +++++++++++++++++++------------- 1 file changed, 194 insertions(+), 125 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 66b0257df..4246d84f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,6 +31,7 @@ from ..utils import ( get_element_by_attribute, int_or_none, js_to_json, + merge_dicts, mimetype2ext, parse_codecs, parse_duration, @@ -400,6 +401,62 @@ class YoutubeBaseInfoExtractor(InfoExtractor): break data['continuation'] = token + @staticmethod + def _owner_endpoints_path(): + return [ + Ellipsis, + lambda k, _: k.endswith('SecondaryInfoRenderer'), + ('owner', 'videoOwner'), 'videoOwnerRenderer', 'title', + 'runs', Ellipsis] + + def _extract_channel_id(self, webpage, videodetails={}, metadata={}, renderers=[]): + channel_id = None + if any((videodetails, metadata, renderers)): + channel_id = ( + traverse_obj(videodetails, 'channelId') + or traverse_obj(metadata, 'externalChannelId', 'externalId') + or traverse_obj(renderers, + self._owner_endpoints_path() + [ + 'navigationEndpoint', 'browseEndpoint', 'browseId'], + get_all=False) + ) + return channel_id or self._html_search_meta( + 'channelId', webpage, 'channel id', default=None) + + def _extract_author_var(self, webpage, var_name, + videodetails={}, metadata={}, renderers=[]): + result = None + paths = { + # (HTML, 
videodetails, metadata, renderers) + 'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']), + 'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl', + ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']) + } + if any((videodetails, metadata, renderers)): + result = ( + traverse_obj(videodetails, paths[var_name][1], get_all=False) + or traverse_obj(metadata, paths[var_name][2], get_all=False) + or traverse_obj(renderers, + self._owner_endpoints_path() + paths[var_name][3], + get_all=False) + ) + return result or traverse_obj( + extract_attributes(self._search_regex( + r'''(?s)(]+\bitemprop\s*=\s*("|')%s\2[^>]*>)''' + % re.escape(var_name), + get_element_by_attribute('itemprop', 'author', webpage) or '', + 'author link', default='')), + paths[var_name][0]) + + @staticmethod + def _yt_urljoin(url_or_path): + return urljoin('https://www.youtube.com', url_or_path) + + def _extract_uploader_id(self, uploader_url): + return self._search_regex( + r'/(?:(?:channel|user)/|(?=@))([^/?&#]+)', uploader_url or '', + 'uploader id', default=None) + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -516,8 +573,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'uploader_id': '@PhilippHagemeister', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@PhilippHagemeister', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -557,8 +614,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'uploader_id': '@PhilippHagemeister', + 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/@PhilippHagemeister', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], @@ -588,7 +645,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, - 'skip': 'format 141 not served anymore', + 'skip': 'format 141 not served any more', }, # DASH manifest with encrypted signature { @@ -600,7 +657,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', + 'uploader_id': '@AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, }, @@ -618,8 +675,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 219, 'upload_date': '20100909', 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'uploader_id': '@theamazingatheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -635,8 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'uploader_id': '@thewitcher', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@thewitcher', 'upload_date': '20140605', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, @@ -659,7 
+716,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'duration': 177, 'uploader': 'FlyingKitty', - 'uploader_id': 'FlyingKitty900', + 'uploader_id': '@FlyingKitty900', 'upload_date': '20200408', 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', 'age_limit': 18, @@ -682,7 +739,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:17eccca93a786d51bc67646756894066', 'duration': 106, 'uploader': 'Projekt Melody', - 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'uploader_id': '@ProjektMelody', 'upload_date': '20191227', 'age_limit': 18, 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -704,10 +761,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', 'description': 'Fan Video. Music & Lyrics by OOMPH!.', 'duration': 210, - 'uploader': 'Herr Lurik', - 'uploader_id': 'st3in234', 'upload_date': '20130730', - 'uploader_url': 'http://www.youtube.com/user/st3in234', + 'uploader': 'Herr Lurik', + 'uploader_id': '@HerrLurik', + 'uploader_url': 'http://www.youtube.com/@HerrLurik', 'age_limit': 0, 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg', 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'], @@ -740,8 +797,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 266, 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'uploader_id': '@deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@deadmau5', 'creator': 'deadmau5', 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', 'uploader': 'deadmau5', @@ -762,8 +819,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s)(?:.+\s)?HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*', 'duration': 6085, 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'uploader_id': '@Olympics', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@Olympics', 'uploader': r're:Olympics?', 'age_limit': 0, 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg', @@ -785,8 +842,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'duration': 85, 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'uploader_id': '@AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', @@ -824,7 +881,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'dorappi2000', 'formats': 'mincount:31', }, - 'skip': 'not actual anymore', + 'skip': 'not actual any more', }, # DASH manifest with segment_list { @@ -905,6 +962,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Not multifeed any more', }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -914,7 +972,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', }, 'playlist_count': 2, - 'skip': 'Not multifeed anymore', + 'skip': 'Not multifeed any more', }, { 'url': 'https://vid.plus/FlRa-iH7PGw', @@ -938,8 +996,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', + 'uploader_id': '@IronSoulElf', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@IronSoulElf', 'uploader': 'IronSoulElf', 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', 'track': 'Dark Walk', @@ -987,8 +1045,8 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:a677553cf0840649b731a3024aeff4cc', 'duration': 721, 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader_id': '@BKCHarvard', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@BKCHarvard', 'uploader': 'The Berkman Klein Center for Internet & Society', 'license': 'Creative Commons Attribution license (reuse allowed)', }, @@ -1007,8 +1065,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_id': '@BernieSanders', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@BernieSanders', 'license': 'Creative Commons Attribution license (reuse allowed)', }, 'params': { @@ -1054,8 +1112,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', + 'uploader_id': '@Vsauce', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@Vsauce', 'series': 'Mind Field', 'season_number': 1, 'episode_number': 1, @@ -1134,7 +1192,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, - 'skip': 'not actual anymore', + 'skip': 'not actual any more', }, { # Youtube Music Auto-generated description @@ -1191,8 +1249,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'IMG 3456', 'description': '', 'upload_date': '20170613', - 'uploader_id': 'ElevageOrVert', 'uploader': 'ElevageOrVert', + 'uploader_id': '@ElevageOrVert', }, 'params': { 'skip_download': True, @@ -1210,8 +1268,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Part 77 Sort a list of simple types in c#', 'description': 
'md5:b8746fa52e10cdbf47997903f13b20dc', 'upload_date': '20130831', - 'uploader_id': 'kudvenkat', 'uploader': 'kudvenkat', + 'uploader_id': '@Csharp-video-tutorialsBlogspot', }, 'params': { 'skip_download': True, @@ -1263,8 +1321,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', 'upload_date': '20201120', 'uploader': 'Walk around Japan', - 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_id': '@walkaroundjapan7124', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@walkaroundjapan7124', }, 'params': { 'skip_download': True, @@ -1276,11 +1334,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': '4L2J27mJ3Dc', 'ext': 'mp4', + 'title': 'Midwest Squid Game #Shorts', + 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d', 'upload_date': '20211025', 'uploader': 'Charlie Berens', - 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d', - 'uploader_id': 'fivedlrmilkshake', - 'title': 'Midwest Squid Game #Shorts', + 'uploader_id': '@CharlieBerens', }, 'params': { 'skip_download': True, @@ -2088,25 +2146,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails = [{'url': thumbnail}] category = microformat.get('category') or search_meta('genre') - channel_id = video_details.get('channelId') \ - or microformat.get('externalChannelId') \ - or search_meta('channelId') + channel_id = self._extract_channel_id( + webpage, videodetails=video_details, metadata=microformat) duration = int_or_none( video_details.get('lengthSeconds') or microformat.get('lengthSeconds')) \ or parse_duration(search_meta('duration')) is_live = video_details.get('isLive') - def gen_owner_profile_url(): - yield microformat.get('ownerProfileUrl') - yield extract_attributes(self._search_regex( - r'''(?s)(]+\bitemprop\s*=\s*("|')url\2[^>]*>)''', - get_element_by_attribute('itemprop', 'author', webpage), - 'owner_profile_url', 
default='')).get('href') + owner_profile_url = self._yt_urljoin(self._extract_author_var( + webpage, 'url', videodetails=video_details, metadata=microformat)) - owner_profile_url = next( - (x for x in map(url_or_none, gen_owner_profile_url()) if x), - None) + uploader = self._extract_author_var( + webpage, 'name', videodetails=video_details, metadata=microformat) if not player_url: player_url = self._extract_player_url(webpage) @@ -2121,13 +2173,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': unified_strdate( microformat.get('uploadDate') or search_meta('uploadDate')), - 'uploader': video_details['author'], - 'uploader_id': self._search_regex( - r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, - 'uploader id', fatal=False) if owner_profile_url else None, - 'uploader_url': owner_profile_url, + 'uploader': uploader, 'channel_id': channel_id, - 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, 'duration': duration, 'view_count': int_or_none( video_details.get('viewCount') @@ -2257,6 +2304,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] + if not info['channel_id']: + channel_id = self._extract_channel_id('', renderers=contents) + if not info['uploader']: + info['uploader'] = self._extract_author_var('', 'name', renderers=contents) + if not owner_profile_url: + owner_profile_url = self._yt_urljoin(self._extract_author_var('', 'url', renderers=contents)) + for content in contents: vpir = content.get('videoPrimaryInfoRenderer') if vpir: @@ -2304,10 +2358,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) vsir = content.get('videoSecondaryInfoRenderer') if vsir: - info['channel'] = get_text(try_get( - vsir, - lambda x: x['owner']['videoOwnerRenderer']['title'], - dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], @@ -2365,7 +2415,14 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): self.mark_watched(video_id, player_response) - return info + return merge_dicts( + info, { + 'uploader_id': self._extract_uploader_id(owner_profile_url), + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': channel_id and self._yt_urljoin('/channel/' + channel_id), + 'channel': info['uploader'], + }) class YoutubeTabIE(YoutubeBaseInfoExtractor): @@ -2394,6 +2451,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'description': 'Short clips from Super Cooper Sundays!', 'id': 'UCKMA8kHZ8bPYpnMNaUSxfEQ', 'title': 'Super Cooper Shorts - Shorts', + 'uploader': 'Super Cooper Shorts', + 'uploader_id': '@SuperCooperShorts', } }, { # Channel that does not have a Shorts tab. Test should just download videos on Home tab instead @@ -2404,14 +2463,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Emergency Awesome - Home', }, 'playlist_mincount': 5, + 'skip': 'new test page needed to replace `Emergency Awesome - Shorts`', }, { # playlists, multipage 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Igor Kleiner', + 'uploader_id': '@IgorDataScience', }, }, { # playlists, multipage, different order @@ -2419,8 +2481,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Igor Kleiner', + 'uploader_id': '@IgorDataScience', }, }, { # playlists, series @@ -2430,6 +2494,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Playlists', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 
'uploader': '3Blue1Brown', + 'uploader_id': '@3blue1brown', }, }, { # playlists, singlepage @@ -2439,6 +2505,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', 'title': 'ThirstForScience - Playlists', 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': '@ThirstForScience', } }, { 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', @@ -2447,20 +2515,22 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # basic, single video playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'title': 'youtube-dl public playlist', + 'uploader': 'Sergey M.', + 'uploader_id': '@sergeym.6173', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', }, 'playlist_count': 1, }, { # empty playlist 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'title': 'youtube-dl empty playlist', + 'uploader': 'Sergey M.', + 'uploader_id': '@sergeym.6173', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', }, 'playlist_count': 0, }, { @@ -2470,6 +2540,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Home', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 2, }, { @@ -2479,6 +2551,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 975, }, { @@ -2488,6 +2562,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - 
Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 199, }, { @@ -2497,6 +2573,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Playlists', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 17, }, { @@ -2506,6 +2584,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, 'playlist_mincount': 18, }, { @@ -2515,8 +2595,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', }, - 'playlist_mincount': 138, + 'playlist_mincount': 75, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, @@ -2533,7 +2615,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'uploader_id': '@ChRiStIaAn008', + 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', }, 'playlist_count': 96, }, { @@ -2543,7 +2626,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader_id': '@Cauchemar89', + 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 1123, }, { @@ -2557,7 +2641,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + 
'uploader_id': '@InterstellarMovie', + 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }, { @@ -2566,8 +2651,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'uploader': 'Computerphile', + 'uploader_id': '@Computerphile', + 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', }, 'playlist_mincount': 11, }, { @@ -2605,14 +2691,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': '9Auq9mYxFEE', + 'id': r're:[\da-zA-Z_-]{8,}', 'ext': 'mp4', - 'title': 'Watch Sky News live', + 'title': r're:(?s)[A-Z].{20,}', 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': '20191102', - 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'uploader_id': '@SkyNews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@SkyNews', + 'upload_date': r're:\d{8}', + 'description': r're:(?s)(?:.*\n)+SUBSCRIBE to our YouTube channel for more videos: http://www\.youtube\.com/skynews *\n.*', 'categories': ['News & Politics'], 'tags': list, 'like_count': int, @@ -2701,34 +2787,22 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, + 'playlist_mincount': 20, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader_id': '@3blue1brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', } }] @classmethod def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( + return not YoutubeIE.suitable(url) and super( YoutubeTabIE, 
cls).suitable(url) - def _extract_channel_id(self, webpage): - channel_id = self._html_search_meta( - 'channelId', webpage, 'channel id', default=None) - if channel_id: - return channel_id - channel_url = self._html_search_meta( - ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', - 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', - 'twitter:app:url:googleplay'), webpage, 'channel url') - return self._search_regex( - r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', - channel_url, 'channel id') - @staticmethod def _extract_grid_item_renderer(item): assert isinstance(item, dict) @@ -3116,27 +3190,18 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): else: raise ExtractorError('Unable to find selected tab') - @staticmethod - def _extract_uploader(data): + def _extract_uploader(self, metadata, data): uploader = {} - sidebar_renderer = try_get( - data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) - if sidebar_renderer: - for item in sidebar_renderer: - if not isinstance(item, dict): - continue - renderer = item.get('playlistSidebarSecondaryInfoRenderer') - if not isinstance(renderer, dict): - continue - owner = try_get( - renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) - if owner: - uploader['uploader'] = owner.get('text') - uploader['uploader_id'] = try_get( - owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) - uploader['uploader_url'] = urljoin( - 'https://www.youtube.com/', - try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + renderers = traverse_obj(data, + ('sidebar', 'playlistSidebarRenderer', 'items')) + uploader['channel_id'] = self._extract_channel_id('', metadata=metadata, renderers=renderers) + uploader['uploader'] = ( + self._extract_author_var('', 'name', renderers=renderers) + or self._extract_author_var('', 'name', metadata=metadata)) + uploader['uploader_url'] = self._yt_urljoin( + 
self._extract_author_var('', 'url', metadata=metadata, renderers=renderers)) + uploader['uploader_id'] = self._extract_uploader_id(uploader['uploader_url']) + uploader['channel'] = uploader['uploader'] return uploader @staticmethod @@ -3187,8 +3252,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) - playlist.update(self._extract_uploader(data)) - return playlist + return merge_dicts(playlist, self._extract_uploader(renderer, data)) def _extract_from_playlist(self, item_id, url, data, playlist): title = playlist.get('title') or try_get( @@ -3275,8 +3339,9 @@ class YoutubePlaylistIE(InfoExtractor): 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + 'uploader': 'Wickman', + 'uploader_id': '@WickmanVT', + 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', }, 'playlist_mincount': 29, }, { @@ -3290,21 +3355,25 @@ class YoutubePlaylistIE(InfoExtractor): }, { 'note': 'embedded', 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, + # TODO: full playlist requires _reload_with_unavailable_videos() + # 'playlist_count': 4, + 'playlist_mincount': 1, 'info_dict': { 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + 'uploader_id': '@milan5503', + 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', } }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 982, + 'playlist_mincount': 455, 'info_dict': { 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'uploader': 'LBK', - 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + 'uploader_id': '@music_king', + 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', } }, { 'url': 
'TLGGrESM50VT6acwMjAyMjAxNw', @@ -3342,8 +3411,8 @@ class YoutubeYtBeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Small Scale Baler and Braiding Rugs', 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'uploader_id': '@backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum', 'upload_date': '20161008', 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', 'categories': ['Nonprofits & Activism'], From 3da17834a49fad2a97c308fdd89aa26781ef4d60 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 28 Feb 2023 23:03:44 +0530 Subject: [PATCH 1456/1705] [Youtube] Construct dash formats with `range` query See yt-dlp/yt_dlp#6369 --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4246d84f9..89711c84e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1694,8 +1694,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if n_response is None: # give up if descrambling failed break - fmt['url'] = update_url( - parsed_fmt_url, query_update={'n': [n_response]}) + for fmt_dct in traverse_obj(fmt, (None, (None, ('fragments', Ellipsis))), expected_type=dict): + fmt_dct['url'] = update_url( + fmt_dct['url'], query_update={'n': [n_response]}) # from yt-dlp, with tweaks def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -2047,10 +2048,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if no_video: dct['abr'] = tbr if no_audio or no_video: - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } + CHUNK_SIZE = 10 << 20 + # avoid Youtube throttling + dct.update({ + 'protocol': 'http_dash_segments', + 'fragments': [{ + 'url': update_url_query(dct['url'], { + 'range': '{0}-{1}'.format(range_start, 
min(range_start + CHUNK_SIZE - 1, dct['filesize'])) + }) + } for range_start in range(0, dct['filesize'], CHUNK_SIZE)] + } if dct['filesize'] else { + 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful? + }) + if dct.get('ext'): dct['container'] = dct['ext'] + '_dash' formats.append(dct) From 3e92c60fcd94c37428d57153dbdd14cd0a1f9226 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 3 Mar 2023 16:48:54 +0530 Subject: [PATCH 1457/1705] [jsinterp] Handle `Date` at epoch 0 See yt-dlp/yt_dlp#6400 --- test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index ac37ffa45..decf7ee38 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -67,6 +67,10 @@ _SIG_TESTS = [ ] _NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/7862ca1f/player_ias.vflset/en_US/base.js', + 'X_LCxVDjAavgE5t', 'yxJ1dM6iz5ogUg', + ), ( 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a3bc42a61..e28670a3f 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -405,7 +405,7 @@ class JSInterpreter(object): left, right = self._separate_at_paren(obj[len(klass):]) argvals = self.interpret_iter(left, local_vars, allow_recursion) expr = konstr(*argvals) - if not expr: + if expr is None: raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) expr = self._dump(expr, local_vars) + right break From 040271022709c4d20d33c604d1dbc72dc2da472d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Mar 2023 23:07:07 +0000 Subject: [PATCH 1458/1705] [jsinterp] Fix regexp parsing and .replace[All] method * For performance, make regexp object instantiation lazy * Other small performance improvements --- test/test_jsinterp.py | 46 ++++++++++++++++++----- 
youtube_dl/jsinterp.py | 84 ++++++++++++++++++++++++++++-------------- 2 files changed, 93 insertions(+), 37 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b5962356c..5d129433d 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -139,21 +139,16 @@ class TestJSInterpreter(unittest.TestCase): self.assertTrue(math.isnan(jsi.call_function('x'))) def test_Date(self): - jsi = JSInterpreter(''' - function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) - jsi = JSInterpreter(''' function x(dt) { return new Date(dt) - 0; } ''') self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) # date format m/d/y - jsi = JSInterpreter(''' - function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) + self.assertEqual(jsi.call_function('x', '12/31/1969 18:01:26 MDT'), 86000) + + # epoch 0 + self.assertEqual(jsi.call_function('x', '1 January 1970 00:00:00 UTC'), 0) def test_call(self): jsi = JSInterpreter(''' @@ -445,7 +440,7 @@ class TestJSInterpreter(unittest.TestCase): self.assertIs(jsi.call_function('x'), None) jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; return a; } + function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; } ''') attrs = set(('findall', 'finditer', 'flags', 'groupindex', 'groups', 'match', 'pattern', 'scanner', @@ -457,6 +452,31 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace("data-", ""); return a } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = JSInterpreter(r''' + function x() { let 
a="data-name".replace(/^.+-/, ""); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'name') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replace(/a/g, "o"); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'doto-nome') + + jsi = JSInterpreter(r''' + function x() { let a="data-name".replaceAll("a", "o"); return a; } + ''') + self.assertEqual(jsi.call_function('x'), 'doto-nome') + jsi = JSInterpreter(r''' function x() { let a=[/[)\\]/]; return a[0]; } ''') @@ -485,6 +505,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + """ # fails so far + def test_packed(self): + jsi = JSInterpreter('''function x(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') + self.assertEqual(jsi.call_function('x', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 
10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 
9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9Pw
XY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + """ + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index e28670a3f..ab7d6f926 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -12,9 +12,11 @@ from .utils import ( js_to_json, remove_quotes, unified_timestamp, + variadic, ) from .compat import ( compat_basestring, + compat_chr, compat_collections_chain_map as ChainMap, compat_itertools_zip_longest as zip_longest, compat_str, @@ -205,10 +207,10 @@ class JSInterpreter(object): super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) class JS_RegExp(object): - _RE_FLAGS = { + RE_FLAGS = { # special knowledge: Python's re flags are bitmask values, current max 128 # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags + # TODO: execute matches with these flags (remaining: d, y) 'd': 1024, # Generate indices for substring matches 'g': 2048, # Global search 'i': re.I, # Case-insensitive search @@ -218,12 +220,19 @@ class JSInterpreter(object): 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string } - def __init__(self, pattern_txt, flags=''): + def __init__(self, pattern_txt, flags=0): if isinstance(flags, compat_str): flags, _ = self.regex_flags(flags) - # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern # First, avoid https://github.com/python/cpython/issues/74534 - self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + self.__self = None + self.__pattern_txt = pattern_txt.replace('[[', r'[\[') + self.__flags = flags + + def __instantiate(self): + if self.__self: + return + self.__self = re.compile(self.__pattern_txt, self.__flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern for name in dir(self.__self): # Only 
these? Obviously __class__, __init__. # PyPy creates a __weakref__ attribute with value None @@ -232,15 +241,21 @@ class JSInterpreter(object): continue setattr(self, name, getattr(self.__self, name)) + def __getattr__(self, name): + self.__instantiate() + if hasattr(self, name): + return getattr(self, name) + return super(JSInterpreter.JS_RegExp, self).__getattr__(name) + @classmethod def regex_flags(cls, expr): flags = 0 if not expr: return flags, expr for idx, ch in enumerate(expr): - if ch not in cls._RE_FLAGS: + if ch not in cls.RE_FLAGS: break - flags |= cls._RE_FLAGS[ch] + flags |= cls.RE_FLAGS[ch] return flags, expr[idx + 1:] @classmethod @@ -265,17 +280,17 @@ class JSInterpreter(object): counters = dict((k, 0) for k in _MATCHING_PARENS.values()) start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, skipping = None, False, 0 - after_op, in_regex_char_group, skip_re = True, False, 0 + after_op, in_regex_char_group = True, False for idx, char in enumerate(expr): - if skip_re > 0: - skip_re -= 1 - continue + paren_delta = 0 if not in_quote: if char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 + paren_delta = 1 elif char in counters: counters[char] -= 1 + paren_delta = -1 if not escaping: if char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': @@ -283,7 +298,7 @@ class JSInterpreter(object): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) + after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and char.isspace())) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -293,7 +308,7 @@ class JSInterpreter(object): continue elif pos == 0 and skip_delims: here = expr[idx:] - for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]: + for s in 
variadic(skip_delims): if here.startswith(s) and s: skipping = len(s) - 1 break @@ -316,7 +331,7 @@ class JSInterpreter(object): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) + raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @staticmethod @@ -361,6 +376,20 @@ class JSInterpreter(object): except TypeError: return self._named_object(namespace, obj) + # used below + _VAR_RET_THROW_RE = re.compile(r'''(?x) + (?P(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?Pthrow\s+) + ''') + _COMPOUND_RE = re.compile(r'''(?x) + (?Ptry)\s*\{| + (?Pif)\s*\(| + (?Pswitch)\s*\(| + (?Pfor)\s*\(| + (?Pwhile)\s*\( + ''') + _FINALLY_RE = re.compile(r'finally\s*\{') + _SWITCH_RE = re.compile(r'switch\s*\(') + def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise self.Exception('Recursion limit reached') @@ -375,7 +404,7 @@ class JSInterpreter(object): if should_return: return ret, should_return - m = re.match(r'(?P(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?Pthrow\s+)', stmt) + m = self._VAR_RET_THROW_RE.match(stmt) if m: expr = stmt[len(m.group(0)):].strip() if m.group('throw'): @@ -447,13 +476,7 @@ class JSInterpreter(object): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'''(?x) - (?Ptry)\s*\{| - (?Pif)\s*\(| - (?Pswitch)\s*\(| - (?Pfor)\s*\(| - (?Pwhile)\s*\( - ''', expr) + m = self._COMPOUND_RE.match(expr) md = m.groupdict() if m else {} if md.get('if'): cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) @@ -512,7 +535,7 @@ class JSInterpreter(object): err = None pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion) - m = re.match(r'finally\s*\{', expr) + m = self._FINALLY_RE.match(expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) ret, should_abort = 
self.interpret_statement(sub_expr, local_vars, allow_recursion) @@ -531,7 +554,7 @@ class JSInterpreter(object): if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: - switch_m = re.match(r'switch\s*\(', remaining) # FIXME + switch_m = self._SWITCH_RE.match(remaining) # FIXME if switch_m: switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') @@ -735,7 +758,7 @@ class JSInterpreter(object): if obj == compat_str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') - return ''.join(map(chr, argvals)) + return ''.join(map(compat_chr, argvals)) raise self.Exception('Unsupported string method ' + member, expr=expr) elif obj == float: if member == 'pow': @@ -808,10 +831,17 @@ class JSInterpreter(object): if idx >= len(obj): return None return ord(obj[idx]) - elif member == 'replace': + elif member in ('replace', 'replaceAll'): assertion(isinstance(obj, compat_str), 'must be applied on a string') assertion(len(argvals) == 2, 'takes exactly two arguments') - return re.sub(argvals[0], argvals[1], obj) + # TODO: argvals[1] callable, other Py vs JS edge cases + if isinstance(argvals[0], self.JS_RegExp): + count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1 + assertion(member != 'replaceAll' or count == 0, + 'replaceAll must be called with a global RegExp') + return argvals[0].sub(argvals[1], obj, count=count) + count = ('replaceAll', 'replace').index(member) + return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) From 27d41d73655b8fbf2dedf88cac96220520d526b5 Mon Sep 17 00:00:00 2001 From: Sophira Date: Tue, 7 Mar 2023 15:49:31 +0000 Subject: [PATCH 1459/1705] [doc] Recommend "Get cookies.txt LOCALLY" extension in README.md (#31763) * remove link to suspect "Get cookies.txt" extension, dropped from 
Chrome store * link to new Manifest V3-compatible open-source "Get cookies.txt LOCALLY" extension. Fixes #31465. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e07ddb1c..227e34046 100644 --- a/README.md +++ b/README.md @@ -918,7 +918,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. 
From 8c86fd33dca48ebb505ed04150d9e35993b9fe7e Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 9 Mar 2023 16:40:30 +0000 Subject: [PATCH 1460/1705] [doc] Improve "guidance" on bug reporting --- README.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 227e34046..14a3d6c86 100644 --- a/README.md +++ b/README.md @@ -1408,7 +1408,11 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: # BUGS -Bugs and suggestions should be reported at: <https://github.com/ytdl-org/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported in the issue tracker: <https://github.com/ytdl-org/youtube-dl/issues> (<https://yt-dl.org/bug> is an alias for this). Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). + +## Opening a bug report or suggestion + +Be sure to follow instructions provided **below** and **in the issue tracker**. Complete the appropriate issue template fully. Consider whether your problem is covered by an existing issue: if so, follow the discussion there. Avoid commenting on existing duplicate issues as such comments do not add to the discussion of the issue and are liable to be treated as spam. **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting.
It should look similar to this: ``` @@ -1428,17 +1432,17 @@ $ youtube-dl -v The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. -Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): +Finally please review your issue to avoid various common mistakes (you can and should use this as a checklist) listed below. ### Is the description of the issue itself sufficient? -We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. +We often get issue reports that are hard to understand. To avoid subsequent clarifications, and to assist participants who are not native English speakers, please elaborate on what feature you are requesting, or what bug you want to be fixed. -So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious +Make sure that it's obvious - What the problem is - How it could be fixed -- How your proposed solution would look like +- How your proposed solution would look If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. @@ -1448,14 +1452,14 @@ If your server has multiple IPs or you suspect censorship, adding `--call-home` **Site support requests must contain an example URL**. 
An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. +### Is the issue already documented? + +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. Initially, at least, use the search term `-label:duplicate` to focus on active issues. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. + ### Are you using the latest version? Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. -### Is the issue already documented? - -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. - ### Why are existing options not enough? Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! 
Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. From 5c985d4f81a43ada75dafb23233e7fe39913907a Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 11 Mar 2023 12:09:55 +0000 Subject: [PATCH 1461/1705] [downloader] Let _ffmpeg_ handle DASH segments Fixes https://github.com/ytdl-org/youtube-dl/issues/31792 after 3da1783. --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bffcd10b6..1b6bd1fa2 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -273,7 +273,7 @@ class HttpieFD(ExternalFD): class FFmpegFD(ExternalFD): @classmethod def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms', 'http_dash_segments') @classmethod def available(cls): From baa6c5e95cb307e7d716645780ff8aef22de6aca Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 11 Mar 2023 12:17:00 +0000 Subject: [PATCH 1462/1705] [FragmentFD] Respect `--no-continue` * discard partial fragment on `--no-continue` * continue with correct progress display otherwise Resolves #21467 --- youtube_dl/downloader/common.py | 24 +++++++++++----- youtube_dl/downloader/dash.py | 10 +++---- youtube_dl/downloader/fragment.py | 46 +++++++++++++++++++++---------- youtube_dl/downloader/http.py | 15 ++++------ 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1cdba89cd..c86ce2aa5 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -88,17 +88,21 @@ class FileDownloader(object): return '---.-%' return '%6s' % ('%3.1f%%' % percent) - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def 
calc_eta(cls, start_or_rate, now_or_remaining, *args): + if len(args) < 2: + rate, remaining = (start_or_rate, now_or_remaining) + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + start, now = (start_or_rate, now_or_remaining) + total, current = args if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def format_eta(eta): @@ -123,6 +127,12 @@ class FileDownloader(object): def format_retries(retries): return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod + def filesize_or_none(unencoded_filename): + fn = encodeFilename(unencoded_filename) + if os.path.isfile(fn): + return os.path.getsize(fn) + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index c6d674bc6..cc30485f8 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,8 +38,7 @@ class DashSegmentsFD(FragmentFD): # In DASH, the first segment contains necessary headers to # generate a valid MP4 file, so always abort for the first segment fatal = i == 0 or not skip_unavailable_fragments - count = 0 - while count <= fragment_retries: + for count in range(fragment_retries + 1): try: fragment_url = fragment.get('url') if not fragment_url: @@ -57,9 +56,8 @@ class DashSegmentsFD(FragmentFD): # is usually enough) thus allowing to download the whole file successfully. # To be future-proof we will retry all fragments that fail with any # HTTP error. 
- count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count < fragment_retries: + self.report_retry_fragment(err, frag_index, count + 1, fragment_retries) except DownloadError: # Don't retry fragment if error occurred during HTTP downloading # itself since it has own retry settings @@ -68,7 +66,7 @@ class DashSegmentsFD(FragmentFD): break raise - if count > fragment_retries: + if count >= fragment_retries: if not fatal: self.report_skip_fragment(frag_index) continue diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 35c76feba..913e91b64 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -71,7 +71,7 @@ class FragmentFD(FileDownloader): @staticmethod def __do_ytdl_file(ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' + return ctx['live'] is not True and ctx['tmpfilename'] != '-' def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx @@ -101,6 +101,13 @@ class FragmentFD(FileDownloader): 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none( + self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = frag_resume_len + ctx['frag_resume_len'] = frag_resume_len or 0 + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None @@ -124,9 +131,7 @@ class FragmentFD(FileDownloader): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -136,10 +141,11 @@ class FragmentFD(FileDownloader): self.to_screen( '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) 
self.report_destination(ctx['filename']) + continuedl = self.params.get('continuedl', True) dl = HttpQuietDownloader( self.ydl, { - 'continuedl': True, + 'continuedl': continuedl, 'quiet': True, 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), @@ -150,12 +156,11 @@ class FragmentFD(FileDownloader): ) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) or 0 + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -164,7 +169,8 @@ class FragmentFD(FileDownloader): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -178,7 +184,12 @@ class FragmentFD(FileDownloader): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -209,6 +220,7 @@ class FragmentFD(FileDownloader): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -218,6 +230,9 @@ class FragmentFD(FileDownloader): if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + time_now = time.time() state['elapsed'] = time_now - start frag_total_bytes = 
s.get('total_bytes') or 0 @@ -232,16 +247,17 @@ class FragmentFD(FileDownloader): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) @@ -268,7 +284,7 @@ class FragmentFD(FileDownloader): os.utime(ctx['filename'], (time.time(), filetime)) except Exception: pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + downloaded_bytes = self.filesize_or_none(ctx['filename']) or 0 self._hook_progress({ 'downloaded_bytes': downloaded_bytes, diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index d8ac41dcc..440471aa0 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,9 +58,9 @@ class HttpFD(FileDownloader): if self.params.get('continuedl', True): # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) + ctx.resume_len = info_dict.get('frag_resume_len') + if ctx.resume_len is None: + ctx.resume_len = 
self.filesize_or_none(ctx.tmpfilename) or 0 ctx.is_resume = ctx.resume_len > 0 @@ -115,9 +115,9 @@ class HttpFD(FileDownloader): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked - # to match the value of requested Range HTTP header. This is due to a webservers + # to match the value of requested Range HTTP header. This is due to webservers # that don't support resuming and serve a whole file with no Content-Range - # set in response despite of requested Range (see + # set in response despite requested Range (see # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) if has_range: content_range = ctx.data.headers.get('Content-Range') @@ -293,10 +293,7 @@ class HttpFD(FileDownloader): # Progress message speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if ctx.data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - ctx.resume_len)) self._hook_progress({ 'status': 'downloading', From e8de54bce50f6f77a4d7e8e80675f7003d5bf630 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 13 Mar 2023 19:45:54 +0000 Subject: [PATCH 1463/1705] [core] Handle `/../` sequences in HTTP URLs * use Python's RFC implementation for embedded sequences * hack: strip unbalanced leading `../` from path, like eg Firefox See https://github.com/yt-dlp/yt-dlp/issues/3355 --- youtube_dl/YoutubeDL.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8e8546596..bcf781744 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,7 @@ from .compat import ( compat_str, compat_tokenize_tokenize, compat_urllib_error, + compat_urllib_parse, compat_urllib_request, compat_urllib_request_DataHandler, ) @@ -60,6 +61,7 @@ from .utils import ( format_bytes, 
formatSeconds, GeoRestrictedError, + HEADRequest, int_or_none, ISO3166Utils, locked_file, @@ -74,6 +76,7 @@ from .utils import ( preferredencoding, prepend_extension, process_communicate_or_kill, + PUTRequest, register_socks_protocols, render_table, replace_extension, @@ -2297,6 +2300,27 @@ class YoutubeDL(object): """ Start an HTTP download """ if isinstance(req, compat_basestring): req = sanitized_Request(req) + # an embedded /../ sequence is not automatically handled by urllib2 + # see https://github.com/yt-dlp/yt-dlp/issues/3355 + url = req.get_full_url() + parts = url.partition('/../') + if parts[1]: + url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2]) + if url: + # worse, URL path may have initial /../ against RFCs: work-around + # by stripping such prefixes, like eg Firefox + parts = compat_urllib_parse.urlsplit(url) + path = parts.path + while path.startswith('/../'): + path = path[3:] + url = parts._replace(path=path).geturl() + # get a new Request with the munged URL + if url != req.get_full_url(): + req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get( + req.get_method(), compat_urllib_request.Request) + req = req_type( + url, data=req.data, headers=dict(req.header_items()), + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): From 70ff01391068c98b4377c5cc17a8d00d5645e734 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 14 Mar 2023 00:58:59 +0000 Subject: [PATCH 1464/1705] [devscripts] Add a hack to convert command-line options to API options --- devscripts/cli_to_api.py | 64 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 devscripts/cli_to_api.py diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py new file mode 100755 index 000000000..2f4d6a458 --- /dev/null +++ b/devscripts/cli_to_api.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# coding: utf-8 + +from 
__future__ import unicode_literals + +""" +This script displays the API parameters corresponding to a yt-dl command line + +Example: +$ ./cli_to_api.py -f best +{u'format': 'best'} +$ +""" + +# Allow direct execution +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import youtube_dl +from types import MethodType + + +def cli_to_api(*opts): + YDL = youtube_dl.YoutubeDL + + # to extract the parsed options, break out of YoutubeDL instantiation + + # return options via this Exception + class ParseYTDLResult(Exception): + def __init__(self, result): + super(ParseYTDLResult, self).__init__('result') + self.opts = result + + # replacement constructor that raises ParseYTDLResult + def ytdl_init(ydl, ydl_opts): + super(YDL, ydl).__init__(ydl_opts) + raise ParseYTDLResult(ydl_opts) + + # patch in the constructor + YDL.__init__ = MethodType(ytdl_init, YDL) + + # core parser + def parsed_options(argv): + try: + youtube_dl._real_main(list(argv)) + except ParseYTDLResult as result: + return result.opts + + # from https://github.com/yt-dlp/yt-dlp/issues/5859#issuecomment-1363938900 + default = parsed_options([]) + diff = dict((k, v) for k, v in parsed_options(opts).items() if default[k] != v) + if 'postprocessors' in diff: + diff['postprocessors'] = [pp for pp in diff['postprocessors'] if pp not in default['postprocessors']] + return diff + + +def main(): + from pprint import pprint + pprint(cli_to_api(*sys.argv)) + + +if __name__ == '__main__': + main() From 6fece0a96b3cd8677f5c1185a57c6e21403fcb44 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 14 Mar 2023 13:01:32 +0000 Subject: [PATCH 1465/1705] [AENetworksBaseIE] Report missing show data instead of crash --- youtube_dl/extractor/aenetworks.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 2a1f08e39..59fbe048a 100644 --- 
a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -8,6 +8,8 @@ from ..utils import ( ExtractorError, GeoRestrictedError, int_or_none, + remove_start, + traverse_obj, update_url_query, urlencode_postdata, ) @@ -33,14 +35,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', @@ -75,7 +80,14 @@ class AENetworksBaseIE(ThePlatformIE): requestor_id, brand = self._DOMAIN_MAP[domain] result = self._download_json( 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + filter_value, query={'filter[%s]' % filter_key: filter_value}) + result = traverse_obj( + result, ('results', + lambda k, v: k == 0 and v[filter_key] == filter_value), + get_all=False) + if not result: + raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, + video_id=remove_start(filter_value, '/')) title = result['title'] video_id = result['id'] media_url = result['publicUrl'] @@ -126,7 +138,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' 
}, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -143,6 +155,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True From 45495228b7a6728b7e764bbcf1f38490cd3d8697 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 19 Mar 2023 00:51:44 +0000 Subject: [PATCH 1466/1705] [downloader/http] Only check for resumability when actually resuming --- test/test_downloader_http.py | 2 +- youtube_dl/downloader/http.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 4e6d7a2a0..6af86ae48 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -88,7 +88,7 @@ class TestHttpFD(unittest.TestCase): self.assertTrue(downloader.real_download(filename, { 'url': 'http://127.0.0.1:%d/%s' % (self.port, ep), })) - self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE, ep) try_rm(encodeFilename(filename)) def download_all(self, params): diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 440471aa0..28a49b9e8 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -141,7 +141,8 @@ class HttpFD(FileDownloader): # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + if range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) From f35b757c826027ab5263d431bbe363c6403bd66d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 19 Mar 2023 02:27:46 +0000 Subject: [PATCH 1467/1705] [utils] Ensure `allow_types` for `variadic()` is a tuple --- test/test_utils.py | 1 + youtube_dl/utils.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index ea2b96ed2..b85d397d0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1563,6 +1563,7 @@ Line 1 self.assertEqual(variadic(None), (None, )) self.assertEqual(variadic('spam'), ('spam', )) self.assertEqual(variadic('spam', allowed_types=dict), 'spam') + self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') def test_traverse_obj(self): _TEST_DATA = { diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 761edcd49..f3c7af437 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4213,6 +4213,8 @@ def multipart_encode(data, boundary=None): def variadic(x, allowed_types=(compat_str, bytes, dict)): + if not isinstance(allowed_types, tuple) and isinstance(allowed_types, compat_collections_abc.Iterable): + allowed_types = tuple(allowed_types) return x if isinstance(x, compat_collections_abc.Iterable) and not isinstance(x, allowed_types) else (x,) From 88f28f620bcae7ba7302f8b049b74f0f8a12831f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 12 Mar 2023 14:46:09 +0530 Subject: [PATCH 1468/1705] [extractor/youtube] Construct fragment list lazily Ref: yt-dlp/yt-dlp/commit/e389d17 See: yt-dlp/yt-dlp#6517 --- youtube_dl/extractor/youtube.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 89711c84e..6b153193c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,6 +31,7 @@ from ..utils import ( get_element_by_attribute, int_or_none, js_to_json, + LazyList, merge_dicts, mimetype2ext, parse_codecs, @@ -1986,9 +1987,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): itags = [] itag_qualities = {} q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + CHUNK_SIZE = 10 << 20 + streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + + def build_fragments(f): + return LazyList({ + 'url': update_url_query(f['url'], { + 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])) + }) + } for range_start in range(0, f['filesize'], CHUNK_SIZE)) + for fmt in streaming_formats: if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue @@ -2048,15 +2059,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if no_video: dct['abr'] = tbr if no_audio or no_video: - CHUNK_SIZE = 10 << 20 # avoid Youtube throttling dct.update({ 'protocol': 'http_dash_segments', - 'fragments': [{ - 'url': update_url_query(dct['url'], { - 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, dct['filesize'])) - }) - } for range_start in range(0, dct['filesize'], CHUNK_SIZE)] + 'fragments': build_fragments(dct), } if dct['filesize'] else { 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful? 
}) From 3f6d2bd76f3393eef90896dfabc2d8dde37c2009 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 9 Mar 2023 22:09:23 +0530 Subject: [PATCH 1469/1705] [extractor/youtube] Bypass throttling for `-f17` and related cleanup Thanks @AudricV for the finding Ref: yt-dlp/yt-dlp/commit/c9abebb --- youtube_dl/extractor/youtube.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6b153193c..ae3416b20 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2052,13 +2052,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mobj: dct['ext'] = mimetype2ext(mobj.group(1)) dct.update(parse_codecs(mobj.group(2))) - no_audio = dct.get('acodec') == 'none' - no_video = dct.get('vcodec') == 'none' - if no_audio: - dct['vbr'] = tbr - if no_video: - dct['abr'] = tbr - if no_audio or no_video: + single_stream = 'none' in (dct.get(c) for c in ('acodec', 'vcodec')) + if single_stream and dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' + if single_stream or itag == '17': # avoid Youtube throttling dct.update({ 'protocol': 'http_dash_segments', @@ -2067,8 +2064,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful? 
}) - if dct.get('ext'): - dct['container'] = dct['ext'] + '_dash' formats.append(dct) hls_manifest_url = streaming_data.get('hlsManifestUrl') From cdf40b6aa651d949ce01e9bec1a11f792e8af899 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 3 Apr 2023 21:07:10 +0100 Subject: [PATCH 1470/1705] [test] Update tests for Ubuntu 20.04 * 18.04 test runner was withdrawn * for now, disable Py 3.3/3.4 tests --- .github/workflows/ci.yml | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a609f3704..51abdce1d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,9 +7,10 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-18.04] + os: [ubuntu-20.04] # TODO: python 2.6 - python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + # TODO: restore support for 3.3, 3.4 + python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] python-impl: [cpython] ytdl-test-set: [core, download] run-tests-ext: [sh] @@ -26,26 +27,27 @@ jobs: ytdl-test-set: download run-tests-ext: bat # jython - - os: ubuntu-18.04 + - os: ubuntu-20.04 python-impl: jython ytdl-test-set: core run-tests-ext: sh - - os: ubuntu-18.04 + - os: ubuntu-20.04 python-impl: jython ytdl-test-set: download run-tests-ext: sh steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - if: ${{ matrix.python-impl == 'cpython' }} + - uses: actions/checkout@v3 + - name: Set up supported Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + if: ${{ matrix.python-impl == 'cpython' && ! 
contains(fromJSON('["3.3", "3.4"]'), matrix.python-version) }} with: python-version: ${{ matrix.python-version }} - name: Set up Java 8 if: ${{ matrix.python-impl == 'jython' }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v2 with: java-version: 8 + distribution: 'zulu' - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | @@ -70,9 +72,9 @@ jobs: name: Linter runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install flake8 From 557dbac173c30a51acd284b46f2d5460e539f51a Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 5 Apr 2023 18:29:24 +0100 Subject: [PATCH 1471/1705] [FragmentFD] Fix iteration with infinite limit * fixes ytdl-org/youtube-dl/baa6c5e * resolves #31885 --- youtube_dl/downloader/dash.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index cc30485f8..67a8e173f 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import itertools + from .fragment import FragmentFD from ..compat import compat_urllib_error from ..utils import ( @@ -30,15 +32,13 @@ class DashSegmentsFD(FragmentFD): fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - frag_index = 0 - for i, fragment in enumerate(fragments): - frag_index += 1 + for frag_index, fragment in enumerate(fragments, 1): if frag_index <= ctx['fragment_index']: continue # In DASH, the first segment contains necessary headers to # generate a valid MP4 file, so always abort for the first segment - fatal = i == 0 or not skip_unavailable_fragments - for count in range(fragment_retries + 1): + fatal = frag_index == 1 or not skip_unavailable_fragments + for count 
in itertools.count(): try: fragment_url = fragment.get('url') if not fragment_url: @@ -48,7 +48,6 @@ class DashSegmentsFD(FragmentFD): if not success: return False self._append_fragment(ctx, frag_content) - break except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately @@ -58,13 +57,14 @@ class DashSegmentsFD(FragmentFD): # HTTP error. if count < fragment_retries: self.report_retry_fragment(err, frag_index, count + 1, fragment_retries) + continue except DownloadError: # Don't retry fragment if error occurred during HTTP downloading - # itself since it has own retry settings - if not fatal: - self.report_skip_fragment(frag_index) - break - raise + # itself since it has its own retry settings + if fatal: + raise + self.report_skip_fragment(frag_index) + break if count >= fragment_retries: if not fatal: From 78da22489b483988e198a8352893df9c6cf34032 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 5 Apr 2023 18:39:54 +0100 Subject: [PATCH 1472/1705] [compat] Add and use `compat_open()` like Py3 `open()` * resolves FIXME: ytdl-org/youtube-dl/commit/dfe5fa4 --- youtube_dl/compat.py | 11 +++++++++++ youtube_dl/options.py | 6 ++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 39551f810..fe62caf80 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3127,6 +3127,16 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) +if sys.version_info < (3, 0): + # open(file, mode='r', buffering=- 1, encoding=None, errors=None, newline=None, closefd=True) not: opener=None + def compat_open(file_, *args, **kwargs): + if len(args) > 6 or 'opener' in kwargs: + raise ValueError('open: unsupported argument "opener"') + return io.open(file_, *args, **kwargs) +else: + compat_open = open + + legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -3185,6 +3195,7 @@ __all__ = [ 
'compat_kwargs', 'compat_map', 'compat_numeric_types', + 'compat_open', 'compat_ord', 'compat_os_name', 'compat_os_path_expanduser', diff --git a/youtube_dl/options.py b/youtube_dl/options.py index f6d2b0898..7b059b51e 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -11,6 +11,7 @@ from .compat import ( compat_get_terminal_size, compat_getenv, compat_kwargs, + compat_open as open, compat_shlex_split, ) from .utils import ( @@ -41,14 +42,11 @@ def _hide_login_info(opts): def parseOpts(overrideArguments=None): def _readOptions(filename_bytes, default=[]): try: - optionf = open(filename_bytes) + optionf = open(filename_bytes, encoding=preferredencoding()) except IOError: return default # silently skip if file is not present try: - # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 contents = optionf.read() - if sys.version_info < (3,): - contents = contents.decode(preferredencoding()) res = compat_shlex_split(contents, comments=True) finally: optionf.close() From 25124bd640acf2fbae71b2a52738ee41da548fb1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 5 Apr 2023 18:47:49 +0100 Subject: [PATCH 1473/1705] [devscripts] Improve hack to convert command-line options to API options * define equality for DateRange * don't show default DateRange --- devscripts/cli_to_api.py | 25 ++++++++++++++++++++++--- youtube_dl/utils.py | 4 ++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py index 2f4d6a458..9fb1d2ba8 100755 --- a/devscripts/cli_to_api.py +++ b/devscripts/cli_to_api.py @@ -49,15 +49,34 @@ def cli_to_api(*opts): # from https://github.com/yt-dlp/yt-dlp/issues/5859#issuecomment-1363938900 default = parsed_options([]) - diff = dict((k, v) for k, v in parsed_options(opts).items() if default[k] != v) + + def neq_opt(a, b): + if a == b: + return False + if a is None and repr(type(object)).endswith(".utils.DateRange'>"): + return '0001-01-01 - 
9999-12-31' != '{0}'.format(b) + return a != b + + diff = dict((k, v) for k, v in parsed_options(opts).items() if neq_opt(default[k], v)) if 'postprocessors' in diff: diff['postprocessors'] = [pp for pp in diff['postprocessors'] if pp not in default['postprocessors']] return diff def main(): - from pprint import pprint - pprint(cli_to_api(*sys.argv)) + from pprint import PrettyPrinter + + pprint = PrettyPrinter() + super_format = pprint.format + + def format(object, context, maxlevels, level): + if repr(type(object)).endswith(".utils.DateRange'>"): + return '{0}: {1}>'.format(repr(object)[:-2], object), True, False + return super_format(object, context, maxlevels, level) + + pprint.format = format + + pprint.pprint(cli_to_api(*sys.argv)) if __name__ == '__main__': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f3c7af437..d80ceb007 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3190,6 +3190,10 @@ class DateRange(object): def __str__(self): return '%s - %s' % (self.start.isoformat(), self.end.isoformat()) + def __eq__(self, other): + return (isinstance(other, DateRange) + and self.start == other.start and self.end == other.end) + def platform_name(): """ Returns the platform name as a compat_str """ From 9f4d83ff4255d8840c0fa9b367722c129ebecdb2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 5 Apr 2023 18:50:25 +0100 Subject: [PATCH 1474/1705] [options] Add --mtime option, unsets default --no-mtime * resolves #1709 (!) 
--- youtube_dl/options.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 7b059b51e..d802b7e59 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -731,9 +731,13 @@ def parseOpts(overrideArguments=None): '--no-part', action='store_true', dest='nopart', default=False, help='Do not use .part files - write directly into output file') + filesystem.add_option( + '--mtime', + action='store_true', dest='updatetime', default=True, + help='Use the Last-modified header to set the file modification time (default)') filesystem.add_option( '--no-mtime', - action='store_false', dest='updatetime', default=True, + action='store_false', dest='updatetime', help='Do not use the Last-modified header to set the file modification time') filesystem.add_option( '--write-description', From d6ae3b77cd50083ef245c28f904ee0b70a77d5c6 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Apr 2023 14:11:18 +0100 Subject: [PATCH 1475/1705] [core] Avoid deepcopy of ctx dict (fix f35b757) * may now contain `LazyList`s * resolves #31999 --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index bcf781744..2c0d4926c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1399,7 +1399,7 @@ class YoutubeDL(object): filters = [self._build_format_filter(f) for f in selector.filters] def final_selector(ctx): - ctx_copy = copy.deepcopy(ctx) + ctx_copy = dict(ctx) for _filter in filters: ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) return selector_function(ctx_copy) From f8253a528935f78e1a3b724db8c1f0089f99314a Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Apr 2023 19:42:36 +0100 Subject: [PATCH 1476/1705] [core] Avoid deepcopy of ctx dict (fix f35b757) (Pt 2) --- youtube_dl/YoutubeDL.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py 
b/youtube_dl/YoutubeDL.py index 2c0d4926c..927b19417 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1389,11 +1389,10 @@ class YoutubeDL(object): 'abr': formats_info[1].get('abr'), 'ext': output_ext, } - video_selector, audio_selector = map(_build_selector_function, selector.selector) def selector_function(ctx): - for pair in itertools.product( - video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): + selector_fn = lambda x: _build_selector_function(x)(ctx) + for pair in itertools.product(*map(selector_fn, selector.selector)): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] From 213d1d91bfc4a00fefc72fa2730555d51060b42d Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Apr 2023 19:49:46 +0100 Subject: [PATCH 1477/1705] [core] No longer importing copy --- youtube_dl/YoutubeDL.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 927b19417..2a1e59bf8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,7 +5,6 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib -import copy import datetime import errno import fileinput From fe7e13066c20b10fe48bc154431440da36baec53 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 10 Apr 2023 17:12:31 +0100 Subject: [PATCH 1478/1705] [core] Add and use sanitize_info() method from yt-dlp --- youtube_dl/YoutubeDL.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a1e59bf8..2719d546f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -30,9 +30,12 @@ from string import ascii_letters from .compat import ( compat_basestring, compat_cookiejar, + compat_filter as filter, compat_get_terminal_size, compat_http_client, + compat_integer_types, compat_kwargs, + compat_map as map, compat_numeric_types, compat_os_name, 
compat_str, @@ -64,6 +67,7 @@ from .utils import ( int_or_none, ISO3166Utils, locked_file, + LazyList, make_HTTPS_handler, MaxDownloadsReached, orderedSet, @@ -2109,10 +2113,36 @@ class YoutubeDL(object): return self._download_retcode @staticmethod - def filter_requested_info(info_dict): - return dict( - (k, v) for k, v in info_dict.items() - if k not in ['requested_formats', 'requested_subtitles']) + def sanitize_info(info_dict, remove_private_keys=False): + ''' Sanitize the infodict for converting to json ''' + if info_dict is None: + return info_dict + + if remove_private_keys: + reject = lambda k, v: (v is None + or k.startswith('__') + or k in ('requested_formats', + 'requested_subtitles')) + else: + reject = lambda k, v: False + + def filter_fn(obj): + if isinstance(obj, dict): + return dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)) + elif isinstance(obj, (list, tuple, set, LazyList)): + return list(map(filter_fn, obj)) + elif obj is None or any(isinstance(obj, c) + for c in (compat_integer_types, + (compat_str, float, bool))): + return obj + else: + return repr(obj) + + return filter_fn(info_dict) + + @classmethod + def filter_requested_info(cls, info_dict): + return cls.sanitize_info(info_dict, True) def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" From 735e87adfc44b284dcdb4d9a0155ce0616e3af97 Mon Sep 17 00:00:00 2001 From: Gabriel Nagy Date: Thu, 13 Apr 2023 01:40:38 +0300 Subject: [PATCH 1479/1705] [core] Sanitize info dict before dumping JSON (fixes fe7e130) (#32032) * follow up to fe7e130 which didn't fix everything. 
Co-authored-by: dirkf --- youtube_dl/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2719d546f..117f1c513 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1777,7 +1777,7 @@ class YoutubeDL(object): self.to_stdout(formatSeconds(info_dict['duration'])) print_mandatory('format') if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) + self.to_stdout(json.dumps(self.sanitize_info(info_dict))) def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -2091,7 +2091,7 @@ class YoutubeDL(object): raise else: if self.params.get('dump_single_json', False): - self.to_stdout(json.dumps(res)) + self.to_stdout(json.dumps(self.sanitize_info(res))) return self._download_retcode @@ -2100,6 +2100,7 @@ class YoutubeDL(object): [info_filename], mode='r', openhook=fileinput.hook_encoded('utf-8'))) as f: # FileInput doesn't have a read method, we can't call json.load + # TODO: let's use io.open(), then info = self.filter_requested_info(json.loads('\n'.join(f))) try: self.process_ie_result(info, download=True) From 2da3fa04a68ff0652f49d6874d82b7a0edb85ea3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 11 Apr 2023 17:36:27 +0100 Subject: [PATCH 1480/1705] [YouTube] Simplify signature patterns --- youtube_dl/extractor/youtube.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ae3416b20..80fff7ada 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -19,6 +19,7 @@ from ..compat import ( compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, + compat_zip as zip, ) from ..jsinterp import JSInterpreter from ..utils import ( @@ -1555,17 +1556,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'("|\')signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') From 26035bde46c0acc30dc053618451d9aeca4b7709 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 13 Apr 2023 00:15:07 +0100 Subject: [PATCH 1481/1705] [DashSegmentsFD] Correctly detect errors when `fragment_retries` == 0 * use the success flag instead of the retry count * establish the fragment_url outside the retry loop * only report skipping a fragment once. 
* resolves #32033 --- youtube_dl/downloader/dash.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 67a8e173f..2800d4260 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,12 +38,13 @@ class DashSegmentsFD(FragmentFD): # In DASH, the first segment contains necessary headers to # generate a valid MP4 file, so always abort for the first segment fatal = frag_index == 1 or not skip_unavailable_fragments + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success = False for count in itertools.count(): try: - fragment_url = fragment.get('url') - if not fragment_url: - assert fragment_base_url - fragment_url = urljoin(fragment_base_url, fragment['path']) success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) if not success: return False @@ -63,14 +64,13 @@ class DashSegmentsFD(FragmentFD): # itself since it has its own retry settings if fatal: raise - self.report_skip_fragment(frag_index) break - if count >= fragment_retries: + if not success: if not fatal: self.report_skip_fragment(frag_index) continue - self.report_error('giving up after %s fragment retries' % fragment_retries) + self.report_error('giving up after %s fragment retries' % count) return False self._finish_frag_download(ctx) From 211cbfd5d46025a8e4d8f9f3d424aaada4698974 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 21 Apr 2023 14:04:30 +0100 Subject: [PATCH 1482/1705] [jsinterp] Minimally handle arithmetic operator precedence Resolves #32066 --- test/test_jsinterp.py | 11 +++++++++++ youtube_dl/jsinterp.py | 40 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 5d129433d..e121358d7 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ 
-505,6 +505,17 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_32066(self): + jsi = JSInterpreter("function x(){return Math.pow(3, 5) + new Date('1970-01-01T08:01:42.000+08:00') / 1000 * -239 - -24205;}") + self.assertEqual(jsi.call_function('x'), 70) + + def test_unary_operators(self): + jsi = JSInterpreter('function f(){return 2 - - - 2;}') + self.assertEqual(jsi.call_function('f'), 0) + # fails + # jsi = JSInterpreter('function f(){return 2 + - + - - 2;}') + # self.assertEqual(jsi.call_function('f'), 0) + """ # fails so far def test_packed(self): jsi = JSInterpreter('''function x(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index ab7d6f926..a06fc4ff5 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +from functools import update_wrapper import itertools import json import math @@ -23,11 +24,23 @@ from .compat import ( ) +def wraps_op(op): + + def update_and_rename_wrapper(w): + f = update_wrapper(w, op) + # fn names are str in both Py 2/3 + f.__name__ = str('JS_') + f.__name__ + return f + + return update_and_rename_wrapper + + def _js_bit_op(op): def zeroise(x): return 0 if x in (None, JS_Undefined) else x + @wraps_op(op) def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff @@ -36,6 +49,7 @@ def _js_bit_op(op): def _js_arith_op(op): + @wraps_op(op) def wrapped(a, b): if JS_Undefined in (a, b): return float('nan') @@ -66,6 +80,7 @@ def _js_exp(a, b): def _js_eq_op(op): + @wraps_op(op) def wrapped(a, b): if set((a, b)) <= set((None, JS_Undefined)): return op(a, a) @@ -76,6 +91,7 @@ def _js_eq_op(op): def _js_comp_op(op): + @wraps_op(op) def wrapped(a, b): if JS_Undefined in (a, b): return False @@ -356,6 +372,7 @@ class JSInterpreter(object): 
return right_val try: + # print('Eval:', opfunc.__name__, left_val, right_val) return opfunc(left_val, right_val) except Exception as e: raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) @@ -395,6 +412,7 @@ class JSInterpreter(object): raise self.Exception('Recursion limit reached') allow_recursion -= 1 + # print('At: ' + stmt[:60]) should_return = False # fails on (eg) if (...) stmt1; else stmt2; sub_statements = list(self._separate(stmt, ';')) or [''] @@ -702,9 +720,24 @@ class JSInterpreter(object): continue right_expr = separated.pop() - while op == '-' and len(separated) > 1 and not separated[-1].strip(): - right_expr = '-' + right_expr - separated.pop() + # handle operators that are both unary and binary, minimal BODMAS + if op in ('+', '-'): + undone = 0 + while len(separated) > 1 and not separated[-1].strip(): + undone += 1 + separated.pop() + if op == '-' and undone % 2 != 0: + right_expr = op + right_expr + left_val = separated[-1] + for dm_op in ('*', '%', '/', '**'): + bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) + if len(bodmas) > 1 and not bodmas[-1].strip(): + expr = op.join(separated) + op + right_expr + right_expr = None + break + if right_expr is None: + continue + left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return @@ -955,6 +988,7 @@ class JSInterpreter(object): def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] argnames = tuple(argnames) + # import pdb; pdb.set_trace() def resf(args, kwargs={}, allow_recursion=100): global_stack[0].update( From 64d6dd64c8b7a35a87655d27fc83f2e98ef6ce13 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 23 Apr 2023 22:58:35 +0100 Subject: [PATCH 1483/1705] [YouTube] Support Releases tab --- youtube_dl/extractor/youtube.py | 114 
+++++++++++++++++++------------- youtube_dl/utils.py | 9 ++- 2 files changed, 74 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 80fff7ada..0411c49f1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,6 +31,7 @@ from ..utils import ( extract_attributes, get_element_by_attribute, int_or_none, + join_nonempty, js_to_json, LazyList, merge_dicts, @@ -45,6 +46,7 @@ from ..utils import ( str_to_int, traverse_obj, try_get, + txt_or_none, unescapeHTML, unified_strdate, unsmuggle_url, @@ -2608,6 +2610,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'uploader_id': '@lexwill718', }, 'playlist_mincount': 75, + }, { + # Releases tab + 'url': 'https://www.youtube.com/@daftpunk/releases', + 'info_dict': { + 'id': 'UC_kRDKYrUlrbtrSiyu5Tflg', + 'title': 'Daft Punk - Releases', + 'description': 'Daft Punk (1993 - 2021) - Official YouTube Channel', + 'uploader_id': '@daftpunk', + 'uploader': 'Daft Punk', + }, + 'playlist_mincount': 36, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, @@ -2822,6 +2835,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continue return renderer + @staticmethod + def _get_text(r, k): + return traverse_obj( + r, (k, 'runs', 0, 'text'), (k, 'simpleText'), + expected_type=txt_or_none) + def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): @@ -2829,9 +2848,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): renderer = self._extract_grid_item_renderer(item) if not isinstance(renderer, dict): continue - title = try_get( - renderer, (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) + title = self._get_text(renderer, 'title') # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2848,8 +2865,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # channel channel_id = renderer.get('channelId') if 
channel_id: - title = try_get( - renderer, lambda x: x['title']['simpleText'], compat_str) + title = self._get_text(renderer, 'title') yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) @@ -2958,15 +2974,26 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _rich_grid_entries(self, contents): for content in contents: - video_renderer = try_get( - content, - (lambda x: x['richItemRenderer']['content']['videoRenderer'], - lambda x: x['richItemRenderer']['content']['reelItemRenderer']), - dict) + content = traverse_obj( + content, ('richItemRenderer', 'content'), + expected_type=dict) or {} + video_renderer = traverse_obj( + content, 'videoRenderer', 'reelItemRenderer', + expected_type=dict) if video_renderer: entry = self._video_entry(video_renderer) if entry: yield entry + # playlist + renderer = traverse_obj( + content, 'playlistRenderer', expected_type=dict) or {} + title = self._get_text(renderer, 'title') + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) @staticmethod def _build_continuation_query(continuation, ctp=None): @@ -3071,6 +3098,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): yield entry + continuation = self._extract_continuation(rich_grid_renderer) ytcfg = self._extract_ytcfg(item_id, webpage) @@ -3213,50 +3241,41 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): uploader['channel'] = uploader['uploader'] return uploader - @staticmethod - def _extract_alert(data): + @classmethod + def _extract_alert(cls, data): alerts = [] - for alert in try_get(data, lambda x: x['alerts'], list) or []: - if not isinstance(alert, dict): - continue - alert_text = try_get( - alert, lambda x: x['alertRenderer']['text'], dict) + for alert in 
traverse_obj(data, ('alerts', Ellipsis), expected_type=dict): + alert_text = traverse_obj( + alert, (None, lambda x: x['alertRenderer']['text']), get_all=False) if not alert_text: continue - text = try_get( - alert_text, - (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']), - compat_str) + text = cls._get_text(alert_text, 'text') if text: alerts.append(text) return '\n'.join(alerts) def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) - renderer = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), + expected_type=dict) or {} playlist_id = item_id title = description = None if renderer: - channel_title = renderer.get('title') or item_id - tab_title = selected_tab.get('title') - title = channel_title or item_id - if tab_title: - title += ' - %s' % tab_title - if selected_tab.get('expandedText'): - title += ' - %s' % selected_tab['expandedText'] - description = renderer.get('description') - playlist_id = renderer.get('externalId') + channel_title = txt_or_none(renderer.get('title')) or item_id + tab_title = txt_or_none(selected_tab.get('title')) + title = join_nonempty( + channel_title or item_id, tab_title, + txt_or_none(selected_tab.get('expandedText')), + delim=' - ') + description = txt_or_none(renderer.get('description')) + playlist_id = txt_or_none(renderer.get('externalId')) or playlist_id else: - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - else: - renderer = try_get( - data, lambda x: x['header']['hashtagHeaderRenderer'], dict) - if renderer: - title = try_get(renderer, lambda x: x['hashtag']['simpleText']) + renderer = traverse_obj(data, + ('metadata', 'playlistMetadataRenderer'), + ('header', 'hashtagHeaderRenderer'), + expected_type=dict) or {} + title = traverse_obj(renderer, 'title', ('hashtag', 
'simpleText'), + expected_type=txt_or_none) playlist = self.playlist_result( self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, @@ -3264,15 +3283,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return merge_dicts(playlist, self._extract_uploader(renderer, data)) def _extract_from_playlist(self, item_id, url, data, playlist): - title = playlist.get('title') or try_get( - data, lambda x: x['titleText']['simpleText'], compat_str) - playlist_id = playlist.get('playlistId') or item_id + title = traverse_obj((playlist, data), + (0, 'title'), (1, 'titleText', 'simpleText'), + expected_type=txt_or_none) + playlist_id = txt_or_none(playlist.get('playlistId')) or item_id # Inline playlist rendition continuation does not always work # at Youtube side, so delegating regular tab-based playlist URL # processing whenever possible. - playlist_url = urljoin(url, try_get( - playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + playlist_url = urljoin(url, traverse_obj( + playlist, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=url_or_none)) if playlist_url and playlist_url != url: return self.url_result( playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d80ceb007..65ddb3b0f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3753,6 +3753,11 @@ def strip_or_none(v, default=None): return v.strip() if isinstance(v, compat_str) else default +def txt_or_none(v, default=None): + """ Combine str/strip_or_none, disallow blank value (for traverse_obj) """ + return default if v is None else (compat_str(v).strip() or default) + + def url_or_none(url): if not url or not isinstance(url, compat_str): return None @@ -4096,8 +4101,8 @@ def escape_url(url): ).geturl() -def parse_qs(url): - return compat_parse_qs(compat_urllib_parse.urlparse(url).query) +def parse_qs(url, **kwargs): + return 
compat_parse_qs(compat_urllib_parse.urlparse(url).query, **kwargs) def read_batch_urls(batch_fd): From 11cc3f3ad03a88d6cb1eab18a8e5dd6bf148ac54 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 20:53:07 +0100 Subject: [PATCH 1484/1705] [utils] Fix `compiled_regex_type` in 249f2b6 --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65ddb3b0f..584581b6a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -56,6 +56,7 @@ from .compat import ( compat_kwargs, compat_os_name, compat_re_Match, + compat_re_Pattern, compat_shlex_quote, compat_str, compat_struct_pack, @@ -86,7 +87,7 @@ def register_socks_protocols(): # Unfavoured alias -compiled_regex_type = compat_re_Match +compiled_regex_type = compat_re_Pattern def random_user_agent(): From a85a875fef2e9b097c3f6f93f1d0cead06f84e43 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 20:59:30 +0100 Subject: [PATCH 1485/1705] [jsinterp] Handle NaN in bitwise operators * also add _NaN * also pull function naming from yt-dlp --- test/test_jsinterp.py | 11 +++++++++++ youtube_dl/jsinterp.py | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index e121358d7..a8f312fde 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -18,6 +18,7 @@ class TestJSInterpreter(unittest.TestCase): def test_basic(self): jsi = JSInterpreter('function x(){;}') self.assertEqual(jsi.call_function('x'), None) + self.assertEqual(repr(jsi.extract_function('x')), 'F') jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) @@ -505,6 +506,16 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_bitwise_operators_madness(self): + jsi = JSInterpreter('function x(){return null << 
5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return undefined >> 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return 42 << NaN}') + self.assertEqual(jsi.call_function('x'), 42) + def test_32066(self): jsi = JSInterpreter("function x(){return Math.pow(3, 5) + new Date('1970-01-01T08:01:42.000+08:00') / 1000 * -239 - -24205;}") self.assertEqual(jsi.call_function('x'), 70) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a06fc4ff5..bb406647a 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals -from functools import update_wrapper import itertools import json import math import operator import re +from functools import update_wrapper + from .utils import ( error_to_compat_str, ExtractorError, @@ -24,6 +25,22 @@ from .compat import ( ) +# name JS functions +class function_with_repr(object): + # from yt_dlp/utils.py, but in this module + # repr_ is always set + def __init__(self, func, repr_): + update_wrapper(self, func) + self.func, self.__repr = func, repr_ + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def __repr__(self): + return self.__repr + + +# name JS operators def wraps_op(op): def update_and_rename_wrapper(w): @@ -35,10 +52,13 @@ def wraps_op(op): return update_and_rename_wrapper +_NaN = float('nan') + + def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + return 0 if x in (None, JS_Undefined, _NaN) else x @wraps_op(op) def wrapped(a, b): @@ -52,7 +72,7 @@ def _js_arith_op(op): @wraps_op(op) def wrapped(a, b): if JS_Undefined in (a, b): - return float('nan') + return _NaN return op(a or 0, b or 0) return wrapped @@ -60,13 +80,13 @@ def _js_arith_op(op): def _js_div(a, b): if JS_Undefined in (a, b) or not (a and b): - return float('nan') + return _NaN return operator.truediv(a or 0, b) if b else float('inf') def 
_js_mod(a, b): if JS_Undefined in (a, b) or not b: - return float('nan') + return _NaN return (a or 0) % b @@ -74,7 +94,7 @@ def _js_exp(a, b): if not b: return 1 # even 0 ** 0 !! elif JS_Undefined in (a, b): - return float('nan') + return _NaN return (a or 0) ** b @@ -285,6 +305,8 @@ class JSInterpreter(object): def _named_object(self, namespace, obj): self.__named_object_counter += 1 name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter) + if callable(obj) and not isinstance(obj, function_with_repr): + obj = function_with_repr(obj, 'F<%s>' % (self.__named_object_counter, )) namespace[name] = obj return name @@ -693,7 +715,7 @@ class JSInterpreter(object): elif expr == 'undefined': return JS_Undefined, should_return elif expr == 'NaN': - return float('NaN'), should_return + return _NaN, should_return elif md.get('return'): return local_vars[m.group('name')], should_return @@ -953,7 +975,9 @@ class JSInterpreter(object): return self.build_arglist(func_m.group('args')), code def extract_function(self, funcname): - return self.extract_function_from_code(*self.extract_function_code(funcname)) + return function_with_repr( + self.extract_function_from_code(*self.extract_function_code(funcname)), + 'F<%s>' % (funcname, )) def extract_function_from_code(self, argnames, code, *global_stack): local_vars = {} @@ -988,7 +1012,6 @@ class JSInterpreter(object): def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] argnames = tuple(argnames) - # import pdb; pdb.set_trace() def resf(args, kwargs={}, allow_recursion=100): global_stack[0].update( From 6ed34338285f722d0da312ce0af3a15a077a3e2a Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 21:02:01 +0100 Subject: [PATCH 1486/1705] [jsinterp] Add short-cut evaluation for common expression * special handling for (d%e.length+e.length)%e.length speeds up ~6% --- youtube_dl/jsinterp.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index bb406647a..f837865c4 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -502,8 +502,15 @@ class JSInterpreter(object): expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - inner, outer = self._separate_at_paren(expr) - inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + + m = re.match(r'\((?P[a-z])%(?P[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr) + if m: + # short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig` + outer = None + inner, should_abort = self._offset_e_by_d(m.group('d'), m.group('e'), local_vars) + else: + inner, outer = self._separate_at_paren(expr) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return else: @@ -957,6 +964,17 @@ class JSInterpreter(object): return obj + @staticmethod + def _offset_e_by_d(d, e, local_vars): + """ Short-cut eval: (d%e.length+e.length)%e.length """ + try: + d = local_vars[d] + e = local_vars[e] + e = len(e) + return _js_mod(_js_mod(d, e) + e, e), False + except Exception: + return None, True + def extract_function_code(self, funcname): """ @returns argnames, code """ func_m = re.search( From d1c6c5c4d618fa950813c0c71aede34a5ac851e9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 21:17:31 +0100 Subject: [PATCH 1487/1705] [core] Improve platform debug log, based on yt-dlp --- youtube_dl/YoutubeDL.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 117f1c513..212c04298 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -25,6 +25,7 @@ import tokenize import traceback import random +from ssl import OPENSSL_VERSION from string import ascii_letters from .compat import ( @@ -66,6 +67,7 @@ from .utils import ( HEADRequest, 
int_or_none, ISO3166Utils, + join_nonempty, locked_file, LazyList, make_HTTPS_handler, @@ -2395,9 +2397,20 @@ class YoutubeDL(object): return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] return impl_name - self._write_string('[debug] Python version %s (%s) - %s\n' % ( - platform.python_version(), python_implementation(), - platform_name())) + def libc_ver(): + try: + return platform.libc_ver() + except OSError: # We may not have access to the executable + return [] + + self._write_string('[debug] Python %s (%s %s) - %s (%s%s)\n' % ( + platform.python_version(), + python_implementation(), + platform.architecture()[0], + platform_name(), + OPENSSL_VERSION, + ', %s' % (join_nonempty(*libc_ver(), delim=' ') or '-'), + )) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() From d89c2137ba4c1def185358a9ff48642e05ac65a2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 19 May 2023 13:09:18 +0100 Subject: [PATCH 1488/1705] [jsinterp] Small updates for a85a875 * update signature tests * clarify NaN handling --- test/test_jsinterp.py | 3 +++ test/test_youtube_signature.py | 8 ++++++++ youtube_dl/jsinterp.py | 12 +++++------- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index a8f312fde..1cc148b15 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -516,6 +516,9 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 42 << NaN}') self.assertEqual(jsi.call_function('x'), 42) + jsi = JSInterpreter('function x(){return 42 << Infinity}') + self.assertEqual(jsi.call_function('x'), 42) + def test_32066(self): jsi = JSInterpreter("function x(){return Math.pow(3, 5) + new Date('1970-01-01T08:01:42.000+08:00') / 1000 * -239 - -24205;}") self.assertEqual(jsi.call_function('x'), 70) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index decf7ee38..d41d708a0 100644 --- 
a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -143,6 +143,14 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', ), + ( + 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', + 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', + ), + ( + 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', + 'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index f837865c4..dc580943e 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import itertools import json -import math import operator import re @@ -52,6 +51,10 @@ def wraps_op(op): return update_and_rename_wrapper +# NB In principle NaN cannot be checked by membership. +# Here all NaN values are actually this one, so _NaN is _NaN, +# although _NaN != _NaN. + _NaN = float('nan') @@ -126,13 +129,8 @@ def _js_comp_op(op): def _js_ternary(cndn, if_true=True, if_false=False): """Simulate JS's ternary operator (cndn?if_true:if_false)""" - if cndn in (False, None, 0, '', JS_Undefined): + if cndn in (False, None, 0, '', JS_Undefined, _NaN): return if_false - try: - if math.isnan(cndn): # NB: NaN cannot be checked by membership - return if_false - except TypeError: - pass return if_true From 1f7c6f8b2ba5bedc9b4da279659688fbbf06a059 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 19 May 2023 13:12:59 +0100 Subject: [PATCH 1489/1705] [core] Further improve platform debug log * see d1c6c5c --- youtube_dl/YoutubeDL.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 212c04298..1b3ef94b4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -102,6 +102,7 @@ from .utils import ( YoutubeDLCookieProcessor, YoutubeDLHandler, YoutubeDLRedirectHandler, + 
ytdl_is_updateable, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2373,9 +2374,11 @@ class YoutubeDL(object): self.get_encoding())) write_string(encoding_str, encoding=None) - self._write_string('[debug] youtube-dl version ' + __version__ + '\n') + writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) + + writeln_debug('youtube-dl version ', __version__, (' (single file build)' if ytdl_is_updateable() else '')) if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled' + '\n') + writeln_debug('Lazy loading extractors enabled') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -2384,7 +2387,7 @@ class YoutubeDL(object): out, err = process_communicate_or_kill(sp) out = out.decode().strip() if re.match('[0-9a-f]+', out): - self._write_string('[debug] Git HEAD: ' + out + '\n') + writeln_debug('Git HEAD: ', out) except Exception: try: sys.exc_clear() @@ -2403,13 +2406,15 @@ class YoutubeDL(object): except OSError: # We may not have access to the executable return [] - self._write_string('[debug] Python %s (%s %s) - %s (%s%s)\n' % ( + libc = join_nonempty(*libc_ver(), delim=' ') + writeln_debug('Python %s (%s %s %s) - %s - %s%s' % ( platform.python_version(), python_implementation(), + platform.machine(), platform.architecture()[0], platform_name(), OPENSSL_VERSION, - ', %s' % (join_nonempty(*libc_ver(), delim=' ') or '-'), + (' - %s' % (libc, )) if libc else '' )) exe_versions = FFmpegPostProcessor.get_versions(self) @@ -2422,17 +2427,17 @@ class YoutubeDL(object): ) if not exe_str: exe_str = 'none' - self._write_string('[debug] exe versions: %s\n' % exe_str) + writeln_debug('exe versions: %s' % (exe_str, )) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + writeln_debug('Proxy map: ', 
compat_str(proxy_map)) if self.params.get('call_home', False): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - self._write_string('[debug] Public IP address: %s\n' % ipaddr) + writeln_debug('Public IP address: %s' % (ipaddr, )) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > version_tuple(__version__): From ee731f3d00064f446faa9ffb4c21ce4ca388bf5d Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 23 May 2023 16:19:55 +0100 Subject: [PATCH 1490/1705] [ITV] Fix UA capitalisation in 384f632 --- youtube_dl/extractor/itv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 7026139ea..c64af3be6 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -59,7 +59,7 @@ class ITVBaseIE(InfoExtractor): @staticmethod def _vanilla_ua_header(): - return {'User-agent': 'Mozilla/5.0'} + return {'User-Agent': 'Mozilla/5.0'} def _download_webpage_handle(self, url, video_id, *args, **kwargs): # specialised to (a) use vanilla UA (b) detect geo-block @@ -69,7 +69,7 @@ class ITVBaseIE(InfoExtractor): 'user_agent' not in params and not any(re.match(r'(?i)user-agent\s*:', h) for h in (params.get('headers') or [])) - and 'User-agent' not in (kwargs.get('headers') or {})): + and 'User-Agent' not in (kwargs.get('headers') or {})): kwargs.setdefault('headers', {}) kwargs['headers'] = self._vanilla_ua_header() From 2389c7cbd30813435c50848a9b276bcfe2a810db Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 23 May 2023 17:11:22 +0100 Subject: [PATCH 1491/1705] [compat] Fix casefold import __all__ syntax in a19855f --- youtube_dl/casefold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py index 748c2d491..ad9c66f8e 100644 --- a/youtube_dl/casefold.py +++ b/youtube_dl/casefold.py @@ -1663,5 +1663,5 @@ def casefold(s): __all__ = [ - 
casefold + 'casefold', ] From b8a86dcf1aa837577178ae25357d8241ab4ba6c1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 26 May 2023 20:25:25 +0100 Subject: [PATCH 1492/1705] [core] Revise 1f7c6f8 to help downstream merger (possibly) --- youtube_dl/YoutubeDL.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1b3ef94b4..98b878fc1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2374,11 +2374,10 @@ class YoutubeDL(object): self.get_encoding())) write_string(encoding_str, encoding=None) - writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) - - writeln_debug('youtube-dl version ', __version__, (' (single file build)' if ytdl_is_updateable() else '')) + self._write_string('[debug] youtube-dl version ' + __version__ + (' (single file build)\n' if ytdl_is_updateable() else '\n')) if _LAZY_LOADER: - writeln_debug('Lazy loading extractors enabled') + self._write_string('[debug] Lazy loading extractors enabled\n') + writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) # moved down for easier merge try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], From a2534f7b888416e872d5afd1862eb3e30fc69fc7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 11 Jun 2023 13:33:50 +0100 Subject: [PATCH 1493/1705] [jsinterp] Fix div bug breaking player 8c7583ff Thx bashonly: https://github.com/ytdl-org/youtube-dl/issues/32292#issuecomment-1585639223 Fixes #32292 --- test/test_jsinterp.py | 49 ++++++++++++++++++++++++++++++++++ test/test_youtube_signature.py | 4 +++ youtube_dl/jsinterp.py | 2 +- 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 1cc148b15..ecd6ab3c9 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -33,6 +33,55 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x4(a){return 2*a+1;}') 
self.assertEqual(jsi.call_function('x4', 3), 7) + def test_add(self): + jsi = JSInterpreter('function f(){return 42 + 7;}') + self.assertEqual(jsi.call_function('f'), 49) + jsi = JSInterpreter('function f(){return 42 + undefined;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + jsi = JSInterpreter('function f(){return 42 + null;}') + self.assertEqual(jsi.call_function('f'), 42) + + def test_sub(self): + jsi = JSInterpreter('function f(){return 42 - 7;}') + self.assertEqual(jsi.call_function('f'), 35) + jsi = JSInterpreter('function f(){return 42 - undefined;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + jsi = JSInterpreter('function f(){return 42 - null;}') + self.assertEqual(jsi.call_function('f'), 42) + + def test_mul(self): + jsi = JSInterpreter('function f(){return 42 * 7;}') + self.assertEqual(jsi.call_function('f'), 294) + jsi = JSInterpreter('function f(){return 42 * undefined;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + jsi = JSInterpreter('function f(){return 42 * null;}') + self.assertEqual(jsi.call_function('f'), 0) + + def test_div(self): + jsi = JSInterpreter('function f(a, b){return a / b;}') + self.assertTrue(math.isnan(jsi.call_function('f', 0, 0))) + self.assertTrue(math.isnan(jsi.call_function('f', JS_Undefined, 1))) + self.assertTrue(math.isinf(jsi.call_function('f', 2, 0))) + self.assertEqual(jsi.call_function('f', 0, 3), 0) + + def test_mod(self): + jsi = JSInterpreter('function f(){return 42 % 7;}') + self.assertEqual(jsi.call_function('f'), 0) + jsi = JSInterpreter('function f(){return 42 % 0;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + jsi = JSInterpreter('function f(){return 42 % undefined;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + + def test_exp(self): + jsi = JSInterpreter('function f(){return 42 ** 2;}') + self.assertEqual(jsi.call_function('f'), 1764) + jsi = JSInterpreter('function f(){return 42 ** undefined;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + 
jsi = JSInterpreter('function f(){return 42 ** null;}') + self.assertEqual(jsi.call_function('f'), 1) + jsi = JSInterpreter('function f(){return undefined ** 42;}') + self.assertTrue(math.isnan(jsi.call_function('f'))) + def test_empty_return(self): jsi = JSInterpreter('function f(){return; y()}') self.assertEqual(jsi.call_function('f'), None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index d41d708a0..e7bce9d68 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -151,6 +151,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', ), + ( + 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', + 'E2AQVN6y_zM7uN9w8z', '9A2dbY5GDZrt9A', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index dc580943e..9d4a5bc57 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -82,7 +82,7 @@ def _js_arith_op(op): def _js_div(a, b): - if JS_Undefined in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a or b): return _NaN return operator.truediv(a or 0, b) if b else float('inf') From ff75c300f52321dc7322e28d1df153cf0ea65a6d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 17 Jun 2023 15:34:11 +0100 Subject: [PATCH 1494/1705] [jsinterp] Fix test for failed match in extract_object() --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9d4a5bc57..c18c4fef1 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -985,9 +985,9 @@ class JSInterpreter(object): \((?P[^)]*)\)\s* (?P{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code')) # refine the match if func_m is None: raise self.Exception('Could not find JS function "{funcname}"'.format(**locals())) + code, _ = self._separate_at_paren(func_m.group('code')) # refine 
the match return self.build_arglist(func_m.group('args')), code def extract_function(self, funcname): From d6433cbb2c4440056a38846e35bb5a3efa9bcac2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 17 Jun 2023 15:43:10 +0100 Subject: [PATCH 1495/1705] [jsinterp] Don't find unrelated objects --- youtube_dl/jsinterp.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c18c4fef1..00f219440 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -941,15 +941,15 @@ class JSInterpreter(object): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( - r'''(?x) - (?(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) - }\s*; - ''' % (re.escape(objname), _FUNC_NAME_RE), + r'''(?xs) + (?:{0}\s*\.\s*{1}|{1}\s*=\s*\{{\s* + (?P({2}\s*:\s*function\s*\(.*?\)\s*\{{.*?}}(?:,\s*)?)*) + }}\s*); + '''.format(_NAME_RE, re.escape(objname), _FUNC_NAME_RE), self.code) - if not obj_m: + fields = obj_m and obj_m.group('fields') + if fields is None: raise self.Exception('Could not find object ' + objname) - fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( r'''(?x) From ae8ba2c31977b68b75221f80c488c0b12385269c Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 17 Jun 2023 15:36:39 +0100 Subject: [PATCH 1496/1705] [YouTube] Fix `KeyError QV` in signature extraction failed * temporarily force missing global definition into sig JS * improve test: thanks https://github.com/yt-dlp/yt-dlp/issues/7327#issuecomment-1595274615 * resolves #32314 --- test/test_youtube_signature.py | 7 ++++++- youtube_dl/extractor/youtube.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index e7bce9d68..4ba586e53 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -63,6 +63,11 @@ _SIG_TESTS = [ 
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', + ), + ( + 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ) ] @@ -231,7 +236,7 @@ def n_sig(jscode, sig_input): make_sig_test = t_factory( - 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) + 'signature', signature, re.compile(r'(?s).*(?:-|/player/)(?P[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0411c49f1..0bbce71a3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1569,8 +1569,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') - jsi = JSInterpreter(jscode) + # temporary (please) hack for player 6ed0d907 #32314 + ah = 'var AH={LR:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c},QV:function(a){a.reverse()},pO:function(a,b){a.splice(0,b)}};' + jsi = JSInterpreter(ah + jscode) + initial_function = jsi.extract_function(funcname) + return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url): From 07af47960f3bb262ead02490ce65c8c45c01741e Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 18 Jun 2023 00:52:18 +0100 Subject: [PATCH 1497/1705] [YouTube] Improve fix for ae8ba2c Thx: https://github.com/yt-dlp/yt-dlp/commit/01aba25 --- 
youtube_dl/extractor/youtube.py | 4 +--- youtube_dl/jsinterp.py | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0bbce71a3..1855fca7f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1569,9 +1569,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') - # temporary (please) hack for player 6ed0d907 #32314 - ah = 'var AH={LR:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c},QV:function(a){a.reverse()},pO:function(a,b){a.splice(0,b)}};' - jsi = JSInterpreter(ah + jscode) + jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 00f219440..1ba9c3d67 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -940,15 +940,18 @@ class JSInterpreter(object): def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} - obj_m = re.search( - r'''(?xs) - (?:{0}\s*\.\s*{1}|{1}\s*=\s*\{{\s* - (?P({2}\s*:\s*function\s*\(.*?\)\s*\{{.*?}}(?:,\s*)?)*) - }}\s*); - '''.format(_NAME_RE, re.escape(objname), _FUNC_NAME_RE), - self.code) - fields = obj_m and obj_m.group('fields') - if fields is None: + fields = None + for obj_m in re.finditer( + r'''(?xs) + {0}\s*\.\s*{1}|{1}\s*=\s*\{{\s* + (?P({2}\s*:\s*function\s*\(.*?\)\s*\{{.*?}}(?:,\s*)?)*) + }}\s*; + '''.format(_NAME_RE, re.escape(objname), _FUNC_NAME_RE), + self.code): + fields = obj_m.group('fields') + if fields: + break + else: raise self.Exception('Could not find object ' + objname) # Currently, it only supports function definitions fields_m = re.finditer( From 9112e668a5ea6376017718db9ff13b369d53ad7a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 22 Jun 2023 13:23:31 
+0530 Subject: [PATCH 1498/1705] [YouTube] Improve nsig function name extraction Fixes player b7910ca8, using `,` vs `;` See https://github.com/ytdl-org/youtube-dl/issues/32292#issuecomment-1602231170 Co-authored-by: dirkf --- test/test_youtube_signature.py | 11 +++-------- youtube_dl/extractor/youtube.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4ba586e53..5dcabaf95 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -63,11 +63,6 @@ _SIG_TESTS = [ 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', - ), - ( - 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ) ] @@ -157,8 +152,8 @@ _NSIG_TESTS = [ 'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', ), ( - 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', - 'E2AQVN6y_zM7uN9w8z', '9A2dbY5GDZrt9A', + 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', + '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', ), ] @@ -236,7 +231,7 @@ def n_sig(jscode, sig_input): make_sig_test = t_factory( - 'signature', signature, re.compile(r'(?s).*(?:-|/player/)(?P[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) + 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1855fca7f..24e2efbd9 100644 --- a/youtube_dl/extractor/youtube.py 
+++ b/youtube_dl/extractor/youtube.py @@ -1623,15 +1623,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') if not idx: return nfunc + + VAR_RE_TMPL = r'var\s+%s\s*=\s*(?P\[(?P%s)\])[;,]' + note = 'Initial JS player n function {0} (%s[%s])' % (nfunc, idx) + + def search_function_code(needle, group): + return self._search_regex( + VAR_RE_TMPL % (re.escape(nfunc), needle), jscode, + note.format(group), group=group) + if int_or_none(idx) == 0: - real_nfunc = self._search_regex( - r'var %s\s*=\s*\[([a-zA-Z_$][\w$]*)\];' % (re.escape(nfunc), ), jscode, - 'Initial JS player n function alias ({nfunc}[{idx}])'.format(**locals())) + real_nfunc = search_function_code(r'[a-zA-Z_$][\w$]*', group='alias') if real_nfunc: return real_nfunc - return self._parse_json(self._search_regex( - r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, - 'Initial JS player n function name ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] + return self._parse_json( + search_function_code('.+?', group='name'), + nfunc, transform_source=js_to_json)[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) From ebdc82c58684b4e202fabc046f9a40fc73cccde5 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 22 Jun 2023 17:24:48 +0100 Subject: [PATCH 1499/1705] [workflows/ci.yml] Replace actions/setup-python for legacy Pythons Thanks MatteoH2O1999: https://github.com/MatteoH2O1999/setup-python --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51abdce1d..9be4eaa89 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,10 +38,12 @@ jobs: steps: - uses: actions/checkout@v3 - name: Set up supported Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - if: ${{ matrix.python-impl == 'cpython' && ! 
contains(fromJSON('["3.3", "3.4"]'), matrix.python-version) }} + # wrap broken actions/setup-python@v4 + uses: ytdl-org/setup-python@v1 with: python-version: ${{ matrix.python-version }} + cache-build: true + allow-build: info - name: Set up Java 8 if: ${{ matrix.python-impl == 'jython' }} uses: actions/setup-java@v2 From fa7f0effbe4e14fcf70e1dc4496371c9862b64b9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 22 Jun 2023 23:10:04 +0100 Subject: [PATCH 1500/1705] [YouTube] Avoid crash in author extraction --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 24e2efbd9..9c419c002 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -448,7 +448,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): extract_attributes(self._search_regex( r'''(?s)(]+\bitemprop\s*=\s*("|')%s\2[^>]*>)''' % re.escape(var_name), - get_element_by_attribute('itemprop', 'author', webpage) or '', + get_element_by_attribute('itemprop', 'author', webpage or '') or '', 'author link', default='')), paths[var_name][0]) From 58fc5bde47215d9e7c60647dd21202a254b3b066 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 23 Jun 2023 00:15:06 +0100 Subject: [PATCH 1501/1705] [workflows/ci.yml] Restore test support for Py 3.3, 3.4, and add 2.6 --- .github/workflows/ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9be4eaa89..4008cc190 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,9 +8,7 @@ jobs: fail-fast: true matrix: os: [ubuntu-20.04] - # TODO: python 2.6 - # TODO: restore support for 3.3, 3.4 - python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + python-version: [2.6, 2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] python-impl: [cpython] ytdl-test-set: [core, download] run-tests-ext: [sh] From 
2500300c2a5986ace34390aa473a8bd51f83622c Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 29 Jun 2023 15:27:12 +0100 Subject: [PATCH 1502/1705] [workflows/ci.yml] Restore test support for Py 3.2 --- .github/workflows/ci.yml | 319 +++++++++++++++++++++++++++-- devscripts/make_lazy_extractors.py | 4 + test/test_execution.py | 8 +- test/test_unicode_literals.py | 1 + youtube_dl/__init__.py | 8 +- youtube_dl/compat.py | 18 +- 6 files changed, 328 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4008cc190..8d8e654fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,73 +1,349 @@ name: CI -on: [push, pull_request] + +env: + # add 3.10+ after patching nose (https://github.com/nose-devs/nose/issues/1099) + # or switching to fork of https://github.com/mdmintz/pynose + all-cpython-versions: 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9 + main-cpython-versions: 2.7, 3.2, 3.5, 3.9 + pypy-versions: pypy-2.7, pypy-3.6, pypy-3.7 + cpython-versions: all + # test-set: both + test-set: core + +on: + push: + pull_request: + workflow_dispatch: + inputs: + cpython-versions: + type: choice + description: CPython versions (main = 2.7, 3.2, 3.5, 3.9) + options: + - all + - main + required: true + default: main + test-set: + type: choice + description: core, download + options: + - both + - core + - download + required: true + default: core + +permissions: + contents: read + jobs: + select: + name: Select tests from inputs + runs-on: ubuntu-latest + outputs: + cpython-versions: ${{ steps.run.outputs.cpython-versions }} + test-set: ${{ steps.run.outputs.test-set }} + own-pip-versions: ${{ steps.run.outputs.own-pip-versions }} + steps: + - id: run + run: | + # Make a JSON Array from comma/space-separated string (no extra escaping) + json_list() { \ + ret=""; IFS="${IFS},"; set -- $*; \ + for a in "$@"; do \ + ret=$(printf '%s"%s"' "${ret}${ret:+, }" "$a"); \ + done; \ + printf '[%s]' "$ret"; } + tests="${{ 
inputs.test-set || env.test-set }}" + [ $tests = both ] && tests="core download" + printf 'test-set=%s\n' "$(json_list $tests)" >> "$GITHUB_OUTPUT" + versions="${{ inputs.cpython-versions || env.cpython-versions }}" + if [ "$versions" = all ]; then \ + versions="${{ env.all-cpython-versions }}"; else \ + versions="${{ env.main-cpython-versions }}"; \ + fi + printf 'cpython-versions=%s\n' \ + "$(json_list ${versions}${versions:+, }${{ env.pypy-versions }})" >> "$GITHUB_OUTPUT" + # versions with a special get-pip.py in a per-version subdirectory + printf 'own-pip-versions=%s\n' \ + "$(json_list 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6)" >> "$GITHUB_OUTPUT" + tests: - name: Tests + name: Run tests + needs: select + permissions: + contents: read + packages: write runs-on: ${{ matrix.os }} strategy: fail-fast: true matrix: os: [ubuntu-20.04] - python-version: [2.6, 2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + # outside steps, use github.env...., not env.... + python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }} python-impl: [cpython] - ytdl-test-set: [core, download] + ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }} run-tests-ext: [sh] include: - # python 3.2 is only available on windows via setup-python - os: windows-2019 python-version: 3.2 python-impl: cpython - ytdl-test-set: core + ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} run-tests-ext: bat - os: windows-2019 python-version: 3.2 python-impl: cpython - ytdl-test-set: download + ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} run-tests-ext: bat # jython - os: ubuntu-20.04 python-impl: jython - ytdl-test-set: core + ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} run-tests-ext: sh - os: ubuntu-20.04 python-impl: jython - ytdl-test-set: download + ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 
'download' || 'nodownload' }} run-tests-ext: sh steps: - - uses: actions/checkout@v3 + - name: Checkout + uses: actions/checkout@v3 + #-------- Python 3 ----- - name: Set up supported Python ${{ matrix.python-version }} + id: setup-python + if: ${{ matrix.python-impl == 'cpython' && matrix.python-version != '2.6' && matrix.python-version != '2.7'}} # wrap broken actions/setup-python@v4 uses: ytdl-org/setup-python@v1 with: python-version: ${{ matrix.python-version }} cache-build: true allow-build: info + - name: Locate supported Python ${{ matrix.python-version }} + if: ${{ env.pythonLocation }} + shell: bash + run: | + echo "PYTHONHOME=${pythonLocation}" >> "$GITHUB_ENV" + export expected="${{ steps.setup-python.outputs.python-path }}" + dirname() { printf '%s\n' \ + 'import os, sys' \ + 'print(os.path.dirname(sys.argv[1]))' \ + | ${expected} - "$1"; } + expd="$(dirname "$expected")" + export python="$(command -v python)" + [ "$expd" = "$(dirname "$python")" ] || echo "PATH=$expd:${PATH}" >> "$GITHUB_ENV" + [ -x "$python" ] || printf '%s\n' \ + 'import os' \ + 'exp = os.environ["expected"]' \ + 'python = os.environ["python"]' \ + 'exps = os.path.split(exp)' \ + 'if python and (os.path.dirname(python) == exp[0]):' \ + ' exit(0)' \ + 'exps[1] = "python" + os.path.splitext(exps[1])[1]' \ + 'python = os.path.join(*exps)' \ + 'try:' \ + ' os.symlink(exp, python)' \ + 'except AttributeError:' \ + ' os.rename(exp, python)' \ + | ${expected} - + printf '%s\n' \ + 'import sys' \ + 'print(sys.path)' \ + | ${expected} - + #-------- Python 2.7 -- + - name: Set up Python 2.7 + if: ${{ matrix.python-version == '2.7' }} + # install 2.7 + run: | + sudo apt-get install -y python2 python-is-python2 + echo "PYTHONHOME=/usr" >> "$GITHUB_ENV" + #-------- Python 2.6 -- + - name: Set up Python 2.6 environment + if: ${{ matrix.python-version == '2.6' }} + run: | + openssl_name=openssl-1.0.2u + echo "openssl_name=${openssl_name}" >> "$GITHUB_ENV" + 
openssl_dir=$HOME/.local/opt/$openssl_name + echo "openssl_dir=${openssl_dir}" >> "$GITHUB_ENV" + PYENV_ROOT=$HOME/.local/share/pyenv + echo "PYENV_ROOT=${PYENV_ROOT}" >> "$GITHUB_ENV" + sudo apt-get install -y openssl ca-certificates + - name: Cache Python 2.6 + id: cache26 + if: ${{ matrix.python-version == '2.6' }} + uses: actions/cache@v3 + with: + key: python-2.6.9 + path: | + ${{ env.openssl_dir }} + ${{ env.PYENV_ROOT }} + - name: Build and set up Python 2.6 + if: ${{ matrix.python-version == '2.6' && ! steps.cache26.outputs.cache-hit }} + # dl and build locally + run: | + # Install build environment + sudo apt-get install -y build-essential llvm libssl-dev tk-dev \ + libncursesw5-dev libreadline-dev libsqlite3-dev \ + libffi-dev xz-utils zlib1g-dev libbz2-dev liblzma-dev + # Download and install OpenSSL 1.0.2, back in time + openssl_name=${{ env.openssl_name }} + openssl_targz=${openssl_name}.tar.gz + openssl_dir=${{ env.openssl_dir }} + openssl_inc=$openssl_dir/include + openssl_lib=$openssl_dir/lib + openssl_ssl=$openssl_dir/ssl + curl -L "https://www.openssl.org/source/$openssl_targz" -o $openssl_targz + tar -xf $openssl_targz + ( cd $openssl_name; \ + ./config --prefix=$openssl_dir --openssldir=${openssl_dir}/ssl \ + --libdir=lib -Wl,-rpath=${openssl_dir}/lib shared zlib-dynamic && \ + make && \ + make install ) + rm -rf $openssl_name + rmdir $openssl_ssl/certs && ln -s /etc/ssl/certs $openssl_ssl/certs + + # Download PyEnv from its GitHub repository. 
+ export PYENV_ROOT=${{ env.PYENV_ROOT }} + export PATH=$PYENV_ROOT/bin:$PATH + git clone https://github.com/pyenv/pyenv.git $PYENV_ROOT + eval "$(pyenv init --path)" + + # Prevent pyenv build trying (and failing) to update pip + export GET_PIP=get-pip-2.6.py + echo 'import sys; sys.exit(0)' > ${GET_PIP} + GET_PIP=$(realpath $GET_PIP) + + # Build and install Python + export CFLAGS="-I$openssl_inc" + export LDFLAGS="-L$openssl_lib" + export LD_LIBRARY_PATH="$openssl_lib" + pyenv install 2.6.9 + echo "PYTHONHOME=${PYENV_ROOT}" >> "$GITHUB_ENV" + echo "PATH=$PYENV_ROOT/bin:$PATH" >> "$GITHUB_ENV" + - name: Set up cached Python 2.6 + if: ${{ steps.cache26.outputs.cache-hit }} + run: | + export PYENV_ROOT + export PATH=$PYENV_ROOT/bin:$PATH + eval "$(pyenv init --path)" + pyenv local 2.6.9 + echo "PYTHONHOME=${PYENV_ROOT}" >> "$GITHUB_ENV" + echo "PATH=$PYENV_ROOT/bin:$PATH" >> "$GITHUB_ENV" + #-------- Jython ------ - name: Set up Java 8 if: ${{ matrix.python-impl == 'jython' }} uses: actions/setup-java@v2 with: java-version: 8 distribution: 'zulu' - - name: Install Jython + - name: Setup Jython environment if: ${{ matrix.python-impl == 'jython' }} run: | - wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar - java -jar jython-installer.jar -s -d "$HOME/jython" - echo "$HOME/jython/bin" >> $GITHUB_PATH - - name: Install nose - if: ${{ matrix.python-impl != 'jython' }} - run: pip install nose + echo "JYTHON_ROOT=${HOME}/jython" >> "$GITHUB_ENV" + - name: Cache Jython + id: cachejy + if: ${{ matrix.python-impl == 'jython' }} + uses: actions/cache@v3 + with: + # 2.7.3 now available, may solve SNI issue + key: jython-2.7.1 + path: | + ${{ env.JYTHON_ROOT }} + - name: Install Jython + if: ${{ matrix.python-impl == 'jython' && ! 
steps.cachejy.outputs.cache-hit }} + run: | + JYTHON_ROOT="${{ env.JYTHON_ROOT }}" + curl -L "https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar" -o jython-installer.jar + java -jar jython-installer.jar -s -d "${JYTHON_ROOT}" + echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH + - name: Set up cached Jython + if: ${{ steps.cachejy.outputs.cache-hit }} + run: | + JYTHON_ROOT="${{ env.JYTHON_ROOT }}" + echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH + #-------- pip --------- + - name: Set up supported Python ${{ matrix.python-version }} pip + if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.6' || matrix.python-version == '2.7' }} + # This step may run in either Linux or Windows + shell: bash + run: | + echo "$PATH" + echo "$PYTHONHOME" + # curl is available on both Windows and Linux, -L follows redirects, -O gets name + python -m ensurepip || python -m pip --version || { \ + get_pip="${{ contains(needs.select.outputs.own-pip-versions, matrix.python-version) && format('{0}/', matrix.python-version) || '' }}"; \ + curl -L -O "https://bootstrap.pypa.io/pip/${get_pip}get-pip.py"; \ + python get-pip.py; } + - name: Set up other Python ${{ matrix.python-version }} pip + if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }} + shell: bash + run: | + # https://files.pythonhosted.org/packages/8a/e9/8468cd68b582b06ef554be0b96b59f59779627131aad48f8a5bce4b13450/wheel-0.29.0-py2.py3-none-any.whl + # https://files.pythonhosted.org/packages/06/4b/86a670fd21f7849adb092e40883c48dcd0d66b8a878fc8d63b7f0ea04213/setuptools-29.0.1-py2.py3-none-any.whl + python -m pip --version || { \ + curl -L -O "https://bootstrap.pypa.io/pip/3.2/get-pip.py"; \ + curl -L -O "https://files.pythonhosted.org/packages/b2/d0/cd115fe345dd6f07ec1c780020a7dfe74966fceeb171e0f20d1d4905b0b7/pip-7.1.2-py2.py3-none-any.whl"; \ + python -v get-pip.py --no-setuptools --no-wheel 
pip-7.1.2-py2.py3-none-any.whl; } + + #-------- nose -------- + - name: Install nose for Python ${{ matrix.python-version }} + if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.6' || matrix.python-version == '2.7' }} + shell: bash + run: | + echo "$PATH" + echo "$PYTHONHOME" + python --version + python -m pip --version + python -m pip nose --version || python -m pip install nose + - name: Install nose for other Python ${{ matrix.python-version }} + if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }} + shell: bash + run: | + python -m pip nose --version || { \ + curl -L -O "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl"; \ + python --version; \ + printf '%s\n' \ + 'import sys' \ + 'print(sys.path)' \ + | python -; \ + python -m pip --version; \ + python -m pip install nose-1.3.7-py3-none-any.whl; } - name: Install nose (Jython) if: ${{ matrix.python-impl == 'jython' }} - # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + # Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) run: | - wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl - pip install nose-1.3.7-py2-none-any.whl + pip nose --version || { \ + curl -L -O "https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl"; \ + pip --version; \ + pip install nose-1.3.7-py2-none-any.whl; } + - name: Set up nosetest test + if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }} + shell: bash + run: | + # define a test to validate the Python version used by nosetests + printf '%s\n' \ + 'from __future__ import 
unicode_literals' \ + 'import sys, os, platform, unittest' \ + 'class TestPython(unittest.TestCase):' \ + ' def setUp(self):' \ + ' self.ver = os.environ["PYTHON_VER"].split("-")' \ + ' def test_python_ver(self):' \ + ' self.assertEqual(sys.version[:3], self.ver[-1])' \ + ' self.assertTrue(sys.version.startswith(self.ver[-1]))' \ + ' self.assertIn(self.ver[0], sys.version.lower())' \ + ' def test_python_impl(self):' \ + ' self.assertIn(platform.python_implementation().lower(), (os.environ["PYTHON_IMPL"], self.ver[0]))' \ + > test/test_python.py + #-------- TESTS ------- - name: Run tests + if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }} continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} - run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} + PYTHON_VER: ${{ matrix.python-version }} + PYTHON_IMPL: ${{ matrix.python-impl }} + + run: | + ./devscripts/run_tests.${{ matrix.run-tests-ext }} + flake8: name: Linter runs-on: ubuntu-latest @@ -81,3 +357,4 @@ jobs: run: pip install flake8 - name: Run flake8 run: flake8 . 
+ diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index edc19183d..4bddca047 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -6,6 +6,10 @@ import os from os.path import dirname as dirn import sys +from youtube_dl.compat import compat_register_utf8 + +compat_register_utf8() + print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) diff --git a/test/test_execution.py b/test/test_execution.py index 704e14612..1dee53a0f 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -10,10 +10,13 @@ import os import subprocess sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from youtube_dl.compat import compat_register_utf8 + from youtube_dl.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +compat_register_utf8() try: _DEV_NULL = subprocess.DEVNULL @@ -25,13 +28,14 @@ class TestExecution(unittest.TestCase): def test_import(self): subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir) + @unittest.skipIf(sys.version_info < (2, 7), 'Python 2.6 doesn\'t support package execution') def test_module_exec(self): - if sys.version_info >= (2, 7): # Python 2.6 doesn't support package execution - subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + @unittest.skipIf(sys.version_info < (2, 7), 'Python 2.6 doesn\'t support package execution') def test_cmdline_umlauts(self): p = subprocess.Popen( [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], diff --git 
a/test/test_unicode_literals.py b/test/test_unicode_literals.py index 6c1b7ec91..c7c2252f5 100644 --- a/test/test_unicode_literals.py +++ b/test/test_unicode_literals.py @@ -15,6 +15,7 @@ IGNORED_FILES = [ 'setup.py', # http://bugs.python.org/issue13943 'conf.py', 'buildserver.py', + 'get-pip.py', ] IGNORED_DIRS = [ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e1bd67919..cc8285eba 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -5,7 +5,6 @@ from __future__ import unicode_literals __license__ = 'Public Domain' -import codecs import io import os import random @@ -17,6 +16,7 @@ from .options import ( ) from .compat import ( compat_getpass, + compat_register_utf8, compat_shlex_split, workaround_optparse_bug9161, ) @@ -46,10 +46,8 @@ from .YoutubeDL import YoutubeDL def _real_main(argv=None): - # Compatibility fixes for Windows - if sys.platform == 'win32': - # https://github.com/ytdl-org/youtube-dl/issues/820 - codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + # Compatibility fix for Windows + compat_register_utf8() workaround_optparse_bug9161() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index fe62caf80..0f4d3756f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -31,13 +31,17 @@ try: compat_str, compat_basestring, compat_chr = ( unicode, basestring, unichr ) - from .casefold import casefold as compat_casefold - except NameError: compat_str, compat_basestring, compat_chr = ( str, str, chr ) + +# casefold +try: + compat_str.casefold compat_casefold = lambda s: s.casefold() +except AttributeError: + from .casefold import casefold as compat_casefold try: import collections.abc as compat_collections_abc @@ -3137,6 +3141,15 @@ else: compat_open = open +# compat_register_utf8 +def compat_register_utf8(): + if sys.platform == 'win32': + # https://github.com/ytdl-org/youtube-dl/issues/820 + from codecs import register, lookup + register( + lambda name: lookup('utf-8') if 
name == 'cp65001' else None) + + legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -3203,6 +3216,7 @@ __all__ = [ 'compat_print', 'compat_re_Match', 'compat_re_Pattern', + 'compat_register_utf8', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', From b08a58090635777f1001d5cde2cd141a5565177c Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 30 Jun 2023 03:52:39 +0100 Subject: [PATCH 1503/1705] [workflows/ci.yml] Fix test support for Py 2.6 --- .github/workflows/ci.yml | 115 ++++++++++++++++++----------- devscripts/make_lazy_extractors.py | 8 +- test/test_execution.py | 16 ++-- 3 files changed, 83 insertions(+), 56 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8d8e654fb..ce878c1b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,9 +6,8 @@ env: all-cpython-versions: 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9 main-cpython-versions: 2.7, 3.2, 3.5, 3.9 pypy-versions: pypy-2.7, pypy-3.6, pypy-3.7 - cpython-versions: all - # test-set: both - test-set: core + cpython-versions: main + test-set: both on: push: @@ -75,6 +74,10 @@ jobs: contents: read packages: write runs-on: ${{ matrix.os }} + env: + PIP: python -m pip + PIP_DISABLE_PIP_VERSION_CHECK: true + PIP_NO_PYTHON_VERSION_WARNING: true strategy: fail-fast: true matrix: @@ -152,12 +155,14 @@ jobs: - name: Set up Python 2.7 if: ${{ matrix.python-version == '2.7' }} # install 2.7 + shell: bash run: | sudo apt-get install -y python2 python-is-python2 echo "PYTHONHOME=/usr" >> "$GITHUB_ENV" #-------- Python 2.6 -- - name: Set up Python 2.6 environment if: ${{ matrix.python-version == '2.6' }} + shell: bash run: | openssl_name=openssl-1.0.2u echo "openssl_name=${openssl_name}" >> "$GITHUB_ENV" @@ -178,6 +183,7 @@ jobs: - name: Build and set up Python 2.6 if: ${{ matrix.python-version == '2.6' && ! 
steps.cache26.outputs.cache-hit }} # dl and build locally + shell: bash run: | # Install build environment sudo apt-get install -y build-essential llvm libssl-dev tk-dev \ @@ -203,8 +209,7 @@ jobs: # Download PyEnv from its GitHub repository. export PYENV_ROOT=${{ env.PYENV_ROOT }} export PATH=$PYENV_ROOT/bin:$PATH - git clone https://github.com/pyenv/pyenv.git $PYENV_ROOT - eval "$(pyenv init --path)" + git clone "https://github.com/pyenv/pyenv.git" "$PYENV_ROOT" # Prevent pyenv build trying (and failing) to update pip export GET_PIP=get-pip-2.6.py @@ -216,17 +221,14 @@ jobs: export LDFLAGS="-L$openssl_lib" export LD_LIBRARY_PATH="$openssl_lib" pyenv install 2.6.9 - echo "PYTHONHOME=${PYENV_ROOT}" >> "$GITHUB_ENV" - echo "PATH=$PYENV_ROOT/bin:$PATH" >> "$GITHUB_ENV" - - name: Set up cached Python 2.6 - if: ${{ steps.cache26.outputs.cache-hit }} + - name: Locate Python 2.6 + if: ${{ matrix.python-version == '2.6' }} + shell: bash run: | - export PYENV_ROOT - export PATH=$PYENV_ROOT/bin:$PATH - eval "$(pyenv init --path)" - pyenv local 2.6.9 - echo "PYTHONHOME=${PYENV_ROOT}" >> "$GITHUB_ENV" - echo "PATH=$PYENV_ROOT/bin:$PATH" >> "$GITHUB_ENV" + PYTHONHOME="${{ env.PYENV_ROOT }}/versions/2.6.9" + echo "PYTHONHOME=$PYTHONHOME" >> "$GITHUB_ENV" + echo "PATH=${PYTHONHOME}/bin:$PATH" >> "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=${{ env.openssl_dir }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH}" >> "$GITHUB_ENV" #-------- Jython ------ - name: Set up Java 8 if: ${{ matrix.python-impl == 'jython' }} @@ -236,8 +238,10 @@ jobs: distribution: 'zulu' - name: Setup Jython environment if: ${{ matrix.python-impl == 'jython' }} + shell: bash run: | echo "JYTHON_ROOT=${HOME}/jython" >> "$GITHUB_ENV" + echo "PIP=pip" >> "$GITHUB_ENV" - name: Cache Jython id: cachejy if: ${{ matrix.python-impl == 'jython' }} @@ -249,19 +253,21 @@ jobs: ${{ env.JYTHON_ROOT }} - name: Install Jython if: ${{ matrix.python-impl == 'jython' && ! 
steps.cachejy.outputs.cache-hit }} + shell: bash run: | JYTHON_ROOT="${{ env.JYTHON_ROOT }}" curl -L "https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar" -o jython-installer.jar java -jar jython-installer.jar -s -d "${JYTHON_ROOT}" - echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH + echo "${JYTHON_ROOT}/bin" >> "$GITHUB_PATH" - name: Set up cached Jython if: ${{ steps.cachejy.outputs.cache-hit }} + shell: bash run: | JYTHON_ROOT="${{ env.JYTHON_ROOT }}" echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH #-------- pip --------- - name: Set up supported Python ${{ matrix.python-version }} pip - if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.6' || matrix.python-version == '2.7' }} + if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.7' }} # This step may run in either Linux or Windows shell: bash run: | @@ -272,48 +278,66 @@ jobs: get_pip="${{ contains(needs.select.outputs.own-pip-versions, matrix.python-version) && format('{0}/', matrix.python-version) || '' }}"; \ curl -L -O "https://bootstrap.pypa.io/pip/${get_pip}get-pip.py"; \ python get-pip.py; } + - name: Set up Python 2.6 pip + if: ${{ matrix.python-version == '2.6' }} + shell: bash + run: | + python -m pip --version || { \ + curl -L -O "https://bootstrap.pypa.io/pip/2.6/get-pip.py"; \ + curl -L -O "https://files.pythonhosted.org/packages/ac/95/a05b56bb975efa78d3557efa36acaf9cf5d2fd0ee0062060493687432e03/pip-9.0.3-py2.py3-none-any.whl"; \ + python get-pip.py --no-setuptools --no-wheel pip-9.0.3-py2.py3-none-any.whl; } + # work-around to invoke pip module on 2.6: https://bugs.python.org/issue2751 + echo "PIP=python -m pip.__main__" >> "$GITHUB_ENV" - name: Set up other Python ${{ matrix.python-version }} pip if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }} shell: bash run: | - # 
https://files.pythonhosted.org/packages/8a/e9/8468cd68b582b06ef554be0b96b59f59779627131aad48f8a5bce4b13450/wheel-0.29.0-py2.py3-none-any.whl - # https://files.pythonhosted.org/packages/06/4b/86a670fd21f7849adb092e40883c48dcd0d66b8a878fc8d63b7f0ea04213/setuptools-29.0.1-py2.py3-none-any.whl python -m pip --version || { \ curl -L -O "https://bootstrap.pypa.io/pip/3.2/get-pip.py"; \ curl -L -O "https://files.pythonhosted.org/packages/b2/d0/cd115fe345dd6f07ec1c780020a7dfe74966fceeb171e0f20d1d4905b0b7/pip-7.1.2-py2.py3-none-any.whl"; \ - python -v get-pip.py --no-setuptools --no-wheel pip-7.1.2-py2.py3-none-any.whl; } - + python get-pip.py --no-setuptools --no-wheel pip-7.1.2-py2.py3-none-any.whl; } + #-------- unittest ---- + - name: Upgrade Unittest for Python 2.6 + if: ${{ matrix.python-version == '2.6' }} + shell: bash + run: | + # see pip for Jython + $PIP -qq show unittest2 || { \ + for u in "65/26/32b8464df2a97e6dd1b656ed26b2c194606c16fe163c695a992b36c11cdf/six-1.13.0-py2.py3-none-any.whl" \ + "f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl" \ + "c7/a3/c5da2a44c85bfbb6eebcfc1dde24933f8704441b98fdde6528f4831757a6/linecache2-1.0.0-py2.py3-none-any.whl" \ + "17/0a/6ac05a3723017a967193456a2efa0aa9ac4b51456891af1e2353bb9de21e/traceback2-1.4.0-py2.py3-none-any.whl" \ + "72/20/7f0f433060a962200b7272b8c12ba90ef5b903e218174301d0abfd523813/unittest2-1.1.0-py2.py3-none-any.whl"; do \ + curl -L -O "https://files.pythonhosted.org/packages/${u}"; \ + $PIP install ${u##*/}; \ + done; } + # make tests use unittest2 + for test in ./test/test_*.py; do + sed -r -i -e '/^import unittest$/s/test/test2 as unittest/' "$test" + done #-------- nose -------- - name: Install nose for Python ${{ matrix.python-version }} - if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.6' || matrix.python-version == '2.7' }} + if: ${{ (matrix.python-version != '3.2' && 
steps.setup-python.outputs.python-path) || matrix.python-version == '2.7' }} shell: bash run: | echo "$PATH" echo "$PYTHONHOME" - python --version - python -m pip --version - python -m pip nose --version || python -m pip install nose - - name: Install nose for other Python ${{ matrix.python-version }} + $PIP -qq show nose || $PIP install nose + - name: Install nose for other Python 2 + if: ${{ matrix.python-impl == 'jython' || matrix.python-version == '2.6' }} + shell: bash + run: | + # Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + $PIP -qq show nose || { \ + curl -L -O "https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl"; \ + $PIP install nose-1.3.7-py2-none-any.whl; } + - name: Install nose for other Python 3 if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }} shell: bash run: | - python -m pip nose --version || { \ + $PIP -qq show nose || { \ curl -L -O "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl"; \ - python --version; \ - printf '%s\n' \ - 'import sys' \ - 'print(sys.path)' \ - | python -; \ - python -m pip --version; \ - python -m pip install nose-1.3.7-py3-none-any.whl; } - - name: Install nose (Jython) - if: ${{ matrix.python-impl == 'jython' }} - # Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) - run: | - pip nose --version || { \ - curl -L -O "https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl"; \ - pip --version; \ - pip install nose-1.3.7-py2-none-any.whl; } + $PIP install nose-1.3.7-py3-none-any.whl; } - name: Set up nosetest test if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }} shell: bash 
@@ -321,7 +345,11 @@ jobs: # define a test to validate the Python version used by nosetests printf '%s\n' \ 'from __future__ import unicode_literals' \ - 'import sys, os, platform, unittest' \ + 'import sys, os, platform' \ + 'try:' \ + ' import unittest2 as unittest' \ + 'except ImportError:' \ + ' import unittest' \ 'class TestPython(unittest.TestCase):' \ ' def setUp(self):' \ ' self.ver = os.environ["PYTHON_VER"].split("-")' \ @@ -340,7 +368,6 @@ jobs: YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} PYTHON_VER: ${{ matrix.python-version }} PYTHON_IMPL: ${{ matrix.python-impl }} - run: | ./devscripts/run_tests.${{ matrix.run-tests-ext }} diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 4bddca047..a8b6ff1b9 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -6,10 +6,6 @@ import os from os.path import dirname as dirn import sys -from youtube_dl.compat import compat_register_utf8 - -compat_register_utf8() - print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) @@ -23,6 +19,10 @@ try: except OSError: pass +from youtube_dl.compat import compat_register_utf8 + +compat_register_utf8() + from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor diff --git a/test/test_execution.py b/test/test_execution.py index 1dee53a0f..35e7a5651 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -11,13 +11,12 @@ import subprocess sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import compat_register_utf8 - from youtube_dl.utils import encodeArgument -rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - compat_register_utf8() +rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + try: _DEV_NULL = subprocess.DEVNULL except 
AttributeError: @@ -33,21 +32,22 @@ class TestExecution(unittest.TestCase): subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_main_exec(self): - subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, os.path.normpath('youtube_dl/__main__.py'), '--version'], cwd=rootDir, stdout=_DEV_NULL) @unittest.skipIf(sys.version_info < (2, 7), 'Python 2.6 doesn\'t support package execution') def test_cmdline_umlauts(self): + os.environ['PYTHONIOENCODING'] = 'utf-8' p = subprocess.Popen( - [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], + [sys.executable, os.path.normpath('youtube_dl/__main__.py'), encodeArgument('ä'), '--version'], cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) _, stderr = p.communicate() self.assertFalse(stderr) def test_lazy_extractors(self): - lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py' + lazy_extractors = os.path.normpath('youtube_dl/extractor/lazy_extractors.py') try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) - subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, os.path.normpath('devscripts/make_lazy_extractors.py'), lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, os.path.normpath('test/test_all_urls.py')], cwd=rootDir, stdout=_DEV_NULL) finally: for x in ['', 'c'] if sys.version_info[0] < 3 else ['']: try: From f24bc9272e9b74efc4c4af87c862f5f78921d424 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 4 Jul 2023 16:06:21 +0100 Subject: [PATCH 1504/1705] [Misc] Fixes for 2.6 compatibility --- test/test_jsinterp.py | 10 ++++++---- test/test_utils.py | 2 +- youtube_dl/YoutubeDL.py | 6 +++++- youtube_dl/compat.py | 12 ++++++++++++ 
youtube_dl/jsinterp.py | 13 ++++++++++++- youtube_dl/utils.py | 3 ++- 6 files changed, 38 insertions(+), 8 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index ecd6ab3c9..91b12f544 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -492,10 +492,12 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; } ''') - attrs = set(('findall', 'finditer', 'flags', 'groupindex', - 'groups', 'match', 'pattern', 'scanner', - 'search', 'split', 'sub', 'subn')) - self.assertTrue(set(dir(jsi.call_function('x'))) > attrs) + attrs = set(('findall', 'finditer', 'match', 'scanner', 'search', + 'split', 'sub', 'subn')) + if sys.version_info >= (2, 7): + # documented for 2.6 but may not be found + attrs.update(('flags', 'groupindex', 'groups', 'pattern')) + self.assertSetEqual(set(dir(jsi.call_function('x'))) & attrs, attrs) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } diff --git a/test/test_utils.py b/test/test_utils.py index b85d397d0..5fab05f7c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1612,7 +1612,7 @@ Line 1 self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), [_TEST_DATA['urls']], msg='function as query key should perform a filter based on (key, value)') - self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), {'str'}, + self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), ('str',), msg='exceptions in the query function should be caught') # Test alternative paths diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 98b878fc1..068029d3e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -25,7 +25,11 @@ import tokenize import traceback import random -from ssl import OPENSSL_VERSION +try: + from ssl import OPENSSL_VERSION +except ImportError: + # Must be Python 
2.6, should be built against 1.0.2 + OPENSSL_VERSION = 'OpenSSL 1.0.2(?)' from string import ascii_letters from .compat import ( diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0f4d3756f..2554fd1c3 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +from __future__ import division import base64 import binascii import collections import ctypes +import datetime import email import getpass import io @@ -3150,6 +3152,15 @@ def compat_register_utf8(): lambda name: lookup('utf-8') if name == 'cp65001' else None) +# compat_datetime_timedelta_total_seconds +try: + compat_datetime_timedelta_total_seconds = datetime.timedelta.total_seconds +except AttributeError: + # Py 2.6 + def compat_datetime_timedelta_total_seconds(td): + return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 + + legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -3187,6 +3198,7 @@ __all__ = [ 'compat_chr', 'compat_collections_abc', 'compat_collections_chain_map', + 'compat_datetime_timedelta_total_seconds', 'compat_http_cookiejar', 'compat_http_cookiejar_Cookie', 'compat_http_cookies', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 1ba9c3d67..882432b80 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -277,9 +277,20 @@ class JSInterpreter(object): def __getattr__(self, name): self.__instantiate() + # make Py 2.6 conform to its lying documentation + if name == 'flags': + self.flags = self.__flags + elif name == 'pattern': + self.pattern = self.__pattern_txt + elif name in ('groupindex', 'groups'): + # in case these get set after a match? 
+ if hasattr(self.__self, name): + setattr(self, name, getattr(self.__self, name)) + else: + return 0 if name == 'groupindex' else {} if hasattr(self, name): return getattr(self, name) - return super(JSInterpreter.JS_RegExp, self).__getattr__(name) + raise AttributeError('{0} has no attribute named {1}'.format(self, name)) @classmethod def regex_flags(cls, expr): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 584581b6a..83f67bd95 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,6 +47,7 @@ from .compat import ( compat_collections_abc, compat_cookiejar, compat_ctypes_WINFUNCTYPE, + compat_datetime_timedelta_total_seconds, compat_etree_fromstring, compat_expanduser, compat_html_entities, @@ -3102,7 +3103,7 @@ def unified_timestamp(date_str, day_first=True): pass timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() + return calendar.timegm(timetuple) + pm_delta * 3600 - compat_datetime_timedelta_total_seconds(timezone) def determine_ext(url, default_ext='unknown_video'): From b6dff4073d469cceadb099c00ccbf3bd6fc515a6 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 7 Jul 2023 18:41:32 +0100 Subject: [PATCH 1505/1705] [core] Revert version display from b8a86dc --- youtube_dl/YoutubeDL.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 068029d3e..4e7fd1063 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2378,10 +2378,12 @@ class YoutubeDL(object): self.get_encoding())) write_string(encoding_str, encoding=None) - self._write_string('[debug] youtube-dl version ' + __version__ + (' (single file build)\n' if ytdl_is_updateable() else '\n')) + writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) + writeln_debug('youtube-dl version ', __version__) if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled\n') - 
writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) # moved down for easier merge + writeln_debug('Lazy loading extractors enabled') + if ytdl_is_updateable(): + writeln_debug('Single file build') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], From f47fdb9564d3ca1c0fa70ed6031148ec908fdc7b Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Jul 2023 15:46:22 +0100 Subject: [PATCH 1506/1705] [utils] Add {expected_type} and Iterable support to traverse_obj() --- test/test_utils.py | 153 ++++++++++++++++++++++++++------ youtube_dl/utils.py | 211 +++++++++++++++++++++++++++++--------------- 2 files changed, 265 insertions(+), 99 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5fab05f7c..1fc16ed05 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -79,10 +79,12 @@ from youtube_dl.utils import ( rot47, shell_quote, smuggle_url, + str_or_none, str_to_int, strip_jsonp, strip_or_none, subtitles_filename, + T, timeconvert, traverse_obj, try_call, @@ -1566,6 +1568,7 @@ Line 1 self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') def test_traverse_obj(self): + str = compat_str _TEST_DATA = { 100: 100, 1.2: 1.2, @@ -1598,8 +1601,8 @@ Line 1 # Test Ellipsis behavior self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis), - (item for item in _TEST_DATA.values() if item is not None), - msg='`...` should give all values except `None`') + (item for item in _TEST_DATA.values() if item not in (None, {})), + msg='`...` should give all non discarded values') self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(), msg='`...` selection for dicts should select all values') self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')), @@ -1607,13 +1610,51 @@ Line 1 msg='nested `...` queries should work') self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4), msg='`...` query result should be flattened') + 
self.assertEqual(traverse_obj(iter(range(4)), Ellipsis), list(range(4)), + msg='`...` should accept iterables') # Test function as key self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), [_TEST_DATA['urls']], msg='function as query key should perform a filter based on (key, value)') - self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), ('str',), - msg='exceptions in the query function should be caught') + self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, + msg='exceptions in the query function should be catched') + self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2], + msg='function key should accept iterables') + if __debug__: + with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): + traverse_obj(_TEST_DATA, lambda a: Ellipsis) + with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): + traverse_obj(_TEST_DATA, lambda a, b, c: Ellipsis) + + # Test set as key (transformation/type, like `expected_type`) + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str.upper), )), ['STR'], + msg='Function in set should be a transformation') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str))), ['str'], + msg='Type in set should be a type filter') + self.assertEqual(traverse_obj(_TEST_DATA, T(dict)), _TEST_DATA, + msg='A single set should be wrapped into a path') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str.upper))), ['STR'], + msg='Transformation function should not raise') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, T(str_or_none))), + [item for item in map(str_or_none, _TEST_DATA.values()) if item is not None], + msg='Function in set should be a transformation') + if __debug__: + with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): + traverse_obj(_TEST_DATA, set()) + with 
self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): + traverse_obj(_TEST_DATA, {str.upper, str}) + + # Test `slice` as a key + _SLICE_DATA = [0, 1, 2, 3, 4] + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None, + msg='slice on a dictionary should not throw') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2], + msg='slice key should apply slice to sequence') # Test alternative paths self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', @@ -1659,15 +1700,23 @@ Line 1 {0: ['https://www.example.com/1', 'https://www.example.com/0']}, msg='triple nesting in dict path should be treated as branches') self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, - msg='remove `None` values when dict key') + msg='remove `None` values when top level dict key fails') self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=Ellipsis), {0: Ellipsis}, - msg='do not remove `None` values if `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}}, - msg='do not remove empty values when dict key') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: {}}, - msg='do not remove empty values when dict key and a default') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {0: []}, - msg='if branch in dict key not successful, return `[]`') + msg='use `default` if key fails and `default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {}, + msg='remove empty values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: Ellipsis}, + msg='use `default` when dict key and `default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 
'fail'}}), {}, + msg='remove empty values when nested dict key fails') + self.assertEqual(traverse_obj(None, {0: 'fail'}), {}, + msg='default to dict if pruned') + self.assertEqual(traverse_obj(None, {0: 'fail'}, default=Ellipsis), {0: Ellipsis}, + msg='default to dict if pruned and default is given') + self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=Ellipsis), {0: {0: Ellipsis}}, + msg='use nested `default` when nested dict key fails and `default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {}, + msg='remove key if branch in dict key not successful') # Testing default parameter behavior _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} @@ -1691,20 +1740,55 @@ Line 1 msg='if branched but not successful return `[]`, not `default`') self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', Ellipsis)), [], msg='if branched but object is empty return `[]`, not `default`') + self.assertEqual(traverse_obj(None, Ellipsis), [], + msg='if branched but object is `None` return `[]`, not `default`') + self.assertEqual(traverse_obj({0: None}, (0, Ellipsis)), [], + msg='if branched but state is `None` return `[]`, not `default`') + + branching_paths = [ + ('fail', Ellipsis), + (Ellipsis, 'fail'), + 100 * ('fail',) + (Ellipsis,), + (Ellipsis,) + 100 * ('fail',), + ] + for branching_path in branching_paths: + self.assertEqual(traverse_obj({}, branching_path), [], + msg='if branched but state is `None`, return `[]` (not `default`)') + self.assertEqual(traverse_obj({}, 'fail', branching_path), [], + msg='if branching in last alternative and previous did not match, return `[]` (not `default`)') + self.assertEqual(traverse_obj({0: 'x'}, 0, branching_path), 'x', + msg='if branching in last alternative and previous did match, return single value') + self.assertEqual(traverse_obj({0: 'x'}, branching_path, 0), 'x', + msg='if branching in first alternative and non-branching path does match, return single value') + 
self.assertEqual(traverse_obj({}, branching_path, 'fail'), None, + msg='if branching in first alternative and non-branching path does not match, return `default`') # Testing expected_type behavior _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=compat_str), 'str', - msg='accept matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None, - msg='reject non matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: compat_str(x)), '0', - msg='transform type using type function') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', - expected_type=lambda _: 1 / 0), None, - msg='wrap expected_type function in try_call') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=compat_str), ['str'], - msg='eliminate items that expected_type fails on') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str), + 'str', msg='accept matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), + None, msg='reject non matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)), + '0', msg='transform type using type function') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0), + None, msg='wrap expected_type function in try_call') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=str), + ['str'], msg='eliminate items that expected_type fails on') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int), + {0: 100}, msg='type as expected_type should filter dict values') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none), + {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict 
values') + self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int), + 1, msg='expected_type should not filter non final dict values') + self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int), + {0: {0: 100}}, msg='expected_type should transform deep dict values') + self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(Ellipsis)), + [{0: Ellipsis}, {0: Ellipsis}], msg='expected_type should transform branched dict values') + self.assertEqual(traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int), + [4], msg='expected_type regression for type matching in tuple branching') + self.assertEqual(traverse_obj(_TEST_DATA, ['data', Ellipsis], expected_type=int), + [], msg='expected_type regression for type matching in dict result') # Test get_all behavior _GET_ALL_DATA = {'key': [0, 1, 2]} @@ -1749,14 +1833,23 @@ Line 1 _traverse_string=True), '.', msg='traverse into converted data if `traverse_string`') self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', Ellipsis), - _traverse_string=True), list('str'), - msg='`...` branching into string should result in list') + _traverse_string=True), 'str', + msg='`...` should result in string (same value) if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)), + _traverse_string=True), 'sr', + msg='`slice` should result in string if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"), + _traverse_string=True), 'str', + msg='function should result in string if `traverse_string`') self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), _traverse_string=True), ['s', 'r'], - msg='branching into string should result in list') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x), - _traverse_string=True), list('str'), - msg='function branching into string should result in list') + 
msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, Ellipsis), _traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, lambda x, y: True), _traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, slice(1)), _traverse_string=True), [], + msg='branching should result in list if `traverse_string`') # Test is_user_input behavior _IS_USER_INPUT_DATA = {'range8': list(range(8))} @@ -1793,6 +1886,8 @@ Line 1 msg='failing str key on a `re.Match` should return `default`') self.assertEqual(traverse_obj(mobj, 8), None, msg='failing int key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'], + msg='function on a `re.Match` should give group name as well') def test_get_first(self): self.assertEqual(get_first([{'a': None}, {'a': 'spam'}], 'a'), 'spam') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 83f67bd95..dbdbe5f59 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -16,6 +16,7 @@ import email.header import errno import functools import gzip +import inspect import io import itertools import json @@ -3881,7 +3882,7 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): return unrecognized -class LazyList(compat_collections_abc.Sequence): +class LazyList(compat_collections_abc.Iterable): """Lazy immutable list from an iterable Note that slices of a LazyList are lists and not LazyList""" @@ -4223,10 +4224,16 @@ def multipart_encode(data, boundary=None): return out, content_type -def variadic(x, allowed_types=(compat_str, bytes, dict)): - if not isinstance(allowed_types, tuple) and isinstance(allowed_types, compat_collections_abc.Iterable): +def is_iterable_like(x, allowed_types=compat_collections_abc.Iterable, blocked_types=NO_DEFAULT): + if blocked_types is 
NO_DEFAULT: + blocked_types = (compat_str, bytes, compat_collections_abc.Mapping) + return isinstance(x, allowed_types) and not isinstance(x, blocked_types) + + +def variadic(x, allowed_types=NO_DEFAULT): + if isinstance(allowed_types, compat_collections_abc.Iterable): allowed_types = tuple(allowed_types) - return x if isinstance(x, compat_collections_abc.Iterable) and not isinstance(x, allowed_types) else (x,) + return x if is_iterable_like(x, blocked_types=allowed_types) else (x,) def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -5993,7 +6000,7 @@ def clean_podcast_url(url): def traverse_obj(obj, *paths, **kwargs): """ - Safely traverse nested `dict`s and `Sequence`s + Safely traverse nested `dict`s and `Iterable`s >>> obj = [{}, {"key": "value"}] >>> traverse_obj(obj, (1, "key")) @@ -6001,14 +6008,17 @@ def traverse_obj(obj, *paths, **kwargs): Each of the provided `paths` is tested and the first producing a valid result will be returned. The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. - A value of None is treated as the absence of a value. + Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. + Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. The keys in the path can be one of: - `None`: Return the current object. - - `str`/`int`: Return `obj[key]`. For `re.Match, return `obj.group(key)`. + - `set`: Requires the only item in the set to be a type or function, + like `{type}`/`{func}`. If a `type`, returns only values + of this type. If a function, returns `func(obj)`. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - `slice`: Branch out and return all values in `obj[key]`. - `Ellipsis`: Branch out and return a list of all values. 
- `tuple`/`list`: Branch out and return a list of all matching values. @@ -6016,6 +6026,9 @@ def traverse_obj(obj, *paths, **kwargs): - `function`: Branch out and return values filtered by the function. Read as: `[value for key, value in obj if function(key, value)]`. For `Sequence`s, `key` is the index of the value. + For `Iterable`s, `key` is the enumeration count of the value. + For `re.Match`es, `key` is the group number (0 = full match) + as well as additionally any group names, if given. - `dict` Transform the current object and return a matching dict. Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. @@ -6024,8 +6037,12 @@ def traverse_obj(obj, *paths, **kwargs): @params paths Paths which to traverse by. Keyword arguments: @param default Value to return if the paths do not match. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, depth first. Try to avoid if using nested `dict` keys. @param expected_type If a `type`, only accept final values of this type. If any other callable, try to call the function on each result. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, recursively. This does respect branching paths. @param get_all If `False`, return the first matching result, otherwise all matching ones. @param casesense If `False`, consider string dictionary keys as case insensitive. @@ -6036,12 +6053,15 @@ def traverse_obj(obj, *paths, **kwargs): @param _traverse_string Whether to traverse into objects as strings. If `True`, any non-compatible object will first be converted into a string and then traversed into. + The return value of that path will be a string instead, + not respecting any further branching. @returns The result of the object traversal. If successful, `get_all=True`, and the path branches at least once, then a list of results is returned instead. A list is always returned if the last path branches and no `default` is given. 
+ If a path ends on a `dict` that result will always be a `dict`. """ # parameter defaults @@ -6055,7 +6075,6 @@ def traverse_obj(obj, *paths, **kwargs): # instant compat str = compat_str - is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k if isinstance(expected_type, type): @@ -6063,128 +6082,180 @@ def traverse_obj(obj, *paths, **kwargs): else: type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + def lookup_or_none(v, k, getter=None): + try: + return getter(v, k) if getter else v[k] + except IndexError: + return None + def from_iterable(iterables): # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F for it in iterables: for item in it: yield item - def apply_key(key, obj): - if obj is None: - return + def apply_key(key, obj, is_last): + branching = False + + if obj is None and _traverse_string: + if key is Ellipsis or callable(key) or isinstance(key, slice): + branching = True + result = () + else: + result = None elif key is None: - yield obj + result = obj + + elif isinstance(key, set): + assert len(key) == 1, 'Set should only be used to wrap a single item' + item = next(iter(key)) + if isinstance(item, type): + result = obj if isinstance(obj, item) else None + else: + result = try_call(item, args=(obj,)) elif isinstance(key, (list, tuple)): - for branch in key: - _, result = apply_path(obj, branch) - for item in result: - yield item + branching = True + result = from_iterable( + apply_path(obj, branch, is_last)[0] for branch in key) elif key is Ellipsis: - result = [] + branching = True if isinstance(obj, compat_collections_abc.Mapping): result = obj.values() - elif is_sequence(obj): + elif is_iterable_like(obj): result = obj elif isinstance(obj, compat_re_Match): result = obj.groups() elif _traverse_string: + branching = False result = str(obj) - for item in result: - yield item + else: + result = () elif 
callable(key): - if is_sequence(obj): - iter_obj = enumerate(obj) - elif isinstance(obj, compat_collections_abc.Mapping): + branching = True + if isinstance(obj, compat_collections_abc.Mapping): iter_obj = obj.items() + elif is_iterable_like(obj): + iter_obj = enumerate(obj) elif isinstance(obj, compat_re_Match): - iter_obj = enumerate(itertools.chain([obj.group()], obj.groups())) + iter_obj = itertools.chain( + enumerate(itertools.chain((obj.group(),), obj.groups())), + obj.groupdict().items()) elif _traverse_string: + branching = False iter_obj = enumerate(str(obj)) else: - return - for item in (v for k, v in iter_obj if try_call(key, args=(k, v))): - yield item + iter_obj = () + + result = (v for k, v in iter_obj if try_call(key, args=(k, v))) + if not branching: # string traversal + result = ''.join(result) elif isinstance(key, dict): - iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) - yield dict((k, v if v is not None else default) for k, v in iter_obj - if v is not None or default is not NO_DEFAULT) + iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) + result = dict((k, v if v is not None else default) for k, v in iter_obj + if v is not None or default is not NO_DEFAULT) or None elif isinstance(obj, compat_collections_abc.Mapping): - yield (obj.get(key) if casesense or (key in obj) - else next((v for k, v in obj.items() if casefold(k) == key), None)) + result = (try_call(obj.get, args=(key,)) + if casesense or try_call(obj.__contains__, args=(key,)) + else next((v for k, v in obj.items() if casefold(k) == key), None)) elif isinstance(obj, compat_re_Match): + result = None if isinstance(key, int) or casesense: - try: - yield obj.group(key) - return - except IndexError: - pass - if not isinstance(key, str): - return + result = lookup_or_none(obj, key, getter=compat_re_Match.group) - yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + elif isinstance(key, str): + result = next((v for k, 
v in obj.groupdict().items() + if casefold(k) == key), None) else: - if _is_user_input: - key = (int_or_none(key) if ':' not in key - else slice(*map(int_or_none, key.split(':')))) + result = None + if isinstance(key, (int, slice)): + if is_iterable_like(obj, compat_collections_abc.Sequence): + branching = isinstance(key, slice) + result = lookup_or_none(obj, key) + elif _traverse_string: + result = lookup_or_none(str(obj), key) - if not isinstance(key, (int, slice)): - return + return branching, result if branching else (result,) - if not is_sequence(obj): - if not _traverse_string: - return - obj = str(obj) + def lazy_last(iterable): + iterator = iter(iterable) + prev = next(iterator, NO_DEFAULT) + if prev is NO_DEFAULT: + return - try: - yield obj[key] - except IndexError: - pass + for item in iterator: + yield False, prev + prev = item - def apply_path(start_obj, path): + yield True, prev + + def apply_path(start_obj, path, test_type): objs = (start_obj,) has_branched = False - for key in variadic(path): - if _is_user_input and key == ':': - key = Ellipsis + key = None + for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): + if _is_user_input and isinstance(key, str): + if key == ':': + key = Ellipsis + elif ':' in key: + key = slice(*map(int_or_none, key.split(':'))) + elif int_or_none(key) is not None: + key = int(key) if not casesense and isinstance(key, str): key = compat_casefold(key) - if key is Ellipsis or isinstance(key, (list, tuple)) or callable(key): - has_branched = True + if __debug__ and callable(key): + # Verify function signature + inspect.getcallargs(key, None, None) - key_func = functools.partial(apply_key, key) - objs = from_iterable(map(key_func, objs)) + new_objs = [] + for obj in objs: + branching, results = apply_key(key, obj, last) + has_branched |= branching + new_objs.append(results) - return has_branched, objs + objs = from_iterable(new_objs) - def _traverse_obj(obj, path, use_list=True): - has_branched, results = 
apply_path(obj, path) - results = LazyList(x for x in map(type_test, results) if x is not None) + if test_type and not isinstance(key, (dict, list, tuple)): + objs = map(type_test, objs) + + return objs, has_branched, isinstance(key, dict) + + def _traverse_obj(obj, path, allow_empty, test_type): + results, has_branched, is_dict = apply_path(obj, path, test_type) + results = LazyList(x for x in results if x not in (None, {})) if get_all and has_branched: - return results.exhaust() if results or use_list else None + if results: + return results.exhaust() + if allow_empty: + return [] if default is NO_DEFAULT else default + return None - return results[0] if results else None + return results[0] if results else {} if allow_empty and is_dict else None for index, path in enumerate(paths, 1): - use_list = default is NO_DEFAULT and index == len(paths) - result = _traverse_obj(obj, path, use_list) + result = _traverse_obj(obj, path, index == len(paths), True) if result is not None: return result return None if default is NO_DEFAULT else default +def T(x): + """ For use in yt-dl instead of {type} or set((type,)) """ + return set((x,)) + + def get_first(obj, keys, **kwargs): return traverse_obj(obj, (Ellipsis,) + tuple(variadic(keys)), get_all=False, **kwargs) From d5ef405c5d533c85cebd205a5b7958614c7013f3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 7 Jul 2023 18:45:31 +0100 Subject: [PATCH 1507/1705] [core] Align error reporting methods with yt-dlp --- test/helper.py | 3 ++- test/test_YoutubeDL.py | 10 ++-------- youtube_dl/YoutubeDL.py | 39 ++++++++++++++++++++++++++++++++------- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/test/helper.py b/test/helper.py index 883b2e877..e3314b03e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -72,7 +72,8 @@ class FakeYDL(YoutubeDL): def to_screen(self, s, skip_eol=None): print(s) - def trouble(self, s, tb=None): + def trouble(self, *args, **kwargs): + s = args[0] if len(args) > 0 else kwargs.get('message', 
'Missing message') raise Exception(s) def download(self, x): diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8c8e619c..60780b8a7 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -930,17 +930,11 @@ class TestYoutubeDL(unittest.TestCase): # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): - class _YDL(YDL): - def __init__(self, *args, **kwargs): - super(_YDL, self).__init__(*args, **kwargs) - - def trouble(self, s, tb=None): - pass - - ydl = _YDL({ + ydl = YDL({ 'format': 'extra', 'ignoreerrors': True, }) + ydl.trouble = lambda *_, **__: None class VideoIE(InfoExtractor): _VALID_URL = r'video:(?P\d+)' diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4e7fd1063..1435754c2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -582,7 +582,7 @@ class YoutubeDL(object): if self.params.get('cookiefile') is not None: self.cookiejar.save(ignore_discard=True, ignore_expires=True) - def trouble(self, message=None, tb=None): + def trouble(self, *args, **kwargs): """Determine action to take when a download problem appears. Depending on if the downloader has been configured to ignore @@ -591,6 +591,11 @@ class YoutubeDL(object): tb, if given, is additional traceback information. 
""" + # message=None, tb=None, is_error=True + message = args[0] if len(args) > 0 else kwargs.get('message', None) + tb = args[1] if len(args) > 1 else kwargs.get('tb', None) + is_error = args[2] if len(args) > 2 else kwargs.get('is_error', True) + if message is not None: self.to_stderr(message) if self.params.get('verbose'): @@ -603,7 +608,10 @@ class YoutubeDL(object): else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) - self.to_stderr(tb) + if tb: + self.to_stderr(tb) + if not is_error: + return if not self.params.get('ignoreerrors', False): if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: exc_info = sys.exc_info()[1].exc_info @@ -612,11 +620,18 @@ class YoutubeDL(object): raise DownloadError(message, exc_info) self._download_retcode = 1 - def report_warning(self, message): + def report_warning(self, message, only_once=False, _cache={}): ''' Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' + if only_once: + m_hash = hash((self, message)) + m_cnt = _cache.setdefault(m_hash, 0) + _cache[m_hash] = m_cnt + 1 + if m_cnt > 0: + return + if self.params.get('logger') is not None: self.params['logger'].warning(message) else: @@ -629,7 +644,7 @@ class YoutubeDL(object): warning_message = '%s %s' % (_msg_header, message) self.to_stderr(warning_message) - def report_error(self, message, tb=None): + def report_error(self, message, *args, **kwargs): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. 
@@ -638,8 +653,18 @@ class YoutubeDL(object): _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' - error_message = '%s %s' % (_msg_header, message) - self.trouble(error_message, tb) + kwargs['message'] = '%s %s' % (_msg_header, message) + self.trouble(*args, **kwargs) + + def report_unscoped_cookies(self, *args, **kwargs): + # message=None, tb=False, is_error=False + if len(args) <= 2: + kwargs.setdefault('is_error', False) + if len(args) <= 0: + kwargs.setdefault( + 'message', + 'Unscoped cookies are not allowed: please specify some sort of scoping') + self.report_error(*args, **kwargs) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" @@ -835,7 +860,7 @@ class YoutubeDL(object): msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) + self.report_error(compat_str(e), tb=e.format_traceback()) except MaxDownloadsReached: raise except Exception as e: From 1720c04dc56fa0d2caa0a455b1acbd569347482e Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 17 Jul 2023 20:47:58 +0100 Subject: [PATCH 1508/1705] [test] Make skipped tests in test_execution work with Py 2.6 --- test/test_execution.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_execution.py b/test/test_execution.py index 35e7a5651..ae59e562a 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -24,21 +24,24 @@ except AttributeError: class TestExecution(unittest.TestCase): + def setUp(self): + self.module = 'youtube_dl' + if sys.version_info < (2, 7): + self.module += '.__main__' + def test_import(self): subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir) - @unittest.skipIf(sys.version_info < (2, 7), 'Python 2.6 doesn\'t support package execution') def test_module_exec(self): - 
subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, '-m', self.module, '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_main_exec(self): subprocess.check_call([sys.executable, os.path.normpath('youtube_dl/__main__.py'), '--version'], cwd=rootDir, stdout=_DEV_NULL) - @unittest.skipIf(sys.version_info < (2, 7), 'Python 2.6 doesn\'t support package execution') def test_cmdline_umlauts(self): os.environ['PYTHONIOENCODING'] = 'utf-8' p = subprocess.Popen( - [sys.executable, os.path.normpath('youtube_dl/__main__.py'), encodeArgument('ä'), '--version'], + [sys.executable, '-m', self.module, encodeArgument('ä'), '--version'], cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) _, stderr = p.communicate() self.assertFalse(stderr) From 648dc5304cb2476592ff142988b8c62675011fcc Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 7 Jul 2023 18:51:38 +0100 Subject: [PATCH 1509/1705] [compat] Add Request and HTTPClient compat for redirect * support `method` parameter of `Request.__init__` (Py 2 and old Py 3) * support `getcode` method of compat_http_client.HTTPResponse (Py 2) --- youtube_dl/compat.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2554fd1c3..cd11ba5aa 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,7 @@ import socket import struct import subprocess import sys +import types import xml.etree.ElementTree # naming convention @@ -55,6 +56,22 @@ try: except ImportError: # Python 2 import urllib2 as compat_urllib_request +# Also fix up lack of method arg in old Pythons +try: + _req = compat_urllib_request.Request + _req('http://127.0.0.1', method='GET') +except TypeError: + class _request(object): + def __new__(cls, url, *args, **kwargs): + method = kwargs.pop('method', None) + r = _req(url, *args, **kwargs) + if method: + r.get_method = types.MethodType(lambda _: 
method, r) + return r + + compat_urllib_request.Request = _request + + try: import urllib.error as compat_urllib_error except ImportError: # Python 2 @@ -79,6 +96,12 @@ try: except ImportError: # Python 2 import urllib as compat_urllib_response +try: + compat_urllib_response.addinfourl.status +except AttributeError: + # .getcode() is deprecated in Py 3. + compat_urllib_response.addinfourl.status = property(lambda self: self.getcode()) + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -2360,6 +2383,11 @@ try: import http.client as compat_http_client except ImportError: # Python 2 import httplib as compat_http_client +try: + compat_http_client.HTTPResponse.getcode +except AttributeError: + # Py < 3.1 + compat_http_client.HTTPResponse.getcode = lambda self: self.status try: from urllib.error import HTTPError as compat_HTTPError From 46fde7caeeab13a6277aab22a0e8a29e10c30cc3 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 7 Jun 2023 14:51:50 +0100 Subject: [PATCH 1510/1705] [core] Update redirect handling from yt-dlp * Thx coletdjnz: https://github.com/yt-dlp/yt-dlp/pull/7094 * add test that redirected `POST` loses its `Content-Type` --- test/test_http.py | 489 +++++++++++++++++++++++++++++++++++++++----- youtube_dl/utils.py | 74 ++++--- 2 files changed, 484 insertions(+), 79 deletions(-) diff --git a/test/test_http.py b/test/test_http.py index 487a9bc77..1a65df9e0 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,33 +8,160 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import gzip +import io +import ssl +import tempfile +import threading +import zlib + +# avoid deprecated alias assertRaisesRegexp +if hasattr(unittest.TestCase, 'assertRaisesRegex'): + unittest.TestCase.assertRaisesRegexp = unittest.TestCase.assertRaisesRegex + +try: + import brotli +except ImportError: + brotli = None +try: + from urllib.request import pathname2url +except ImportError: + from 
urllib import pathname2url + +from youtube_dl.compat import ( + compat_http_cookiejar_Cookie, + compat_http_server, + compat_str as str, + compat_urllib_error, + compat_urllib_HTTPError, + compat_urllib_parse, + compat_urllib_request, +) + +from youtube_dl.utils import ( + sanitized_Request, + urlencode_postdata, +) + from test.helper import ( + FakeYDL, FakeLogger, http_server_port, ) from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server, compat_urllib_request -import ssl -import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + # work-around old/new -style class inheritance + def super(self, meth_name, *args, **kwargs): + from types import MethodType + try: + super() + fn = lambda s, m, *a, **k: getattr(super(), m)(*a, **k) + except TypeError: + fn = lambda s, m, *a, **k: getattr(compat_http_server.BaseHTTPRequestHandler, m)(s, *a, **k) + self.super = MethodType(fn, self) + return self.super(meth_name, *args, **kwargs) + def log_message(self, format, *args): pass + def _headers(self): + payload = str(self.headers).encode('utf-8') + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _redirect(self): + self.send_response(int(self.path[len('/redirect_'):])) + self.send_header('Location', '/method') + self.send_header('Content-Length', '0') + self.end_headers() + + def _method(self, method, payload=None): + self.send_response(200) + self.send_header('Content-Length', str(len(payload or ''))) + self.send_header('Method', method) + self.end_headers() + if payload: + self.wfile.write(payload) + + def _status(self, status): + payload = '{0} NOT FOUND'.format(status).encode('utf-8') + self.send_response(int(status)) + self.send_header('Content-Type', 'text/html; charset=utf-8') + 
self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _read_data(self): + if 'Content-Length' in self.headers: + return self.rfile.read(int(self.headers['Content-Length'])) + + def _test_url(self, path, host='127.0.0.1', scheme='http', port=None): + return '{0}://{1}:{2}/{3}'.format( + scheme, host, + port if port is not None + else http_server_port(self.server), path) + + def do_POST(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('POST', data) + elif self.path.startswith('/headers'): + self._headers() + else: + self._status(404) + + def do_HEAD(self): + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('HEAD') + else: + self._status(404) + + def do_PUT(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('PUT', data) + else: + self._status(404) + def do_GET(self): + + def respond(payload=b'