From 0fa67c1d686c1c25b467906307cafefa885c4a80 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 08:55:51 +0100 Subject: [PATCH 001/860] [generic] Add support for VHX Embeds(#27546) --- youtube_dl/extractor/generic.py | 22 ++++++++++++++++++++-- youtube_dl/extractor/vimeo.py | 7 +++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6e46b2c0e..14c27c6da 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -67,7 +67,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -2193,7 +2196,18 @@ class GenericIE(InfoExtractor): # 'params': { # 'force_generic_extractor': True, # }, - # } + # }, + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, ] def report_following_redirect(self, new_url): @@ -2571,6 +2585,10 @@ class GenericIE(InfoExtractor): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) + vhx_url = VHXEmbedIE._extract_url(webpage) + if vhx_url: + return self.url_result(vhx_url, VHXEmbedIE.ie_key()) + vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4c55946f1..15cd06268 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1119,6 +1119,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return unescapeHTML(mobj.group(1)) if mobj else None + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -1127,5 +1133,6 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) + info['id'] = video_id self._vimeo_sort_formats(info['formats']) return info From aed617e311b95d771e10f48cc8dcde25d7816224 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 08:58:34 +0100 Subject: [PATCH 002/860] [amcnetworks] improve auth only video detection(closes #27548) --- youtube_dl/extractor/amcnetworks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 12b6de0bf..b8027bbca 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -80,7 +80,8 @@ class AMCNetworksIE(ThePlatformIE): title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - if properties.get('videoCategory') == 'TVE-Auth': + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( From e4749965412edf2c6d3938d4b8f1d3dbab61b0b5 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 27 Dec 2020 21:15:09 +0700 Subject: [PATCH 003/860] [youtube] Update invidious.snopyta.org (#22667) Co-authored-by: sofutru <54445344+sofutru@users.noreply.github.com> --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 289d9bab1..5ef58d730 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -321,7 +321,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| From f86b299d0ecdba3462d67247cf70cee3a2809a1f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 16:20:49 +0100 Subject: [PATCH 004/860] [telecinco] fix extraction --- youtube_dl/extractor/telecinco.py | 77 ++++++++----------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 9ba3da341..eecd6a5c9 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -5,14 +5,11 @@ import json import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ( clean_html, - determine_ext, int_or_none, str_or_none, try_get, - urljoin, ) @@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', + 'md5': '7ee56d665cfd241c0e6d80fd175068b0', 'info_dict': { 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', @@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor): }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', + 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', + 'md5': 'eddb50291df704ce23c74821b995bcac', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor): def _parse_content(self, content, url): video_id = content['dataMediaId'] - if content.get('dataCmsId') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) - config_url = urljoin(url, content['dataConfig']) config = self._download_json( - config_url, video_id, 'Downloading config JSON') + content['dataConfig'], video_id, 'Downloading config JSON') title = config['info']['title'] - - def mmc_url(mmc_type): - return re.sub( - r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, - config['services']['mmc']) - - duration = None - formats = [] - for mmc_type in ('flash', 'html5'): - mmc = self._download_json( - mmc_url(mmc_type), video_id, - 'Downloading %s mmc JSON' % mmc_type, fatal=False) - if not mmc: - continue - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }, fatal=False) or {} - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + stream = caronte['dls'][0]['stream'] + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'application/json;charset=UTF-8', + 'Origin': re.match(r'https?://[^/]+', url).group(0), + }) + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers=headers)['tokens']['1']['cdn'] + formats = self._extract_m3u8_formats( + stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) return { @@ -149,7 +112,7 @@ class TelecincoIE(InfoExtractor): 'title': title, 'formats': formats, 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, + 'duration': int_or_none(content.get('dataDuration')), } def _real_extract(self, url): From 4c7a4dbc4d07786734324b267a41b74ee7099a85 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 16:22:01 +0100 Subject: [PATCH 005/860] [mitele] fix free video extraction(#24624)(closes #25827)(closes #26757) --- youtube_dl/extractor/mitele.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index ad9da9612..b5937233b 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .telecinco import TelecincoIE from ..utils import ( int_or_none, parse_iso8601, - smuggle_url, ) -class MiTeleIE(InfoExtractor): +class MiTeleIE(TelecincoIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' @@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor): 'timestamp': 1471209401, 'upload_date': '20160814', }, - 'add_ie': ['Ooyala'], }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, @@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', webpage, 'Pre Player'), display_id)['prePlayer'] title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] + video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} info = content.get('info') or {} - return { - '_type': 'url_transparent', - # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), - 'id': video_id, + video_info.update({ 'title': title, 'description': info.get('synopsis'), 'series': content.get('title'), @@ -87,7 +79,7 @@ class MiTeleIE(InfoExtractor): 'episode': content.get('subtitle'), 'episode_number': int_or_none(info.get('episode_number')), 'duration': int_or_none(info.get('duration')), - 'thumbnail': video.get('dataPoster'), 'age_limit': int_or_none(info.get('rating')), 'timestamp': parse_iso8601(pre_player.get('publishedTime')), - } + }) + return video_info From 6f2eaaf73daef3ac0995cd7b51c677b003c04218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Dec 2020 22:57:50 +0700 Subject: [PATCH 006/860] [teachable] Improve embed detection (closes #26923) --- youtube_dl/extractor/teachable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 6f264bddc..2394f86d4 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _is_teachable(webpage): return 'teachableTracker.linker:autoLink' in webpage and re.search( - r']+href=["\']https?://process\.fs\.teachablecdn\.com', + r']+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) @staticmethod From 794771a164009ff94046c98d8a7d45f7706547af Mon Sep 17 00:00:00 2001 From: JamKage Date: Sun, 27 Dec 2020 17:36:21 +0000 Subject: [PATCH 007/860] [go] Added support for FXNetworks (#26826) Co-authored-by: James Kirrage closes #13972 closes #22467 closes #23754 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/fxnetworks.py | 77 ------------------------------ youtube_dl/extractor/go.py | 21 +++++++- 3 files changed, 19 insertions(+), 80 deletions(-) delete mode 100644 youtube_dl/extractor/fxnetworks.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index da472d58e..cf50b897b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -399,7 +399,6 @@ from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .fusion import FusionIE -from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e67426b..000000000 --- a/youtube_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'()', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 03cfba91f..0d731e90a 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -38,13 +38,17 @@ class GoIE(AdobePassIE): 'disneynow': { 'brand': '011', 'resource_id': 'Disney', - } + }, + 'fxnow.fxnetworks': { + 'brand': '025', + 'requestor_id': 'dtci', + }, } _VALID_URL = r'''(?x) https?:// (?: (?:(?P%s)\.)?go| - (?Pabc|freeform|disneynow) + (?Pabc|freeform|disneynow|fxnow\.fxnetworks) )\.com/ (?: (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| @@ -99,6 +103,19 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'info_dict': { + 'id': 'VDKA12782841', + 'ext': 'mp4', + 'title': 'First Look: Better Things - Season 2', + 'description': 'md5:fa73584a95761c605d9d54904e35b407', + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, From af93ecfd88d539cccea97f6cfc33b8cbe362a8ed Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 22:26:20 +0100 Subject: [PATCH 008/860] [toggle] add support for live.mewatch.sg (closes #27555) --- youtube_dl/extractor/toggle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 3b9b54759..270c84daa 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor): class MeWatchIE(InfoExtractor): IE_NAME = 'mewatch' - _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', 'info_dict': { @@ -220,6 +220,9 @@ class MeWatchIE(InfoExtractor): }, { 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', 'only_matching': True, + }, { + 'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759', + 'only_matching': True, }] def _real_extract(self, url): From c0071885987b3737d2c586133007c61ab513a477 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 Dec 2020 23:47:28 +0100 Subject: [PATCH 009/860] [zype] Add support for uplynk videos --- youtube_dl/extractor/zype.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 2e2e97a0c..5288f40d8 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -85,7 +85,13 @@ class ZypeIE(InfoExtractor): else: m3u8_url = self._search_regex( r'(["\'])(?P(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', - body, 'm3u8 url', group='url') + body, 'm3u8 url', group='url', default=None) + if not m3u8_url: + source = self._parse_json(self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, + 'source'), video_id, js_to_json) + if source.get('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( From f27224d57b6768569e1aedfaff326605bdb4f049 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 10:50:29 +0100 Subject: [PATCH 010/860] [piksel] import format extraction --- youtube_dl/extractor/nhk.py | 2 +- youtube_dl/extractor/piksel.py | 109 ++++++++++++++++++++++++--------- 2 files changed, 80 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index c5b406573..8a9331a79 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -90,7 +90,7 @@ class NhkVodIE(NhkBaseIE): _TESTS = [{ # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '256a1be14f48d960a7e61e2532d95ec3', + 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', 'info_dict': { 'id': 'a95j5iza', 'ext': 'mp4', diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index 88b6859b0..ecf56ff8f 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -6,16 +6,33 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, dict_get, + ExtractorError, int_or_none, - unescapeHTML, parse_iso8601, + try_get, + unescapeHTML, ) class PikselIE(InfoExtractor): - _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P[a-z0-9_]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?: + player\. + (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P[^/]+)/prefid/)?(?P[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -56,46 +73,41 @@ class PikselIE(InfoExtractor): if mobj: return mobj.group('url') + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + def _real_extract(self, url): - display_id = self._match_id(url) + ref_id, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-de-program-uuid=[\'"]([a-z0-9]+)', - webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' ], webpage, 'app token') - response = self._download_json( - 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, - video_id, query={ - 'v': video_id - })['response'] - failure = response.get('failure') - if failure: - raise ExtractorError(response['failure']['reason'], expected=True) - video_data = response['WsProgramResponse']['program']['asset'] + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) formats = [] - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - for asset_file in video_data.get('assetFiles', []): + def process_asset_file(asset_file): + if not asset_file: + return # TODO: extract rtmp formats http_url = asset_file.get('http_url') if not http_url: - continue + return tbr = None vbr = int_or_none(asset_file.get('videoBitrate'), 1024) abr = int_or_none(asset_file.get('audioBitrate'), 1024) @@ -118,6 +130,43 @@ class PikselIE(InfoExtractor): 'filesize': int_or_none(asset_file.get('filesize')), 'tbr': tbr, }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + transform_source=transform_source, fatal=False)) + self._sort_formats(formats) subtitles = {} From 782ea947b487d4ef2b7b11fa40a00a518b529fed Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 11:12:57 +0100 Subject: [PATCH 011/860] [brightcove] remove sonyliv specific code --- youtube_dl/extractor/brightcove.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 300d75458..65b44c099 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -534,14 +534,6 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) - if not formats: - # for sonyliv.com DRM protected videos - s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') - if s3_source_url: - formats.append({ - 'url': s3_source_url, - 'format_id': 'source', - }) errors = json_data.get('errors') if not formats and errors: From 64e419bd7386b2a16a3c2e7ac5da30427afe856d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 Dec 2020 18:19:30 +0100 Subject: [PATCH 012/860] [aparat] Fix extraction closes #22285 closes #22611 closes #23348 closes #24354 closes #24591 closes #24904 closes #25418 closes #26070 closes #26350 closes #26738 closes #27563 --- youtube_dl/extractor/aparat.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 883dcee7a..a9527e785 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + get_element_by_id, int_or_none, merge_dicts, mimetype2ext, @@ -39,23 +40,15 @@ class AparatIE(InfoExtractor): webpage = self._download_webpage(url, video_id, fatal=False) if not webpage: - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - options = self._parse_json( - self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', - webpage, 'options', group='value'), - video_id) - - player = options['plugins']['sabaPlayerPlugin'] + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) formats = [] - for sources in player['multiSRC']: + for sources in (options.get('multiSRC') or []): for item in sources: if not isinstance(item, dict): continue @@ -85,11 +78,12 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) From f1bc56c99bac05dccb01c1b68ef778eb08fbcb71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:11:48 +0700 Subject: [PATCH 013/860] [youtube:tab] Restore retry on browse requests (closes #27313, closes #27564) --- youtube_dl/extractor/youtube.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5ef58d730..77f128285 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_HTTPError, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -3009,10 +3010,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): if not continuation: break - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d' % page_num, - headers=headers, query=continuation, fatal=False) + count = 0 + retries = 3 + while count <= retries: + try: + # Downloading page may result in intermittent 5xx HTTP error + # that is usually worked around with a retry + browse = self._download_json( + 'https://www.youtube.com/browse_ajax', None, + 'Downloading page %d%s' + % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, query=continuation) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): + count += 1 + if count <= retries: + continue + raise if not browse: break response = try_get(browse, lambda x: x[1]['response'], dict) From 71febd1c52d6de89ff571d4c212846aaaafb33ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:19:43 +0700 Subject: [PATCH 014/860] [youtube:tab] Improve URL matching (closes #27559) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 77f128285..0044ed909 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1103,6 +1103,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -2730,6 +2734,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if YoutubeIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( 'channelId', webpage, 'channel id', default=None) From 1a95953867412bc7a785f21f6bff5145b2b13fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:29:34 +0700 Subject: [PATCH 015/860] [youtube] Improve yt initial data extraction (closes #27524) --- youtube_dl/extractor/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0044ed909..87bdc1677 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -280,6 +280,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' + _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta| Date: Tue, 29 Dec 2020 02:49:53 +0700 Subject: [PATCH 016/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 86b12b1c3..7e1bcb237 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors +* [youtube] Improve yt initial data extraction (#27524) +* [youtube:tab] Improve URL matching #27559) +* [youtube:tab] Restore retry on browse requests (#27313, #27564) +* [aparat] Fix extraction (#22285, #22611, #23348, #24354, #24591, #24904, + #25418, #26070, #26350, #26738, #27563) +- [brightcove] Remove sonyliv specific code +* [piksel] Improve format extraction ++ [zype] Add support for uplynk videos ++ [toggle] Add support for live.mewatch.sg (#27555) ++ [go] Add support for fxnow.fxnetworks.com (#13972, #22467, #23754, #26826) +* [teachable] Improve embed detection (#26923) +* [mitele] Fix free video extraction (#24624, #25827, #26757) +* [telecinco] Fix extraction +* [youtube] Update invidious.snopyta.org (#22667) +* [amcnetworks] Improve auth only video detection (#27548) ++ [generic] Add support for VHX Embeds (#27546) + + version 2020.12.26 Extractors From 479cc6d5a166dc2f250687616041c9f3b36c80b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Dec 2020 02:52:31 +0700 Subject: [PATCH 017/860] release 2020.12.29 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 52295e426..0d5a8e666 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.26 + [debug] youtube-dl version 2020.12.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 40a61bc80..88a6ea3fc 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index e71c3ad61..1d75d1a79 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b10168073..83f1f43cd 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.26 + [debug] youtube-dl version 2020.12.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index cba0b3394..d3e03f78b 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.26** +- [ ] I've verified that I'm running youtube-dl version **2020.12.29** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 7e1bcb237..2dad14949 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.29 Extractors * [youtube] Improve yt initial data extraction (#27524) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c54507e90..1406ba8b8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -317,7 +317,6 @@ - **Funk** - **Fusion** - **Fux** - - **FXNetworks** - **Gaia** - **GameInformer** - **GameSpot** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d1dcebd88..c7fb697c4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.26' +__version__ = '2020.12.29' From bcfe485e0172ff32c450bb7835cfae7fca7594ae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:05:23 +0100 Subject: [PATCH 018/860] [brightcove] raise ExtractorError for DRM protected videos(closes #23467)(closes #27568) --- youtube_dl/extractor/brightcove.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 65b44c099..6022076ac 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -471,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 + continue + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -535,11 +540,14 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) From 7acd042bbb555962f42fa4f0f236772194d2da64 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:06:58 +0100 Subject: [PATCH 019/860] [tenplay] fix format extraction(closes #26653) --- youtube_dl/extractor/tenplay.py | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index af325fea8..cd30d57f4 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + HEADRequest, parse_age_limit, parse_iso8601, - smuggle_url, + # smuggle_url, ) @@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor): 'uploader_id': '2199827728001', }, 'params': { - 'format': 'bestvideo', + # 'format': 'bestvideo', 'skip_download': True, } }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + _GEO_BYPASS = False + _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect' def _real_extract(self, url): content_id = self._match_id(url) @@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor): video = data.get('video') or {} metadata = data.get('metaData') or {} brightcove_id = video.get('videoId') or metadata['showContentVideoId'] - brightcove_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['AU']}) + # brightcove_url = smuggle_url( + # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + # {'geo_countries': ['AU']}) + m3u8_url = self._request_webpage(HEADRequest( + self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl() + if '10play-not-in-oz' in m3u8_url: + self.raise_geo_restricted(countries=['AU']) + formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4') + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': brightcove_url, - 'id': content_id, - 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + # '_type': 'url_transparent', + # 'url': brightcove_url, + 'formats': formats, + 'id': brightcove_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'], 'description': video.get('description'), 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), 'series': metadata.get('showName'), 'season': metadata.get('showContentSeason'), 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), - 'ie_key': 'BrightcoveNew', + 'thumbnail': video.get('poster'), + 'uploader_id': '2199827728001', + # 'ie_key': 'BrightcoveNew', } From c931c4b8ddb32371cddf48ea52d0c036a6a66240 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:09:10 +0100 Subject: [PATCH 020/860] [sevenplay] detect API errors --- youtube_dl/extractor/sevenplus.py | 32 ++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py index 84568ac69..240afc18f 100644 --- a/youtube_dl/extractor/sevenplus.py +++ b/youtube_dl/extractor/sevenplus.py @@ -4,8 +4,12 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, try_get, update_url_query, ) @@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE): def _real_extract(self, url): path, episode_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - 'https://videoservice.swm.digital/playback', episode_id, query={ - 'appId': '7plus', - 'deviceType': 'web', - 'platformType': 'web', - 'accountId': 5303576322001, - 'referenceId': 'ref:' + episode_id, - 'deliveryId': 'csai', - 'videoType': 'vod', - })['media'] + try: + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + raise for source in media.get('sources', {}): src = source.get('src') From 53528e1d2385494c72349f609907f0164d6f8431 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 14:10:46 +0100 Subject: [PATCH 021/860] [uktvplay] match new video URLs(closes #17909) --- youtube_dl/extractor/uktvplay.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py index 2137502a1..f28fd514d 100644 --- a/youtube_dl/extractor/uktvplay.py +++ b/youtube_dl/extractor/uktvplay.py @@ -5,10 +5,9 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' - _TEST = { + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' + _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', - 'md5': '', 'info_dict': { 'id': '2117008346001', 'ext': 'mp4', @@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Failed to download MPD manifest'] - } + }, { + 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', + 'only_matching': True, + }] + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' def _real_extract(self, url): From 9ee984fc760c9a3f4818055ff28ea886aadc58cb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 16:13:36 +0100 Subject: [PATCH 022/860] [aenetworks] add support for biography.com (closes #3863) --- youtube_dl/extractor/aenetworks.py | 97 ++++++++++++++++++++++-------- youtube_dl/extractor/extractors.py | 2 + 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 3d0cf1208..237012978 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -6,6 +6,7 @@ import re from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, update_url_query, urlencode_postdata, @@ -28,6 +29,7 @@ class AENetworksBaseIE(ThePlatformIE): 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), 'fyi.tv': ('FYI', 'fyi'), 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), } def _extract_aen_smil(self, smil_url, video_id, auth=None): @@ -54,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -67,6 +71,34 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' @@ -139,32 +171,7 @@ class AENetworksIE(AENetworksBaseIE): def _real_extract(self, url): domain, canonical = re.match(self._VALID_URL, url).groups() - requestor_id, brand = self._DOMAIN_MAP[domain] - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - canonical, query={'filter[canonical]': '/' + canonical})['results'][0] - title = result['title'] - video_id = result['id'] - media_url = result['publicUrl'] - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - info.update({ - 'title': title, - 'series': result.get('seriesName'), - 'season_number': int_or_none(result.get('tvSeasonNumber')), - 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), - }) - return info + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) class AENetworksListBaseIE(AENetworksBaseIE): @@ -294,3 +301,41 @@ class HistoryTopicIE(AENetworksBaseIE): return self.url_result( 'http://www.history.com/videos/' + display_id, AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = self._search_regex( + r']+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cf50b897b..20472f2f7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -33,6 +33,8 @@ from .aenetworks import ( AENetworksCollectionIE, AENetworksShowIE, HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, ) from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE From 5966095e65b5365e8e4d211ffca6ab50514d3c84 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 16:59:31 +0100 Subject: [PATCH 023/860] [aenetworks] fix HistoryPlayerIE tests --- youtube_dl/extractor/aenetworks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 237012978..8e4963131 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -306,6 +306,7 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() From ebdcf70b0d52fd7b4627b2435b3081c7f4f4ce4b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 17:15:13 +0100 Subject: [PATCH 024/860] [nbc] fix NBCSport VPlayer URL extraction(closes #16640) --- youtube_dl/extractor/nbc.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index ea5f5a315..9695a9616 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -158,7 +158,8 @@ class NBCIE(AdobePassIE): class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor): }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): iframe_m = re.search( - r']+src="(?Phttps?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) if iframe_m: return iframe_m.group('url') @@ -192,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https because its certificate is invalid - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P[0-9a-z-]+)' - _TEST = { + _TESTS = [{ + # iframe src 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', 'info_dict': { 'id': 'PHJSaFWbrTY9', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, } - } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From c706fbe9fe4a517ad43ca95393384e5f78870f82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 17:21:05 +0100 Subject: [PATCH 025/860] [nbc] Remove CSNNE extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/nbc.py | 27 --------------------------- 2 files changed, 28 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 20472f2f7..51e6a463a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -692,7 +692,6 @@ from .nba import ( NBAChannelIE, ) from .nbc import ( - CSNNEIE, NBCIE, NBCNewsIE, NBCOlympicsIE, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 9695a9616..0d77648c2 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -286,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE): } -class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' - - _TEST = { - 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', - 'info_dict': { - 'id': 'yvBLLUgQ8WU0', - 'ext': 'mp4', - 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', - 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', - 'timestamp': 1459369979, - 'upload_date': '20160330', - 'uploader': 'NBCU-SPORTS', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': self._html_search_meta('twitter:player:stream', webpage), - 'display_id': display_id, - } - - class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' From 9c1e164e0cd77331ea4f0b474b32fd06f84bad71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 29 Dec 2020 19:29:08 +0100 Subject: [PATCH 026/860] [YoutubeDL] Allow format filtering using audio language(#16209) --- README.md | 1 + youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ffa34493e..85fed6d3a 100644 --- a/README.md +++ b/README.md @@ -678,6 +678,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + - `language`: Language code Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6f477bc32..aaac149e9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1083,7 +1083,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?Pext|acodec|vcodec|container|protocol|format_id|language) \s*(?P!\s*)?(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ From 9dd674e1d20440564a3d25f33cd8785695e110f6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 09:22:30 +0100 Subject: [PATCH 027/860] [utils] accept only supported protocols in url_or_none --- test/test_utils.py | 5 +++++ youtube_dl/utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 925a21d34..d49d3239c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -554,6 +554,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8cefafd79..d5fb6fd24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3640,7 +3640,7 @@ def url_or_none(url): if not url or not isinstance(url, compat_str): return None url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def parse_duration(s): From f7e95fb2a0516f90edffe72d9911222d1ed1a2bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 09:24:37 +0100 Subject: [PATCH 028/860] [yandexvideo] fix extraction(closes #25000) --- youtube_dl/extractor/yandexvideo.py | 116 ++++++++++++++++++---------- 1 file changed, 76 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index 46529be05..36d01cc8e 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -13,26 +13,30 @@ class YandexVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=| + yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| frontend\.vh\.yandex\.ru/player/ ) - (?P[\da-f]+) + (?P(?:[\da-f]{32}|[\w-]{12})) ''' _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { - 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', - 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 0, - 'duration': 30, + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, }, }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, }, { 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', @@ -52,53 +56,85 @@ class YandexVideoIE(InfoExtractor): # DASH with DRM 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) content = self._download_json( - 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - })['content'] + # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + # video_id, query={ + # 'stream_options': 'hires', + # 'disable_trackings': 1, + # })['content'] + 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id.encode())['player']['content']['content'] - content_url = url_or_none(content.get('content_url')) or url_or_none( - content['streams'][0]['url']) - title = content.get('title') or content.get('computed_title') + title = content.get('title') or content['computed_title'] - ext = determine_ext(content_url) - - if ext == 'm3u8': - formats = self._extract_m3u8_formats( - content_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - elif ext == 'mpd': - formats = self._extract_mpd_formats( - content_url, video_id, mpd_id='dash') - else: - formats = [{'url': content_url}] + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) self._sort_formats(formats) - description = content.get('description') - thumbnail = content.get('thumbnail') timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) - duration = int_or_none(content.get('duration')) - series = content.get('program_title') - age_limit = int_or_none(content.get('restriction_age')) + season = content.get('season') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'age_limit': age_limit, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), 'formats': formats, } From bdd044e67b5d10736aa712e9be64beff0d47f490 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 13:30:11 +0100 Subject: [PATCH 029/860] [yandexvideo] use old api call as fallback --- youtube_dl/extractor/yandexvideo.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index 36d01cc8e..ab8c84c93 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + try_get, url_or_none, ) @@ -64,12 +65,7 @@ class YandexVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, - # video_id, query={ - # 'stream_options': 'hires', - # 'disable_trackings': 1, - # })['content'] + player = try_get((self._download_json( 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ player(content_id: "%s") { computed_title @@ -90,7 +86,15 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode())['player']['content']['content'] +}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = player['content'] title = content.get('title') or content['computed_title'] From bd18824c2a99d6d01b00edfa186b9fd227af255c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 13:43:56 +0100 Subject: [PATCH 030/860] [yandexdisk] fix extraction(closes #17861)(closes #27131) --- youtube_dl/extractor/yandexdisk.py | 138 +++++++++++++++++------------ 1 file changed, 81 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py index e8f6ae10f..21f37c192 100644 --- a/youtube_dl/extractor/yandexdisk.py +++ b/youtube_dl/extractor/yandexdisk.py @@ -1,19 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( determine_ext, + ExtractorError, float_or_none, int_or_none, - try_get, - urlencode_postdata, + mimetype2ext, + parse_iso8601, + urljoin, ) class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?:www\.)?yadi\.sk| + disk\.yandex\. + (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', @@ -25,94 +46,97 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', + 'timestamp': 1421396809, + 'upload_date': '20150116', 'view_count': int, }, }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) + try: + resource = self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources', + video_id, query={'public_key': url}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_description = self._parse_json( + e.cause.read().decode(), video_id)['description'] + raise ExtractorError(error_description, expected=True) + raise - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') + title = resource['name'] + public_url = resource.get('public_url') + if public_url: + video_id = self._match_id(public_url) - webpage = self._download_webpage(url, video_id) + self._set_cookie('yadi.sk', 'yandexuid', '0') - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s* Date: Wed, 30 Dec 2020 16:45:53 +0100 Subject: [PATCH 031/860] [yandexdisk] extract info from webpage the public API does not return metadata when download limit is reached --- youtube_dl/extractor/yandexdisk.py | 89 ++++++++++++++++-------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py index 21f37c192..6fcd8ee7e 100644 --- a/youtube_dl/extractor/yandexdisk.py +++ b/youtube_dl/extractor/yandexdisk.py @@ -2,24 +2,23 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( determine_ext, - ExtractorError, float_or_none, int_or_none, mimetype2ext, - parse_iso8601, + try_get, urljoin, ) class YandexDiskIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// - (?: - (?:www\.)?yadi\.sk| + (?P + yadi\.sk| disk\.yandex\. (?: az| @@ -38,7 +37,7 @@ class YandexDiskIE(InfoExtractor): _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', 'info_dict': { 'id': 'VdOeDou8eZs6Y', 'ext': 'mp4', @@ -46,10 +45,9 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', - 'timestamp': 1421396809, - 'upload_date': '20150116', 'view_count': int, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, @@ -59,51 +57,58 @@ class YandexDiskIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.match(self._VALID_URL, url).groups() - try: - resource = self._download_json( - 'https://cloud-api.yandex.net/v1/disk/public/resources', - video_id, query={'public_key': url}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_description = self._parse_json( - e.cause.read().decode(), video_id)['description'] - raise ExtractorError(error_description, expected=True) - raise + webpage = self._download_webpage(url, video_id) + store = self._parse_json(self._search_regex( + r']+id="store-prefetch"[^>]*>\s*({.+?})\s*', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] title = resource['name'] - public_url = resource.get('public_url') + meta = resource.get('meta') or {} + + public_url = meta.get('short_url') if public_url: video_id = self._match_id(public_url) - self._set_cookie('yadi.sk', 'yandexuid', '0') + source_url = (self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources/download', + video_id, query={'public_key': url}, fatal=False) or {}).get('href') + video_streams = resource.get('videoStreams') or {} + video_hash = resource.get('hash') or url + environment = store.get('environment') or {} + sk = environment.get('sk') + yandexuid = environment.get('yandexuid') + if sk and yandexuid and not (source_url and video_streams): + self._set_cookie(domain, 'yandexuid', yandexuid) - def call_api(action): - return (self._download_json( - urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ - 'hash': url, - # obtain sk if needed from call_api('check-auth') while - # the yandexuid cookie is set and sending an empty JSON object - 'sk': 'ya6b52f8c6b12abe91a66d22d3a31084b' - }).encode(), headers={ - 'Content-Type': 'text/plain', - }, fatal=False) or {}).get('data') or {} + def call_api(action): + return (self._download_json( + urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ + 'hash': video_hash, + 'sk': sk, + }).encode(), headers={ + 'Content-Type': 'text/plain', + }, fatal=False) or {}).get('data') or {} + if not source_url: + # TODO: figure out how to detect if download limit has + # been reached and then avoid unnecessary source format + # extraction requests + source_url = call_api('download-url').get('url') + if not video_streams: + video_streams = call_api('get-video-streams') formats = [] - source_url = resource.get('file') - if not source_url: - source_url = call_api('download-url').get('url') if source_url: formats.append({ 'url': source_url, 'format_id': 'source', - 'ext': determine_ext(title, mimetype2ext(resource.get('mime_type')) or 'mp4'), + 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), 'quality': 1, - 'filesize': int_or_none(resource.get('size')) + 'filesize': int_or_none(meta.get('size')) }) - video_streams = call_api('get-video-streams') for video in (video_streams.get('videos') or []): format_url = video.get('url') if not format_url: @@ -128,15 +133,15 @@ class YandexDiskIE(InfoExtractor): }) self._sort_formats(formats) - owner = resource.get('owner') or {} + uid = resource.get('uid') + display_name = try_get(store, lambda x: x['users'][uid]['displayName']) return { 'id': video_id, 'title': title, 'duration': float_or_none(video_streams.get('duration'), 1000), - 'uploader': owner.get('display_name'), - 'uploader_id': owner.get('uid'), - 'view_count': int_or_none(resource.get('views_count')), - 'timestamp': parse_iso8601(resource.get('created')), + 'uploader': display_name, + 'uploader_id': uid, + 'view_count': int_or_none(meta.get('views_counter')), 'formats': formats, } From 2e21b06ea2f7a1ee5e038bf4274e5a74a4f52c2c Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 30 Dec 2020 18:12:17 +0100 Subject: [PATCH 032/860] [vvvvid] add playlists support (#27574) closes #18130 --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/vvvvid.py | 65 +++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 51e6a463a..254206a08 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1425,7 +1425,10 @@ from .vshare import VShareIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE -from .vvvvid import VVVVIDIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .wakanim import WakanimIE diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 6906cd2ab..5b8ea3665 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -12,7 +12,8 @@ from ..utils import ( class VVVVIDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' % _VALID_URL_BASE _TESTS = [{ # video_type == 'video/vvvvid' 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', @@ -45,20 +46,26 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, - }) + }, fatal=fatal) if response['result'] == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, '/season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response['data']))[0] + lambda episode: episode.get('video_id') == vid, response))[0] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -156,3 +163,49 @@ class VVVVIDIE(InfoExtractor): 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), } + + +class VVVVIDShowIE(VVVVIDIE): + _VALID_URL = r'(?P%s(?P\d+)/(?P[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.vvvvid.it/show/156/psyco-pass', + 'info_dict': { + 'id': '156', + 'title': 'Psycho-Pass', + 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', + }, + 'playlist_count': 46, + }] + + def _real_extract(self, url): + base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, '/seasons/', show_title) + + show_infos = self._download_info( + show_id, '/info/', show_title, fatal=False) + + entries = [] + for season in response: + episodes = season.get('episodes') or [] + for episode in episodes: + season_id = str_or_none(episode.get('season_id')) + video_id = str_or_none(episode.get('video_id')) + if not (season_id and video_id): + continue + + video_url = '/'.join([base_url, season_id, video_id]) + + entries.append({ + '_type': 'url_transparent', + 'ie_key': VVVVIDIE.ie_key(), + 'url': video_url, + 'title': episode.get('title'), + 'thumbnail': episode.get('thumbnail'), + 'description': episode.get('description'), + 'season_number': int_or_none(episode.get('season_number')), + 'episode_number': int_or_none(episode.get('number')), + }) + return self.playlist_result( + entries, show_id, show_infos.get('title'), show_infos.get('description')) From 7b0f04ed1f72fd4e4b5b3e935e08a912857fa8c4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 Dec 2020 18:16:47 +0100 Subject: [PATCH 033/860] [vvvvid] imporove info extraction --- youtube_dl/extractor/vvvvid.py | 78 ++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 5b8ea3665..014a67e53 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -22,6 +22,16 @@ class VVVVIDIE(InfoExtractor): 'id': '489048', 'ext': 'mp4', 'title': 'Ping Pong', + 'duration': 239, + 'series': '"Perché dovrei guardarlo?" di Dario Moccia', + 'season_id': '437', + 'season_number': 1, + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -38,6 +48,9 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True }] _conn_id = None @@ -48,24 +61,34 @@ class VVVVIDIE(InfoExtractor): def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, }, fatal=fatal) - if response['result'] == 'error': + if not (response or fatal): + return + if response.get('result') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) return response['data'] + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_number': int_or_none(video_data.get('number')), + 'episode_id': str_or_none(video_data.get('id')), + } + def _real_extract(self, url): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, '/season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( lambda episode: episode.get('video_id') == vid, response))[0] + title = video_data['title'] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -148,25 +171,25 @@ class VVVVIDIE(InfoExtractor): 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) self._sort_formats(formats) - return { + info = self._extract_common_video_info(video_data) + info.update({ 'id': video_id, - 'title': video_data['title'], + 'title': title, 'formats': formats, - 'thumbnail': video_data.get('thumbnail'), 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, 'season_number': video_data.get('season_number'), - 'episode_id': str_or_none(video_data.get('id')), - 'episode_number': int_or_none(video_data.get('number')), - 'episode_title': video_data['title'], + 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), - } + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info class VVVVIDShowIE(VVVVIDIE): - _VALID_URL = r'(?P%s(?P\d+)/(?P[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE + _VALID_URL = r'(?P%s(?P\d+)(?:/(?P[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.vvvvid.it/show/156/psyco-pass', 'info_dict': { @@ -175,37 +198,40 @@ class VVVVIDShowIE(VVVVIDIE): 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', }, 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, }] def _real_extract(self, url): base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() - response = self._download_info( - show_id, '/seasons/', show_title) + seasons = self._download_info( + show_id, 'seasons/', show_title) - show_infos = self._download_info( - show_id, '/info/', show_title, fatal=False) + show_info = self._download_info( + show_id, 'info/', show_title, fatal=False) entries = [] - for season in response: + for season in (seasons or []): + season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): continue - - video_url = '/'.join([base_url, season_id, video_id]) - - entries.append({ - '_type': 'url_transparent', + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url', 'ie_key': VVVVIDIE.ie_key(), - 'url': video_url, + 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), - 'thumbnail': episode.get('thumbnail'), 'description': episode.get('description'), - 'season_number': int_or_none(episode.get('season_number')), - 'episode_number': int_or_none(episode.get('number')), + 'season_number': season_number, + 'season_id': season_id, }) + entries.append(info) + return self.playlist_result( - entries, show_id, show_infos.get('title'), show_infos.get('description')) + entries, show_id, show_info.get('title'), show_info.get('description')) From d1d0612160ed2d753cc57b30483776b7c1b03473 Mon Sep 17 00:00:00 2001 From: ozburo Date: Tue, 22 Dec 2020 06:12:14 -0600 Subject: [PATCH 034/860] [redditr] Extract all thumbnails --- youtube_dl/extractor/reddit.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 3b2abb262..2d1a1fd99 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, try_get, url_or_none, + unescapeHTML, ) @@ -118,11 +119,23 @@ class RedditRIE(InfoExtractor): else: age_limit = None + thumbnails = [] + images = try_get( + data, lambda x: x['preview']['images'][0]['resolutions']) or [] + for image in images: + url = url_or_none(unescapeHTML(image['url'])) + if url is not None: + thumbnails.append({ + 'url': url, + 'width': int_or_none(image['width']), + 'height': int_or_none(image['height']), + }) + return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), - 'thumbnail': url_or_none(data.get('thumbnail')), + 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), 'duration': int_or_none(try_get( From 4046ffe1e16a65196e113cb12aa2d935f7f17c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:06:54 +0700 Subject: [PATCH 035/860] [redditr] Fix review issues and extract source thumbnail (closes #27503) --- youtube_dl/extractor/reddit.py | 36 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 2d1a1fd99..222fa0172 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -8,8 +8,8 @@ from ..utils import ( int_or_none, float_or_none, try_get, - url_or_none, unescapeHTML, + url_or_none, ) @@ -57,7 +57,8 @@ class RedditRIE(InfoExtractor): 'id': 'zv89llsvexdz', 'ext': 'mp4', 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', @@ -120,16 +121,27 @@ class RedditRIE(InfoExtractor): age_limit = None thumbnails = [] - images = try_get( - data, lambda x: x['preview']['images'][0]['resolutions']) or [] - for image in images: - url = url_or_none(unescapeHTML(image['url'])) - if url is not None: - thumbnails.append({ - 'url': url, - 'width': int_or_none(image['width']), - 'height': int_or_none(image['height']), - }) + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) return { '_type': 'url_transparent', From 2a84694b1e8d91b271eb5bb74b9d63cbfcc58cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:14:33 +0700 Subject: [PATCH 036/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2dad14949..1045fef5e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +version + +Core +* [utils] Accept only supported protocols in url_or_none +* [YoutubeDL] Allow format filtering using audio language (#16209) + +Extractors ++ [redditr] Extract all thumbnails (#27503) +* [vvvvid] Improve info extraction ++ [vvvvid] Add support for playlists (#18130, #27574) ++ [yandexdisk] Extract info from webpage +* [yandexdisk] Fix extraction (#17861, #27131) +* [yandexvideo] Use old API call as fallback +* [yandexvideo] Fix extraction (#25000) +- [nbc] Remove CSNNE extractor +* [nbc] Fix NBCSport VPlayer URL extraction (#16640) ++ [aenetworks] Add support for biography.com (#3863) +* [uktvplay] Match new video URLs (#17909) +* [sevenplay] Detect API errors +* [tenplay] Fix format extraction (#26653) +* [brightcove] Raise error for DRM protected videos (#23467, #27568) + + version 2020.12.29 Extractors From 4066945919a3f51a01f551ade8957f4856c67e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Dec 2020 05:17:55 +0700 Subject: [PATCH 037/860] release 2020.12.31 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 +++- youtube_dl/version.py | 2 +- 8 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 0d5a8e666..d1a6ad1f6 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.29 + [debug] youtube-dl version 2020.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 88a6ea3fc..ded5beadf 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 1d75d1a79..0e06de8dc 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 83f1f43cd..dfbfe3701 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.29 + [debug] youtube-dl version 2020.12.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d3e03f78b..30061808d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.29** +- [ ] I've verified that I'm running youtube-dl version **2020.12.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1045fef5e..3782ad090 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.12.31 Core * [utils] Accept only supported protocols in url_or_none diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1406ba8b8..b1112f83b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -104,6 +104,7 @@ - **BilibiliAudioAlbum** - **BiliBiliPlayer** - **BioBioChileTV** + - **Biography** - **BIQLE** - **BitChute** - **BitChuteChannel** @@ -197,7 +198,6 @@ - **CrooksAndLiars** - **crunchyroll** - **crunchyroll:playlist** - - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTV** @@ -349,6 +349,7 @@ - **hgtv.com:show** - **HiDive** - **HistoricFilms** + - **history:player** - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** @@ -1088,6 +1089,7 @@ - **vube**: Vube.com - **VuClip** - **VVVVID** + - **VVVVIDShow** - **VyboryMos** - **Vzaar** - **Wakanim** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c7fb697c4..f795f0735 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.29' +__version__ = '2020.12.31' From 2f56caf0835786fa5af92da428ef90a530d90f26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 11:58:52 +0100 Subject: [PATCH 038/860] [yandexvideo] fix extraction for Python 3.4 --- youtube_dl/extractor/yandexvideo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py index ab8c84c93..6a166ec9b 100644 --- a/youtube_dl/extractor/yandexvideo.py +++ b/youtube_dl/extractor/yandexvideo.py @@ -66,7 +66,7 @@ class YandexVideoIE(InfoExtractor): video_id = self._match_id(url) player = try_get((self._download_json( - 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ player(content_id: "%s") { computed_title content_url @@ -86,7 +86,7 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) if not player or player.get('error'): player = self._download_json( 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, From c35bc8260665e071d5088f5be3bb37f7664ef06e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:04:35 +0100 Subject: [PATCH 039/860] [vvvvid] skip unplayable episodes and extract akamai formats(closes #27599) --- youtube_dl/extractor/vvvvid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 014a67e53..145805492 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -152,7 +152,6 @@ class VVVVIDIE(InfoExtractor): embed_code = ds(embed_code) video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') if video_type == 'video/kenc': kenc = self._download_json( 'https://www.vvvvid.it/kenc', video_id, query={ @@ -163,9 +162,7 @@ class VVVVIDIE(InfoExtractor): kenc_message = kenc.get('message') if kenc_message: embed_code += '?' + ds(kenc_message) - formats.extend(self._extract_m3u8_formats( - embed_code, video_id, 'mp4', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_akamai_formats(embed_code, video_id)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) @@ -217,6 +214,8 @@ class VVVVIDShowIE(VVVVIDIE): season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: + if episode.get('playable') is False: + continue season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): From 4d7d056909665f68e2aaa95fe42a000c287265b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 12:17:55 +0100 Subject: [PATCH 040/860] [sky] add support for Sports News articles and Brighcove videos(closes #13054) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sky.py | 99 +++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 254206a08..54cbbe9d2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1052,6 +1052,7 @@ from .skynewsarabia import ( from .sky import ( SkyNewsIE, SkySportsIE, + SkySportsNewsIE, ) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE diff --git a/youtube_dl/extractor/sky.py b/youtube_dl/extractor/sky.py index 681691004..ff2c977a0 100644 --- a/youtube_dl/extractor/sky.py +++ b/youtube_dl/extractor/sky.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( extract_attributes, @@ -11,36 +13,59 @@ from ..utils import ( class SkyBaseIE(InfoExtractor): - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_data = extract_attributes(self._search_regex( - r'(]+>)', - webpage, 'video data')) + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + _SDC_EL_REGEX = r'(?s)(]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)' - video_url = 'ooyala:%s' % video_data['data-video-id'] - if video_data.get('data-token-required') == 'true': - token_fetch_options = self._parse_json(video_data.get( - 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} - token_fetch_url = token_fetch_options.get('url') - if token_fetch_url: - embed_token = self._download_webpage(urljoin( - url, token_fetch_url), video_id, fatal=False) - if embed_token: - video_url = smuggle_url( - video_url, {'embed_token': embed_token.strip('"')}) + def _process_ooyala_element(self, webpage, sdc_el, url): + sdc = extract_attributes(sdc_el) + provider = sdc.get('data-provider') + if provider == 'ooyala': + video_id = sdc['data-sdc-video-id'] + video_url = 'ooyala:%s' % video_id + ie_key = 'Ooyala' + ooyala_el = self._search_regex( + r'(]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id, + webpage, 'video data', fatal=False) + if ooyala_el: + ooyala_attrs = extract_attributes(ooyala_el) or {} + if ooyala_attrs.get('data-token-required') == 'true': + token_fetch_url = (self._parse_json(ooyala_attrs.get( + 'data-token-fetch-options', '{}'), + video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): + IE_NAME = 'sky:sports' _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', @@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE): class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) From ab89a8678b77a53ef3ca701868a3acd5a4c300c1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 23:31:07 +0100 Subject: [PATCH 041/860] [arcpublishing] Add new extractor closes #2298 closes #9340 closes #17200 --- youtube_dl/extractor/arcpublishing.py | 173 +++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 19 +++ youtube_dl/extractor/washingtonpost.py | 101 +++------------ 4 files changed, 210 insertions(+), 84 deletions(-) create mode 100644 youtube_dl/extractor/arcpublishing.py diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..d1fb1a054 --- /dev/null +++ b/youtube_dl/extractor/arcpublishing.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P[a-z]+):(?P%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] + + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 54cbbe9d2..d9d3f4940 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -56,6 +56,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE from .ard import ( ARDBetaMediathekIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 14c27c6da..780971a92 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -127,6 +127,7 @@ from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .arcpublishing import ArcPublishingIE class GenericIE(InfoExtractor): @@ -2208,6 +2209,20 @@ class GenericIE(InfoExtractor): 'uploader': 'OTT Videos', }, }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, ] def report_following_redirect(self, new_url): @@ -2574,6 +2589,10 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') + arc_urls = ArcPublishingIE._extract_urls(webpage) + if arc_urls: + return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 625d0a1cc..8afb1af83 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -4,17 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - strip_jsonp, -) class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' - _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _TEST = { + _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', 'info_dict': { @@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Egypt finds belongings, debris from plane crash', 'description': 'md5:a17ceee432f215a5371388c1f680bd86', 'upload_date': '20160520', - 'uploader': 'Reuters', - 'timestamp': 1463778452, + 'timestamp': 1463775187, }, - } + }, { + 'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html', + 'only_matching': True, + }, { + 'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html', + 'only_matching': True, + }] @classmethod def _extract_urls(cls, webpage): @@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, - video_id, transform_source=strip_jsonp)[0]['contentConfig'] - title = video_data['title'] - - urls = [] - formats = [] - for s in video_data.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - urls.append(s_url) - video_type = s.get('type') - if video_type == 'smil': - continue - elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): - m3u8_formats = self._extract_m3u8_formats( - s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - for m3u8_format in m3u8_formats: - width = m3u8_format.get('width') - if not width: - continue - vbr = self._search_regex( - r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) - if vbr: - m3u8_format.update({ - 'vbr': int_or_none(vbr), - }) - formats.extend(m3u8_formats) - else: - width = int_or_none(s.get('width')) - vbr = int_or_none(s.get('bitrate')) - has_width = width != 0 - formats.append({ - 'format_id': ( - '%s-%d-%d' % (video_type, width, vbr) - if width - else video_type), - 'vbr': vbr if has_width else None, - 'width': width, - 'height': int_or_none(s.get('height')), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if has_width else 'none', - 'filesize': int_or_none(s.get('fileSize')), - 'url': s_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, - }) - source_media_url = video_data.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats( - formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('blurb'), - 'uploader': video_data.get('credits', {}).get('source'), - 'formats': formats, - 'duration': int_or_none(video_data.get('videoDuration'), 100), - 'timestamp': int_or_none( - video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), - } + return self.url_result( + 'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id) class WashingtonPostArticleIE(InfoExtractor): @@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Breaking Points: The Paper Mine', 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', - 'uploader': 'The Washington Post', - 'timestamp': 1395527908, - 'upload_date': '20140322', + 'timestamp': 1395440416, + 'upload_date': '20140321', }, }, { 'md5': '1fff6a689d8770966df78c8cb6c8c17c', @@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', 'duration': 2220, - 'timestamp': 1395528005, - 'upload_date': '20140322', - 'uploader': 'The Washington Post', + 'timestamp': 1395441819, + 'upload_date': '20140321', }, }], }, { @@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'ext': 'mp4', 'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.', 'upload_date': '20141230', - 'uploader': 'The Washington Post', - 'timestamp': 1419974765, + 'timestamp': 1419972442, 'title': 'Why black boxes don’t transmit data in real time', } }] From 40d66e07dfef72fbef761ac720b82eb77deb7398 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 Dec 2020 23:42:17 +0100 Subject: [PATCH 042/860] [arcpublishing] add missing staticmethod decorator --- youtube_dl/extractor/arcpublishing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py index d1fb1a054..ca6a6c4d8 100644 --- a/youtube_dl/extractor/arcpublishing.py +++ b/youtube_dl/extractor/arcpublishing.py @@ -73,6 +73,7 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] + @staticmethod def _extract_urls(webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview From 8c17afc4710cb39801bdae5027fe3bf641bf485d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 17:05:42 +0100 Subject: [PATCH 043/860] [acast] fix extraction(closes #21444)(closes #27612)(closes #27613) --- youtube_dl/extractor/acast.py | 116 ++++++++++++++++------------------ 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index b17c792d2..60378db1b 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -2,21 +2,47 @@ from __future__ import unicode_literals import re -import functools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, - float_or_none, int_or_none, - try_get, - unified_timestamp, - OnDemandPagedList, + parse_iso8601, ) -class ACastIE(InfoExtractor): +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': episode['url'], + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): IE_NAME = 'acast' _VALID_URL = r'''(?x) https?:// @@ -28,15 +54,15 @@ class ACastIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', 'timestamp': 1477346700, 'upload_date': '20161024', - 'duration': 2766.602563, + 'duration': 2766, 'creator': 'Anton Berg & Martin Johnson', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', @@ -45,7 +71,7 @@ class ACastIE(InfoExtractor): 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', 'only_matching': True, }, { 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', @@ -54,40 +80,14 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - s = self._download_json( - 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), - display_id) - media_url = s['url'] - if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): - episode_url = s.get('episodeUrl') - if episode_url: - display_id = episode_url - else: - channel, display_id = re.match(self._VALID_URL, s['link']).groups() - cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), - display_id)['result'] - e = cast_data['episode'] - title = e.get('name') or s['title'] - return { - 'id': compat_str(e['id']), - 'display_id': display_id, - 'url': media_url, - 'title': title, - 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), - 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), - 'duration': float_or_none(e.get('duration') or s.get('duration')), - 'filesize': int_or_none(e.get('contentLength')), - 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), - 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), - 'season_number': int_or_none(e.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(e.get('episodeNumber')), - } + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) -class ACastChannelIE(InfoExtractor): +class ACastChannelIE(ACastBaseIE): IE_NAME = 'acast:channel' _VALID_URL = r'''(?x) https?:// @@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor): 'info_dict': { 'id': '4efc5294-5385-4847-98bd-519799ce5786', 'title': 'Today in Focus', - 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', }, - 'playlist_mincount': 35, + 'playlist_mincount': 200, }, { 'url': 'http://play.acast.com/s/ft-banking-weekly', 'only_matching': True, }] - _API_BASE_URL = 'https://play.acast.com/api/' - _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _fetch_page(self, channel_slug, page): - casts = self._download_json( - self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), - channel_slug, note='Download page %d of channel data' % page) - for cast in casts: - yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) From d0d838638c8b49514c8bf093d6c76fea98019971 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 18:30:56 +0100 Subject: [PATCH 044/860] [stitcher] fix extraction(closes #20811)(closes #27606) --- youtube_dl/extractor/stitcher.py | 60 ++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 97d1ff681..b8b5711b1 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -4,25 +4,28 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, + clean_html, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, ) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20180126', + 'timestamp': 1516989316, }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + display_id, audio_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + resp = self._download_json( + 'https://api.prod.stitcher.com/episode/' + audio_id, + display_id or audio_id) + episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) + if not episode: + raise ExtractorError(resp['errors'][0]['message'], expected=True) - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] + title = episode['title'].strip() + audio_url = episode['audio_url'] - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + thumbnail = None + show_id = episode.get('show_id') + if show_id and episode.get('classic_id') != -1: + thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id return { 'id': audio_id, 'display_id': display_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': clean_html(episode.get('html_description') or episode.get('description')), + 'duration': int_or_none(episode.get('duration')), 'thumbnail': thumbnail, - 'formats': formats, + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_created')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), } From 75972e200d033429bf9d34793ad3ffc813042347 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 1 Jan 2021 20:21:59 +0100 Subject: [PATCH 045/860] [vvvvid] fix season metadata extraction(#18130) --- youtube_dl/extractor/vvvvid.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 145805492..f4cae7fe9 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -25,7 +25,6 @@ class VVVVIDIE(InfoExtractor): 'duration': 239, 'series': '"Perché dovrei guardarlo?" di Dario Moccia', 'season_id': '437', - 'season_number': 1, 'episode': 'Ping Pong', 'episode_number': 1, 'episode_id': '3334', @@ -75,7 +74,6 @@ class VVVVIDIE(InfoExtractor): def _extract_common_video_info(self, video_data): return { 'thumbnail': video_data.get('thumbnail'), - 'episode_number': int_or_none(video_data.get('number')), 'episode_id': str_or_none(video_data.get('id')), } @@ -145,6 +143,17 @@ class VVVVIDIE(InfoExtractor): return d + info = {} + + def metadata_from_url(r_url): + if not info and r_url: + mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) + if mobj: + info['episode_number'] = int(mobj.group(2)) + season_number = mobj.group(1) + if season_number: + info['season_number'] = int(season_number) + for quality in ('_sd', ''): embed_code = video_data.get('embed_info' + quality) if not embed_code: @@ -166,9 +175,12 @@ class VVVVIDIE(InfoExtractor): else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + metadata_from_url(embed_code) + self._sort_formats(formats) - info = self._extract_common_video_info(video_data) + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, @@ -176,7 +188,6 @@ class VVVVIDIE(InfoExtractor): 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, - 'season_number': video_data.get('season_number'), 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), @@ -211,7 +222,6 @@ class VVVVIDShowIE(VVVVIDIE): entries = [] for season in (seasons or []): - season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: if episode.get('playable') is False: @@ -227,7 +237,6 @@ class VVVVIDShowIE(VVVVIDIE): 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), - 'season_number': season_number, 'season_id': season_id, }) entries.append(info) From 2df93a0c4ada8eff399844ac4a249a743e0a980d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 05:01:45 +0700 Subject: [PATCH 046/860] [nrktv] Switch to playback endpoint mediaelement endpoint is no longer in use. --- youtube_dl/extractor/nrk.py | 273 ++++++++---------------------------- 1 file changed, 57 insertions(+), 216 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index b545f291b..871e4845c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,15 +6,11 @@ import random import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_age_limit, parse_duration, try_get, urljoin, @@ -63,60 +59,8 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('http://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query) - - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?\#&]+) - ''' - - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }] + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id @@ -178,6 +122,59 @@ class NRKIE(NRKBaseIE): 'formats': formats, } + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) @@ -187,7 +184,6 @@ class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -290,164 +286,9 @@ class NRKTVIE(NRKBaseIE): 'only_matching': True, }] - _api_host = None - - def _extract_from_mediaelement(self, video_id): - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - urls = [] - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url or asset_url in urls: - continue - urls.append(asset_url) - formats = self._extract_nrk_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - 'is_live': live, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url and media_url not in urls: - formats = self._extract_nrk_formats(media_url, video_id) - if formats: - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - 'is_live': live, - }] - - if not entries: - self._raise_error(data) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P\d{,2})e(?P\d{,2})\.', - r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, - } - - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) - def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_mediaelement(video_id) + return self._extract_from_playback(video_id) class NRKTVEpisodeIE(InfoExtractor): From 21a42e2588226b781a02124e26d709a9562c8fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:29:52 +0700 Subject: [PATCH 047/860] [nrk] Improve extraction (closes #27634, closes #27635) + Add support for mp3 formats * Generalize and delegate all item extractors to nrk, beware ie key breakages + Add support for podcasts + Generalize nrk shortcut form to support all kind of ids --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 248 ++++++++++++++++++++--------- 2 files changed, 172 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d9d3f4940..74743a449 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -790,6 +790,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKRadioPodkastIE, NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeasonIE, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 871e4845c..9621522d4 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -62,66 +62,6 @@ class NRKBaseIE(InfoExtractor): fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) - def _extract_from_playback(self, video_id): - path_templ = 'playback/%s/' + video_id - - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) - # known values for preferredCdn: akamai, iponly, minicdn and telenor - manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) - - if manifest.get('playability') == 'nonPlayable': - self._raise_error(manifest['nonPlayable']) - - playable = manifest['playable'] - - formats = [] - for asset in playable['assets']: - if not isinstance(asset, dict): - continue - if asset.get('encrypted'): - continue - format_url = url_or_none(asset.get('url')) - if not format_url: - continue - if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_nrk_formats(format_url, video_id)) - self._sort_formats(formats) - - data = call_playback_api('metadata') - - preplay = data['preplay'] - titles = preplay['titles'] - title = titles['title'] - alt_title = titles.get('subtitle') - - description = preplay.get('description') - duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) - - thumbnails = [] - for image in try_get( - preplay, lambda x: x['poster']['images'], list) or []: - if not isinstance(image, dict): - continue - image_url = url_or_none(image.get('url')) - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('pixelWidth')), - 'height': int_or_none(image.get('pixelHeight')), - }) - - return { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - } - class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -173,14 +113,97 @@ class NRKIE(NRKBaseIE): }, { 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + # episode + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, }] + def _extract_from_playback(self, video_id): + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) -class NRKTVIE(NRKBaseIE): +class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE @@ -288,7 +311,8 @@ class NRKTVIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_playback(video_id) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -359,8 +383,6 @@ class NRKTVSerieBaseIE(NRKBaseIE): nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue - if not re.match(NRKTVIE._EPISODE_RE, nrk_id): - continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries @@ -372,6 +394,10 @@ class NRKTVSerieBaseIE(NRKBaseIE): if embedded.get(asset_key): return asset_key + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + def _entries(self, data, display_id): for page_num in itertools.count(1): embedded = data.get('_embedded') or data @@ -405,7 +431,16 @@ class NRKTVSerieBaseIE(NRKBaseIE): class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?Ptv|radio)\.nrk\.no/ + (?Pserie|pod[ck]ast)/ + (?P[^/]+)/ + (?: + (?:sesong/)?(?P\d+)| + sesong/(?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { @@ -441,19 +476,34 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): # 180 entries, single page 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - domain, serie, season_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') display_id = '%s/%s' % (serie, season_id) data = self._call_api( - '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id @@ -463,7 +513,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?Pserie|pod[ck]ast)/(?P[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -523,23 +573,33 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://nrksuper.no/serie/labyrint', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, }] @classmethod def suitable(cls, url): return ( False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - site, series_id = re.match(self._VALID_URL, url).groups() + site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() is_radio = site == 'radio.nrk' domain = 'radio' if is_radio else 'tv' size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], @@ -554,12 +614,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): embedded_seasons = embedded.get('seasons') or [] if len(linked_seasons) > len(embedded_seasons): for season in linked_seasons: - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), + season_url, ie=NRKTVSeasonIE.ie_key(), video_title=season.get('title'))) else: for season in embedded_seasons: @@ -584,6 +646,38 @@ class NRKTVDirekteIE(NRKTVIE): }] +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?Pl_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + class NRKPlaylistBaseIE(InfoExtractor): def _extract_description(self, webpage): pass From 1f1d01d498fee8120bb3415b1b1a20e21259bbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:41:37 +0700 Subject: [PATCH 048/860] [nrk] Add more shortcut tests --- youtube_dl/extractor/nrk.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 9621522d4..61a7c9aad 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -117,18 +117,30 @@ class NRKIE(NRKBaseIE): # podcast 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, }, { # clip 'url': 'nrk:150533', 'only_matching': True, }, { - # episode + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program 'url': 'nrk:MDDP12000117', 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, }, { # direkte 'url': 'nrk:nrk1', 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, }] def _extract_from_playback(self, video_id): From 7b643d4cd0f5aa02149e29d8212acd42038da63b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:44:28 +0700 Subject: [PATCH 049/860] [nrk] Improve video id extraction --- youtube_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 61a7c9aad..5f12b0d9e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -151,6 +151,8 @@ class NRKIE(NRKBaseIE): # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -211,7 +213,7 @@ class NRKIE(NRKBaseIE): } def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).split('/')[-1] return self._extract_from_playback(video_id) From cabfd4b1f0354518068bbdf0718cd36497d5c8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:45:17 +0700 Subject: [PATCH 050/860] [nrk] Inline _extract_from_playback --- youtube_dl/extractor/nrk.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5f12b0d9e..520206534 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -143,7 +143,9 @@ class NRKIE(NRKBaseIE): 'only_matching': True, }] - def _extract_from_playback(self, video_id): + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] + path_templ = 'playback/%s/' + video_id def call_playback_api(item, query=None): @@ -212,10 +214,6 @@ class NRKIE(NRKBaseIE): 'formats': formats, } - def _real_extract(self, url): - video_id = self._match_id(url).split('/')[-1] - return self._extract_from_playback(video_id) - class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From a25d03d7cb52111fab906c2c180f9bdf2c52ed12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 06:51:21 +0700 Subject: [PATCH 051/860] [nrk] Fix age limit extraction --- youtube_dl/extractor/nrk.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 520206534..d023de7f7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -204,6 +204,9 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + age_limit = int_or_none(try_get( + data, lambda x: x['legalAge']['body']['rating']['code'])) + return { 'id': video_id, 'title': title, @@ -211,6 +214,7 @@ class NRKIE(NRKBaseIE): 'description': description, 'duration': duration, 'thumbnails': thumbnails, + 'age_limit': age_limit, 'formats': formats, } From 7dfd966848e17e3d438e5e49e1088b8b1444f9e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:02:13 +0700 Subject: [PATCH 052/860] [nrk] Extract subtitles --- youtube_dl/extractor/nrk.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index d023de7f7..bd96d9d14 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + str_or_none, try_get, urljoin, url_or_none, @@ -204,6 +205,21 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) @@ -216,6 +232,7 @@ class NRKIE(NRKBaseIE): 'thumbnails': thumbnails, 'age_limit': age_limit, 'formats': formats, + 'subtitles': subtitles, } From 85de33b04ef339005cea949576dda4b553296f50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:30:45 +0700 Subject: [PATCH 053/860] [nrk] Improve series metadata extraction --- youtube_dl/extractor/nrk.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index bd96d9d14..20a5d7673 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -223,7 +223,9 @@ class NRKIE(NRKBaseIE): age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) - return { + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { 'id': video_id, 'title': title, 'alt_title': alt_title, @@ -235,6 +237,27 @@ class NRKIE(NRKBaseIE): 'subtitles': subtitles, } + if is_series: + series = title + if alt_title: + title += ' - %s' % alt_title + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + episode = alt_title if is_series else None + episode_number = int_or_none(self._search_regex( + r'(\d+)\.\s+episode', episode or '', 'episode number', + default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From bc2f83b95e02c9a75e576109ca0bced8a6c67eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:31:03 +0700 Subject: [PATCH 054/860] [nrktv] Fix tests --- youtube_dl/extractor/nrk.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 20a5d7673..4fb7df959 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -267,7 +267,7 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { - 'id': 'MDDP12000117AA', + 'id': 'MDDP12000117', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', @@ -278,24 +278,25 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { - 'id': 'MUHH48000314AA', + 'id': 'MUHH48000314', 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, 'series': '20 spørsmål', - 'episode': '23.05.2014', + 'episode': '23. mai 2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { - 'id': 'MDFP15000514CA', + 'id': 'MDFP15000514', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', }, 'params': { 'skip_download': True, @@ -304,7 +305,7 @@ class NRKTVIE(InfoExtractor): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', @@ -317,22 +318,23 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { - 'id': 'KMTE50001317AA', + 'id': 'KMTE50001317', 'ext': 'mp4', - 'title': 'Anno 13:30', + 'title': 'Anno - 13. episode', 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', 'duration': 2340, 'series': 'Anno', - 'episode': '13:30', + 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, }, @@ -342,7 +344,7 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { - 'id': 'MUHH46000317AA', + 'id': 'MUHH46000317', 'ext': 'mp4', 'title': 'Nytt på Nytt 27.01.2017', 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', From b2d9fd9c9f7d4b9f6b6bebbbb8f7e799cb08d4b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:53:05 +0700 Subject: [PATCH 055/860] [nrk] Improve episode and season number extraction --- youtube_dl/extractor/nrk.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 4fb7df959..48387420c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -246,7 +246,9 @@ class NRKIE(NRKBaseIE): default=None)) episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( - r'(\d+)\.\s+episode', episode or '', 'episode number', + r'^(\d+)\.', episode or '', 'episode number', + default=None)) or int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) info.update({ 'title': title, @@ -374,19 +376,19 @@ class NRKTVIE(InfoExtractor): class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', 'info_dict': { - 'id': 'MUHH36005220BA', + 'id': 'MUHH36005220', 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, 'series': 'Hellums kro', - 'season_number': 1, + # 'season_number': 1, 'episode_number': 2, - 'episode': '2:6', + 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, }, 'params': { @@ -395,15 +397,15 @@ class NRKTVEpisodeIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { - 'id': 'MSUI14000816AA', + 'id': 'MSUI14000816', 'ext': 'mp4', - 'title': 'Backstage 8:30', + 'title': 'Backstage - 8. episode', 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', 'duration': 1320, 'series': 'Backstage', 'season_number': 1, 'episode_number': 8, - 'episode': '8:30', + 'episode': '8. episode', }, 'params': { 'skip_download': True, @@ -412,7 +414,7 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -424,10 +426,12 @@ class NRKTVEpisodeIE(InfoExtractor): assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ - '_type': 'url_transparent', + '_type': 'url', 'id': nrk_id, 'url': 'nrk:%s' % nrk_id, 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), }) return info From 1aaee908b9ba12fc363c65daa2ef7d1690472a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 07:57:12 +0700 Subject: [PATCH 056/860] [nrk] PEP 8 --- youtube_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 48387420c..2873d7938 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -247,9 +247,11 @@ class NRKIE(NRKBaseIE): episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( r'^(\d+)\.', episode or '', 'episode number', - default=None)) or int_or_none(self._search_regex( - r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) info.update({ 'title': title, 'series': series, From 5c6e84c0ff732559d8307e9e823b5a07b0bcc9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 08:31:26 +0700 Subject: [PATCH 057/860] [nrk] Improve series metadata extraction (closes #27473) --- youtube_dl/extractor/nrk.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 2873d7938..5d33355e7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -238,16 +238,29 @@ class NRKIE(NRKBaseIE): } if is_series: - series = title + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title if alt_title: title += ' - %s' % alt_title - season_number = int_or_none(self._search_regex( - r'Sesong\s+(\d+)', description or '', 'season number', - default=None)) - episode = alt_title if is_series else None - episode_number = int_or_none(self._search_regex( - r'^(\d+)\.', episode or '', 'episode number', - default=None)) + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) if not episode_number: episode_number = int_or_none(self._search_regex( r'\((\d+)\s*:\s*\d+\)', description or '', @@ -255,6 +268,7 @@ class NRKIE(NRKBaseIE): info.update({ 'title': title, 'series': series, + 'season_id': season_id, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, @@ -388,7 +402,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', 'duration': 1563.92, 'series': 'Hellums kro', - # 'season_number': 1, + 'season_number': 1, 'episode_number': 2, 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, From d5b8cf093cdcf3ba873dba9aad898a3af91429e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 08:58:05 +0700 Subject: [PATCH 058/860] [nrk] Fix age limit extraction --- youtube_dl/extractor/nrk.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5d33355e7..69178e157 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -220,8 +220,15 @@ class NRKIE(NRKBaseIE): 'url': sub_url, }) - age_limit = int_or_none(try_get( - data, lambda x: x['legalAge']['body']['rating']['code'])) + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + else: + age_limit = None is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' @@ -304,6 +311,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1741, 'series': '20 spørsmål', 'episode': '23. mai 2014', + 'age_limit': 0, }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', @@ -315,6 +323,7 @@ class NRKTVIE(InfoExtractor): 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -327,6 +336,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -340,6 +350,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'Ikke tilgjengelig utenfor Norge', @@ -355,6 +366,7 @@ class NRKTVIE(InfoExtractor): 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -369,6 +381,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1796, 'series': 'Nytt på nytt', 'episode': '27.01.2017', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -422,6 +435,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'season_number': 1, 'episode_number': 8, 'episode': '8. episode', + 'age_limit': 0, }, 'params': { 'skip_download': True, From f4afb9a6a8832c48888956d1e8722cb5bd36a78e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 09:10:40 +0700 Subject: [PATCH 059/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 3782ad090..4f5fbc7f8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors +* [nrk] Improve series metadata extraction (#27473) ++ [nrk] Extract subtitles +* [nrk] Fix age limit extraction +* [nrk] Improve video id extraction ++ [nrk] Add support for podcasts (#27634, #27635) +* [nrk] Generalize and delegate all item extractors to nrk ++ [nrk] Add support for mp3 formats +* [nrktv] Switch to playback endpoint +* [vvvvid] Fix season metadata extraction (#18130) +* [stitcher] Fix extraction (#20811, #27606) +* [acast] Fix extraction (#21444, #27612, #27613) ++ [arcpublishing] Add support for arcpublishing.com (#2298, #9340, #17200) ++ [sky] Add support for Sports News articles and Brighcove videos (#13054) ++ [vvvvid] Extract akamai formats +* [vvvvid] Skip unplayable episodes (#27599) +* [yandexvideo] Fix extraction for Python 3.4 + + version 2020.12.31 Core From 8e953dcbb10a1a42f4e12e4e132657cb0100a1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 09:12:47 +0700 Subject: [PATCH 060/860] release 2021.01.03 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 8 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d1a6ad1f6..ee2da644b 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.31 + [debug] youtube-dl version 2021.01.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index ded5beadf..7d535b6d1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 0e06de8dc..74674ce62 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index dfbfe3701..7b0402d41 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.12.31 + [debug] youtube-dl version 2021.01.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 30061808d..17ffff465 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.12.31** +- [ ] I've verified that I'm running youtube-dl version **2021.01.03** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4f5fbc7f8..9942ee578 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.01.03 Extractors * [nrk] Improve series metadata extraction (#27473) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b1112f83b..8178af5b6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -57,6 +57,7 @@ - **appletrailers** - **appletrailers:section** - **archive.org**: archive.org videos + - **ArcPublishing** - **ARD** - **ARD:mediathek** - **ARDBetaMediathek** @@ -610,6 +611,7 @@ - **Npr** - **NRK** - **NRKPlaylist** + - **NRKRadioPodkast** - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte @@ -813,12 +815,13 @@ - **ShowRoomLive** - **Sina** - **sky.it** + - **sky:news** + - **sky:sports** + - **sky:sports:news** - **skyacademy.it** - **SkylineWebcams** - - **SkyNews** - **skynewsarabia:article** - **skynewsarabia:video** - - **SkySports** - **Slideshare** - **SlidesLive** - **Slutload** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f795f0735..1588ee8e4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.12.31' +__version__ = '2021.01.03' From ac71fd5919302f0d42c0cd79e04522cab8ab0318 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 10:04:32 +0100 Subject: [PATCH 061/860] [stv] improve episode id extraction(closes #23083) --- youtube_dl/extractor/stv.py | 42 ++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/youtube_dl/extractor/stv.py +++ b/youtube_dl/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r']+id="__NEXT_DATA__"[^>]*>({.+?})', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, From 8522bcd97c4173407261a3fa0283dd7800c39601 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 12:12:06 +0100 Subject: [PATCH 062/860] [stitcher] Add support for shows and show metadata extraction(closes #20510) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/stitcher.py | 120 +++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 74743a449..d1e1e9a60 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1092,7 +1092,10 @@ from .spike import ( BellatorIE, ParamountNetworkIE, ) -from .stitcher import StitcherIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index b8b5711b1..3dd0d3b5f 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -1,19 +1,60 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, int_or_none, str_or_none, try_get, + url_or_none, ) -class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' +class StitcherBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' + + def _call_api(self, path, video_id, query): + resp = self._download_json( + 'https://api.prod.stitcher.com/' + path, + video_id, query=query) + error_massage = try_get(resp, lambda x: x['errors'][0]['message']) + if error_massage: + raise ExtractorError(error_massage, expected=True) + return resp['data'] + + def _extract_description(self, data): + return clean_html(data.get('html_description') or data.get('description')) + + def _extract_audio_url(self, episode): + return url_or_none(episode.get('audio_url') or episode.get('guid')) + + def _extract_show_info(self, show): + return { + 'thumbnail': show.get('image_base_url'), + 'series': show.get('title'), + } + + def _extract_episode(self, episode, audio_url, show_info): + info = { + 'id': compat_str(episode['id']), + 'display_id': episode.get('slug'), + 'title': episode['title'].strip(), + 'description': self._extract_description(episode), + 'duration': int_or_none(episode.get('duration')), + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_published')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + } + info.update(show_info) + return info + + +class StitcherIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': 'e9635098e0da10b21a0e2b85585530f6', @@ -24,8 +65,9 @@ class StitcherIE(InfoExtractor): 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20180126', - 'timestamp': 1516989316, + 'upload_date': '20151008', + 'timestamp': 1444285800, + 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -55,33 +97,47 @@ class StitcherIE(InfoExtractor): }] def _real_extract(self, url): - display_id, audio_id = re.match(self._VALID_URL, url).groups() + audio_id = self._match_id(url) + data = self._call_api( + 'shows/episodes', audio_id, {'episode_ids': audio_id}) + episode = data['episodes'][0] + audio_url = self._extract_audio_url(episode) + if not audio_url: + self.raise_login_required() + show = try_get(data, lambda x: x['shows'][0], dict) or {} + return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) - resp = self._download_json( - 'https://api.prod.stitcher.com/episode/' + audio_id, - display_id or audio_id) - episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) - if not episode: - raise ExtractorError(resp['errors'][0]['message'], expected=True) - title = episode['title'].strip() - audio_url = episode['audio_url'] +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] - thumbnail = None - show_id = episode.get('show_id') - if show_id and episode.get('classic_id') != -1: - thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': clean_html(episode.get('html_description') or episode.get('description')), - 'duration': int_or_none(episode.get('duration')), - 'thumbnail': thumbnail, - 'url': audio_url, - 'vcodec': 'none', - 'timestamp': int_or_none(episode.get('date_created')), - 'season_number': int_or_none(episode.get('season')), - 'season_id': str_or_none(episode.get('season_id')), - } + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) + + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) From 491ee7efe45c287cfb8b28e6d74290d844b8bbb8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 13:29:00 +0100 Subject: [PATCH 063/860] [twitter] try to use a Generic fallback for unknown twitter cards(closes #25982) --- youtube_dl/extractor/twitter.py | 52 +++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 4284487db..a35e1686c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -380,6 +381,14 @@ class TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, }] def _real_extract(self, url): @@ -462,7 +471,25 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + # amplify, promo_video_website, promo_video_convo, appplayer, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +515,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: From 19ec46863587758fa6de274df293ede09b10eeb3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 3 Jan 2021 13:37:24 +0100 Subject: [PATCH 064/860] [twitter] Add support for summary card(closes #25121) --- youtube_dl/extractor/twitter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a35e1686c..1190d721e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -488,6 +488,11 @@ class TwitterIE(TwitterBaseIE): 'url': get_binding_value('broadcast_url'), 'ie_key': TwitterBroadcastIE.ie_key(), }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) # amplify, promo_video_website, promo_video_convo, appplayer, ... else: is_amplify = card_name == 'amplify' From 0021a2b9a14626b1b56c77bec4d93e70a217d32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jan 2021 22:25:17 +0700 Subject: [PATCH 065/860] [nrktv] Add subtitles test --- youtube_dl/extractor/nrk.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 69178e157..cafb85616 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From 0889eb33e0d40d567be5b2f8431952a5517276fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 4 Jan 2021 00:17:51 +0700 Subject: [PATCH 066/860] [xfileshare] Add support for aparat.cam (closes #27651) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 48ef07ed1..cbd5d1cbb 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod From e88c9ef62a4a26cc77370b741a4244d298c7d45a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:51:55 +0100 Subject: [PATCH 067/860] [utils] add a function to clean podcast URLs --- test/test_utils.py | 5 +++++ youtube_dl/utils.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index d49d3239c..259c4763e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from youtube_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -1470,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5fb6fd24..8e4d144c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -5706,3 +5706,20 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random_date.month), day_field: str(random_date.day), } + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) From a563c97c5cddf55f8989ed7ea8314ef78e30107f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:53:32 +0100 Subject: [PATCH 068/860] [stitcher] clean podcast URLs --- youtube_dl/extractor/stitcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 3dd0d3b5f..822782507 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, + clean_podcast_url, ExtractorError, int_or_none, str_or_none, @@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor): 'title': episode['title'].strip(), 'description': self._extract_description(episode), 'duration': int_or_none(episode.get('duration')), - 'url': audio_url, + 'url': clean_podcast_url(audio_url), 'vcodec': 'none', 'timestamp': int_or_none(episode.get('date_published')), 'season_number': int_or_none(episode.get('season')), From 0e96b4b5ce79a5f5c990247f6260da40f7d71a7c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 00:53:58 +0100 Subject: [PATCH 069/860] [acast] clean podcast URLs --- youtube_dl/extractor/acast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 60378db1b..b9355a2c8 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + clean_podcast_url, int_or_none, parse_iso8601, ) @@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor): info = { 'id': episode['id'], 'display_id': episode.get('episodeUrl'), - 'url': episode['url'], + 'url': clean_podcast_url(episode['url']), 'title': title, 'description': clean_html(episode.get('description') or episode.get('summary')), 'thumbnail': episode.get('image'), From 9c484c0019ba16cd4b4d686930bfa038f1bf9320 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:00:31 +0100 Subject: [PATCH 070/860] [iheart] Add new extractor for iHeartRadio(#27037) --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/iheart.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/iheart.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d1e1e9a60..82d440a6d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -463,6 +463,10 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py new file mode 100644 index 000000000..6710baeb4 --- /dev/null +++ b/youtube_dl/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + str_or_none, +) + + +class IHeartRadioBaseIE(InfoExtractor): + def _call_api(self, path, video_id, fatal=True, query=None): + return self._download_json( + 'https://api.iheart.com/api/v3/podcast/' + path, + video_id, fatal=fatal, query=query) + + def _extract_episode(self, episode): + return { + 'thumbnail': episode.get('imageUrl'), + 'description': episode.get('description'), + 'timestamp': int_or_none(episode.get('startDate'), 1000), + 'duration': int_or_none(episode.get('duration')), + } + + +class IHeartRadioIE(IHeartRadioBaseIE): + IENAME = 'iheartradio' + _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P[^/?&#]+)-|iheartradio:)(?P\d+)' + _TEST = { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'timestamp': 1597741200, + 'upload_date': '20200818', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api( + 'episodes/' + episode_id, episode_id)['episode'] + info = self._extract_episode(episode) + print(episode['mediaUrl']) + info.update({ + 'id': episode_id, + 'title': episode['title'], + 'url': clean_podcast_url(episode['mediaUrl']), + }) + return info + + +class IHeartRadioPodcastIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio:podcast' + _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P\d+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'id': '30717896', + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277', + 'only_matching': True, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + path = 'podcasts/' + podcast_id + episodes = self._call_api( + path + '/episodes', podcast_id, query={'limit': 1000000000})['data'] + + entries = [] + for episode in episodes: + episode_id = str_or_none(episode.get('id')) + if not episode_id: + continue + info = self._extract_episode(episode) + info.update({ + '_type': 'url', + 'id': episode_id, + 'title': episode.get('title'), + 'url': 'iheartradio:' + episode_id, + 'ie_key': IHeartRadioIE.ie_key(), + }) + entries.append(info) + + podcast = self._call_api(path, podcast_id, False) or {} + + return self.playlist_result( + entries, podcast_id, podcast.get('title'), podcast.get('description')) From 8487e8b98afd1b469c2b9d29ee53bd173ff9a7e0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:04:20 +0100 Subject: [PATCH 071/860] [googlepodcasts] Add new extractor --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/googlepodcasts.py | 88 ++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/googlepodcasts.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 82d440a6d..3668197b1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -423,6 +423,10 @@ from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE diff --git a/youtube_dl/extractor/googlepodcasts.py b/youtube_dl/extractor/googlepodcasts.py new file mode 100644 index 000000000..31ad79907 --- /dev/null +++ b/youtube_dl/extractor/googlepodcasts.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/]+)/episode/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) From ac61f2e0581ad15727870e8dd9a80ddacf01636e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:09:11 +0100 Subject: [PATCH 072/860] [applepodcasts] Add new extractor(#25918) --- youtube_dl/extractor/applepodcasts.py | 61 +++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/applepodcasts.py diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/youtube_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3668197b1..dbc8f89a6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -55,6 +55,7 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg import ArchiveOrgIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE From 964a8eb754dd082069419a2fda1ecedea62b7cc5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 01:13:53 +0100 Subject: [PATCH 073/860] [googleplus] Remove Extractor(closes #4955)(closes #7400) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/googleplus.py | 73 ------------------------------ 2 files changed, 74 deletions(-) delete mode 100644 youtube_dl/extractor/googleplus.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dbc8f89a6..a82f6e5f0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -423,7 +423,6 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE from .googlepodcasts import ( GooglePodcastsIE, GooglePodcastsFeedIE, diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/youtube_dl/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x) - ([0-9]{4}-[0-9]{2}-[0-9]{2})''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r' Date: Mon, 4 Jan 2021 09:42:27 +0100 Subject: [PATCH 074/860] [iheart] remove print statement --- youtube_dl/extractor/iheart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 6710baeb4..7a7295ff4 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -45,7 +45,6 @@ class IHeartRadioIE(IHeartRadioBaseIE): episode = self._call_api( 'episodes/' + episode_id, episode_id)['episode'] info = self._extract_episode(episode) - print(episode['mediaUrl']) info.update({ 'id': episode_id, 'title': episode['title'], From 745db8899d77c56bf14443be60970aed1d9e2bdd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 09:59:20 +0100 Subject: [PATCH 075/860] [iheart] clean HTML tags from episode description --- youtube_dl/extractor/iheart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 7a7295ff4..266c67a76 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, int_or_none, str_or_none, @@ -18,7 +19,7 @@ class IHeartRadioBaseIE(InfoExtractor): def _extract_episode(self, episode): return { 'thumbnail': episode.get('imageUrl'), - 'description': episode.get('description'), + 'description': clean_html(episode.get('description')), 'timestamp': int_or_none(episode.get('startDate'), 1000), 'duration': int_or_none(episode.get('duration')), } From 8a3797a4abdc0b63b6fcbd1fbc9d81acc57ec142 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 4 Jan 2021 12:16:54 +0100 Subject: [PATCH 076/860] [nrk] fix extraction for videos without a legalAge rating --- youtube_dl/extractor/nrk.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cafb85616..40dee2162 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -223,12 +223,12 @@ class NRKIE(NRKBaseIE): legal_age = try_get( data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - else: - age_limit = None + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' From f6ea29e24bf67d13d21de3c623975891a8d61ac1 Mon Sep 17 00:00:00 2001 From: Yurii H Date: Mon, 4 Jan 2021 18:16:27 +0200 Subject: [PATCH 077/860] [iheart] Update test description value (#27037) the description has no HTML tags now. --- youtube_dl/extractor/iheart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iheart.py b/youtube_dl/extractor/iheart.py index 266c67a76..b54c05eeb 100644 --- a/youtube_dl/extractor/iheart.py +++ b/youtube_dl/extractor/iheart.py @@ -35,7 +35,7 @@ class IHeartRadioIE(IHeartRadioBaseIE): 'id': '70346499', 'ext': 'mp3', 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', - 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', 'timestamp': 1597741200, 'upload_date': '20200818', } From 766fcdd0fad8495222b2b5a14f1626960cd79d89 Mon Sep 17 00:00:00 2001 From: Kevin O'Connor Date: Mon, 4 Jan 2021 13:24:01 -0500 Subject: [PATCH 078/860] [downloader/hls] Disable decryption in tests (#27660) Tests truncate the download to 10241 bytes, which is not divisible by 16 and cannot be decrypted. Tests don't really care about the decrypted content, just that the data they retrieved is the expected data. Therefore, it's fine to just return the encrypted data to tests. See: #27621 and #27620 --- youtube_dl/downloader/hls.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 5e1ff4f6b..7aaebc940 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -172,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: From 9237aaa77f7e2f0b9fc17c66589423632172f473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 01:34:28 +0700 Subject: [PATCH 079/860] [workflows/ci.yml] Add support for jython --- .github/workflows/ci.yml | 26 +++++++++++++++++++++++++- devscripts/install_jython.sh | 5 ----- 2 files changed, 25 insertions(+), 6 deletions(-) delete mode 100755 devscripts/install_jython.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99944e848..ac34196cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,28 +10,52 @@ jobs: os: [ubuntu-latest] # TODO: python 2.6 python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + python-impl: [cpython] ytdl-test-set: [core, download] run-tests-ext: [sh] include: # python 3.2 is only available on windows via setup-python - os: windows-latest python-version: 3.2 + python-impl: cpython ytdl-test-set: core run-tests-ext: bat - os: windows-latest python-version: 3.2 + python-impl: cpython ytdl-test-set: download run-tests-ext: bat + # jython + - os: ubuntu-latest + python-impl: jython + ytdl-test-set: core + run-tests-ext: sh + - os: ubuntu-latest + python-impl: jython + ytdl-test-set: download + run-tests-ext: sh steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 + if: ${{ matrix.python-impl == 'cpython' }} with: python-version: ${{ matrix.python-version }} + - name: Set up Java 8 + if: ${{ matrix.python-impl == 'jython' }} + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Install Jython + if: ${{ matrix.python-impl == 'jython' }} + run: | + wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + java -jar jython-installer.jar -s -d "$HOME/jython" + echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose run: pip install nose - name: Run tests - continue-on-error: ${{ matrix.ytdl-test-set == 'download' }} + continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh deleted file mode 100755 index bafca4da4..000000000 --- a/devscripts/install_jython.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython" -$HOME/jython/bin/jython -m pip install nose From c3399cac19a51897024545fb267f7579ea42318b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 02:44:27 +0700 Subject: [PATCH 080/860] [travis] Drop Travis CI (closes #7193, closes #12366) --- .travis.yml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d828d027d..000000000 --- a/.travis.yml +++ /dev/null @@ -1,50 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "3.6" - - "pypy" - - "pypy3" -dist: trusty -env: - - YTDL_TEST_SET=core -# - YTDL_TEST_SET=download -jobs: - include: - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.7 -# dist: xenial -# env: YTDL_TEST_SET=download - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.8 -# dist: xenial -# env: YTDL_TEST_SET=download - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=core -# - python: 3.8-dev -# dist: xenial -# env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core -# - env: JYTHON=true; YTDL_TEST_SET=download - - name: flake8 - python: 3.8 - dist: xenial - install: pip install flake8 - script: flake8 . - fast_finish: true - allow_failures: -# - env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core -# - env: JYTHON=true; YTDL_TEST_SET=download -before_install: - - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi -script: ./devscripts/run_tests.sh From f318882955b90bead8206ee411641e65037b1011 Mon Sep 17 00:00:00 2001 From: cladmi Date: Wed, 16 Dec 2020 09:54:48 +0100 Subject: [PATCH 081/860] [motherless] Fix recent videos upload date extraction (closes #27661) Less than a week old videos use a '20h ago' or '1d ago' format. I kept the support for 'Ago' with uppercase start at is was already in the code. --- youtube_dl/extractor/motherless.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index b1615b4d8..6cc36b308 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -85,18 +85,27 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', # 1,234,567 Views + r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') + relative = re.match(r'(\d+)([hd])$', upload_date) + if relative: + delta = int(relative.group(1)) + unit = relative.group(2) + if unit == 'h': + delta_t = datetime.timedelta(hours=delta) + else: # unit == 'd' + delta_t = datetime.timedelta(days=delta) + upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) From ecae54a98d2a8d9300142bf3d586f31e8144ccd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Jan 2021 07:40:06 +0700 Subject: [PATCH 082/860] [motherless] Fix review issues and improve extraction (closes #26495, closes #27450) --- youtube_dl/extractor/motherless.py | 52 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6cc36b308..ef1e081f2 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor): # no keywords 'url': 'http://motherless.com/8B4BBC1', 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Views<', # 1,234,567 Views - r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + (r'>([\d,.]+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago - r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - relative = re.match(r'(\d+)([hd])$', upload_date) - if relative: - delta = int(relative.group(1)) - unit = relative.group(2) - if unit == 'h': - delta_t = datetime.timedelta(hours=delta) - else: # unit == 'd' - delta_t = datetime.timedelta(days=delta) - upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') - else: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, + 'upload date', default=None)) + if not upload_date: + uploaded_ago = self._search_regex( + r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', + default=None) + if uploaded_ago: + delta = int(uploaded_ago[:-1]) + _AGO_UNITS = { + 'h': 'hours', + 'd': 'days', + } + kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} + upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( From be1a3f2d11b9d88c10b624b12b00b508d923983d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 17:31:19 +0100 Subject: [PATCH 083/860] [dplay] Add suport Discovery+ domains(closes #27680) --- youtube_dl/extractor/dplay.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a7b9db568..47501dbe6 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -17,7 +17,12 @@ from ..utils import ( class DPlayIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P - (?:www\.)?(?Pdplay\.(?Pdk|fi|jp|se|no))| + (?:www\.)?(?Pd + (?: + play\.(?Pdk|fi|jp|se|no)| + iscoveryplus\.(?Pdk|es|fi|it|se|no) + ) + )| (?Pes|it)\.dplay\.com )/[^/]+/(?P[^/]+/[^/?#]+)''' @@ -126,6 +131,24 @@ class DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] def _get_disco_api_info(self, url, display_id, disco_host, realm, country): @@ -241,7 +264,7 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) From 8f757c735306355e1743a444364b79b681dea661 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 18:17:04 +0100 Subject: [PATCH 084/860] [ketnet] fix extraction(closes #27662) --- youtube_dl/extractor/ketnet.py | 119 ++++++++++++++------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py index 93a98e1e0..e0599d02f 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/youtube_dl/extractor/ketnet.py @@ -2,92 +2,71 @@ from __future__ import unicode_literals from .canvas import CanvasIE from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_iso8601, +) class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': '6bdeb65998930251bbd1c510750edba9', + 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { - 'id': 'zomerse-filmpjes', + 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', 'ext': 'mp4', - 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', - 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'title': 'Nachtwacht - Reeks 3: Aflevering 1', + 'description': 'De Nachtwacht krijgt te maken met een parasiet', 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - # mzid in playerConfig instead of sources - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', - 'title': 'Nachtwacht: De Greystook', - 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.02, + 'timestamp': 1609225200, + 'upload_date': '20201229', + 'series': 'Nachtwacht', + 'season': 'Reeks 3', + 'episode': 'De Greystook', + 'episode_number': 1, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { - 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', - 'only_matching': True, - }, { - 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', - 'only_matching': True, - }, { - # mzsource, geo restricted to Belgium - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] - config = self._parse_json( - self._search_regex( - r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, - 'player config'), - video_id) - - mzid = config.get('mzid') - if mzid: - return self.url_result( - 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, - CanvasIE.ie_key(), video_id=mzid) - - title = config['title'] - - formats = [] - for source_key in ('', 'mz'): - source = config.get('%ssource' % source_key) - if not isinstance(source, dict): - continue - for format_id, format_url in source.items(): - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif format_id == 'hds': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) + mz_id = compat_urllib_parse_unquote(video['mediaReference']) return { - 'id': video_id, - 'title': title, - 'description': config.get('description'), - 'thumbnail': config.get('image'), - 'series': config.get('program'), - 'episode': config.get('episode'), - 'formats': formats, + '_type': 'url_transparent', + 'id': mz_id, + 'title': video['titleVideodetail'], + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, + 'thumbnail': video.get('imageUrl'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publicationDate')), + 'series': video.get('programTitle'), + 'season': video.get('seasonTitle'), + 'episode': video.get('subtitleVideodetail'), + 'episode_number': int_or_none(video.get('episodeNr')), + 'ie_key': CanvasIE.ie_key(), } From fcd90d258305cdafa5bd23c50443229205fcb9ed Mon Sep 17 00:00:00 2001 From: nixxo Date: Tue, 5 Jan 2021 19:49:56 +0100 Subject: [PATCH 085/860] [rai] Detect ContentItem in iframe (closes #12652) (#27673) Co-authored-by: Sergey M. --- youtube_dl/extractor/rai.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index ecb628f14..0a68d16b0 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -326,6 +326,19 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # ContentItem in iframe (see #12652) + 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', + 'info_dict': { + 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', + 'ext': 'mp4', + 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', + 'description': 'md5:d291b03407ec505f95f27970c0b025f4', + 'upload_date': '20150913', + }, + 'params': { + 'skip_download': True, + }, }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', @@ -403,7 +416,8 @@ class RaiIE(RaiBaseIE): r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)= + <(?:[^>]+\bdata-id|var\s+uniquename)=| + ]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P%s) From 5b24f8f505582f353c3c2f7b79b5f67612ba9c87 Mon Sep 17 00:00:00 2001 From: 23rd <23rd@vivaldi.net> Date: Sun, 3 Jan 2021 16:30:56 +0300 Subject: [PATCH 086/860] [twitch] Switch access token to GraphQL and refactor. --- youtube_dl/extractor/twitch.py | 114 ++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ab6654432..50dcb93ef 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -160,7 +160,64 @@ class TwitchBaseIE(InfoExtractor): return compat_str(self._parse_json(token, channel_name)['channel_id']) -class TwitchVodIE(TwitchBaseIE): +class TwitchGraphQLBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + } + + def _download_base_gql(self, video_id, ops, note, fatal=True): + return self._download_json( + 'https://gql.twitch.tv/gql', video_id, note, + data=json.dumps(ops).encode(), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + }, fatal=fatal) + + def _download_gql(self, video_id, ops, note, fatal=True): + for op in ops: + op['extensions'] = { + 'persistedQuery': { + 'version': 1, + 'sha256Hash': self._OPERATION_HASHES[op['operationName']], + } + } + return self._download_base_gql(video_id, ops, note) + + def _download_access_token_gql(self, video_id, item_type=None): + if item_type == 'vod': + method = 'videoPlaybackAccessToken' + param_name = 'id' + else: + method = 'streamPlaybackAccessToken' + param_name = 'channelName' + ops = { + 'query': '''{ + %s( + %s: "%s", + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + }) { + value + signature + } + }''' % (method, param_name, video_id), + } + note = 'Downloading access token GraphQL' + return self._download_base_gql(video_id, ops, note)['data'][method] + + +class TwitchVodIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:vod' _VALID_URL = r'''(?x) https?:// @@ -276,9 +333,7 @@ class TwitchVodIE(TwitchBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._call_api( - 'api/vods/%s/access_token' % vod_id, vod_id, - 'Downloading %s access token' % self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -289,8 +344,8 @@ class TwitchVodIE(TwitchBaseIE): 'allow_spectre': 'true', 'player': 'twitchweb', 'playlist_include_framerate': 'true', - 'nauth': access_token['token'], - 'nauthsig': access_token['sig'], + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], })), vod_id, 'mp4', entry_protocol='m3u8_native') @@ -333,36 +388,6 @@ def _make_video_result(node): } -class TwitchGraphQLBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - } - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) - - class TwitchCollectionIE(TwitchGraphQLBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P[^/]+)' @@ -814,8 +839,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token(channel_name) - token = access_token['token'] + access_token = self._download_access_token_gql(channel_name) + token = access_token['value'] stream_id = stream.get('id') or channel_name query = { @@ -826,7 +851,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): 'player': 'twitchweb', 'playlist_include_framerate': 'true', 'segment_preference': '4', - 'sig': access_token['sig'].encode('utf-8'), + 'sig': access_token['signature'].encode('utf-8'), 'token': token.encode('utf-8'), } formats = self._extract_m3u8_formats( @@ -866,7 +891,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): } -class TwitchClipsIE(TwitchBaseIE): +class TwitchClipsIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'''(?x) https?:// @@ -912,8 +937,8 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( - 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + clip = self._download_base_gql( + video_id, { 'query': '''{ clip(slug: "%s") { broadcaster { @@ -937,10 +962,7 @@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id, - }).encode(), headers={ - 'Client-ID': self._CLIENT_ID, - })['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] if not clip: raise ExtractorError( From ccc71122915e630d99e8266c73a2eba26707f199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 6 Jan 2021 02:10:44 +0700 Subject: [PATCH 087/860] [twitch] Improve access token extraction and remove unused code (closes #27646) --- youtube_dl/extractor/twitch.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 50dcb93ef..a939ea24e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -192,29 +192,27 @@ class TwitchGraphQLBaseIE(TwitchBaseIE): } return self._download_base_gql(video_id, ops, note) - def _download_access_token_gql(self, video_id, item_type=None): - if item_type == 'vod': - method = 'videoPlaybackAccessToken' - param_name = 'id' - else: - method = 'streamPlaybackAccessToken' - param_name = 'channelName' + def _download_access_token_gql(self, video_id, token_kind, param_name): + method = '%sPlaybackAccessToken' % token_kind ops = { 'query': '''{ %s( %s: "%s", - params: { - platform: "web", - playerBackend: "mediaplayer", - playerType: "site" - }) { + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + } + ) + { value signature } }''' % (method, param_name, video_id), } - note = 'Downloading access token GraphQL' - return self._download_base_gql(video_id, ops, note)['data'][method] + return self._download_base_gql( + video_id, ops, + 'Downloading %s access token GraphQL' % token_kind)['data'][method] class TwitchVodIE(TwitchGraphQLBaseIE): @@ -227,8 +225,6 @@ class TwitchVodIE(TwitchGraphQLBaseIE): ) (?P\d+) ''' - _ITEM_TYPE = 'vod' - _ITEM_SHORTCUT = 'v' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -333,7 +329,7 @@ class TwitchVodIE(TwitchGraphQLBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -839,7 +835,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token_gql(channel_name) + access_token = self._download_access_token_gql( + channel_name, 'stream', 'channelName') token = access_token['value'] stream_id = stream.get('id') or channel_name From 1ae7ae0b969d378ea41e6b90b9c5d44358d3e36b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jan 2021 21:17:08 +0100 Subject: [PATCH 088/860] [canvas] Fix VRT NU extraction(closes #26957)(closes #27053) --- youtube_dl/extractor/canvas.py | 120 ++++++++++++--------------------- 1 file changed, 43 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8667a0d04..65d65d52e 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,12 +7,12 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + extract_attributes, ExtractorError, strip_or_none, float_or_none, int_or_none, merge_dicts, - parse_iso8601, str_or_none, url_or_none, ) @@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +48,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -208,17 +214,21 @@ class VrtNUIE(GigyaBaseIE): _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +310,25 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) + webpage = self._download_webpage(url, display_id) + + attrs = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id + + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)

(.+?)

', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)
(.+?)
', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)\s* - seizoen\ (.+?)\s* - ''', - r'
[^>]*)(?=$|)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r']+>[^<]+', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _extract_chapters(self, webpage, description, video_id, duration): - return (self._extract_chapters_from_json(webpage, video_id, duration) - or self._extract_chapters_from_description(description, duration)) + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + webpage = self._download_webpage(webpage_url, video_id, fatal=False) - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') + player_response = None + if webpage: + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, + video_id, 'initial player response') + if not player_response: + player_response = self._call_api( + 'player', {'videoId': video_id}, video_id) - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) + playability_status = player_response.get('playabilityStatus') or {} + if playability_status.get('reason') == 'Sign in to confirm your age': + pr = self._parse_json(try_get(compat_parse_qs( + self._download_webpage( + base_url + 'get_video_info', video_id, + 'Refetching age-gated info webpage', + 'unable to download video info webpage', query={ + 'video_id': video_id, + }, fatal=False)), + lambda x: x['player_response'][0], + compat_str) or '{}', video_id) + if pr: + player_response = pr - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) + trailer_video_id = try_get( + playability_status, + lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], + compat_str) + if trailer_video_id: + return self.url_result( + trailer_video_id, self.ie_key(), trailer_video_id) - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage, urlh = self._download_webpage_handle(url, video_id) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) - video_id = qs.get('v', [None])[0] or video_id - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: + def get_text(x): + if not x: return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response + return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) - player_response = {} - - # Get video info - video_info = {} - embed_webpage = None - ytplayer_config = None - - if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - - if not video_info and not player_response: - player_response = extract_player_response( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage, - 'initial player response', default='{}'), - video_id) - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info and not player_response: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - if not isinstance(video_info, dict): - video_info = {} - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} + search_meta = ( + lambda x: self._html_search_meta(x, webpage, default=None)) \ + if webpage else lambda x: None + video_details = player_response.get('videoDetails') or {} microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - ]*> - [^<]+\.{3}\s* - - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = video_details.get('shortDescription') - if video_description is None: - video_description = self._html_search_meta('description', video_webpage) + player_response, + lambda x: x['microformat']['playerMicroformatRenderer'], + dict) or {} + video_title = video_details.get('title') \ + or get_text(microformat.get('title')) \ + or search_meta(['og:title', 'twitter:title', 'title']) + video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] @@ -1821,10 +1298,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) + return try_get( + feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: @@ -1837,7 +1316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) @@ -1845,631 +1324,393 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) + return self.playlist_result( + entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) + formats = [] + itags = [] + player_url = None + q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + streaming_data = player_response.get('streamingData') or {} + streaming_formats = streaming_data.get('formats') or [] + streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') - if not cipher: + if not player_url: + if not webpage: continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage, 'player URL', fatal=False) + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + itag = str_or_none(fmt.get('itag')) + if itag: + itags.append(itag) + quality = fmt.get('quality') + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': fmt.get('qualityLabel') or quality, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + 'tbr': float_or_none(fmt.get( + 'averageBitrate') or fmt.get('bitrate'), 1000), + 'url': fmt_url, + 'width': fmt.get('width'), + } + mimetype = fmt.get('mimeType') + if mimetype: + mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) + if mobj: + dct['ext'] = mimetype2ext(mobj.group(1)) + dct.update(parse_codecs(mobj.group(2))) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + + hls_manifest_url = streaming_data.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats( + hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag: + f['format_id'] = itag + formats.append(f) + + if self._downloader.params.get('youtube_include_dash_manifest'): + dash_manifest_url = streaming_data.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats( + dash_manifest_url, video_id, fatal=False): + if f['format_id'] in itags: continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + formats.append(f) - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue + if not formats: + if streaming_data.get('licenseInfos'): + raise ExtractorError( + 'This video is DRM protected.', expected=True) + pemr = try_get( + playability_status, + lambda x: x['errorScreen']['playerErrorMessageRenderer'], + dict) or {} + reason = get_text(pemr.get('reason')) or playability_status.get('reason') + subreason = pemr.get('subreason') + if subreason: + subreason = clean_html(get_text(subreason)) + if subreason == 'The uploader has not made this video available in your country.': + countries = microformat.get('availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + subreason, countries) + reason += '\n' + subreason + if reason: + raise ExtractorError(reason, expected=True) - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) + self._sort_formats(formats) - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = ( - r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', - r'"jsUrl"\s*:\s*("[^"]+")', - r'"assets":.+?"js":\s*("[^"]+")') - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_desc = 'unknown' - else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - reason_list = try_get( - player_response, - lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], - list) or [] - for reason in reason_list: - if not isinstance(reason, dict): - continue - reason_text = try_get(reason, lambda x: x['text'], compat_str) - if reason_text: - if not error_message: - error_message = '' - error_message += reason_text - if error_message: - error_message = clean_html(error_message) - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + keywords = video_details.get('keywords') or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + w, h = keyword.split('=')[1].split(':') + w, h = int(w), int(h) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: - video_thumbnail = None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if thumbnail_url: - video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) - if video_thumbnail: - thumbnails.append({'url': video_thumbnail}) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = microformat.get('publishDate') or microformat.get('uploadDate') - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) + for container in (video_details, microformat): + for thumbnail in (try_get( + container, + lambda x: x['thumbnail']['thumbnails'], list) or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'height': int_or_none(thumbnail.get('height')), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + }) + if thumbnails: + break else: - video_alt_title = video_creator = None + thumbnail = search_meta(['og:image', 'twitter:image']) + if thumbnail: + thumbnails = [{'url': thumbnail}] - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) + category = microformat.get('category') or search_meta('genre') + channel_id = video_details.get('channelId') \ + or microformat.get('externalChannelId') \ + or search_meta('channelId') + duration = int_or_none( + video_details.get('lengthSeconds') + or microformat.get('lengthSeconds')) \ + or parse_duration(search_meta('duration')) + is_live = video_details.get('isLive') + owner_profile_url = microformat.get('ownerProfileUrl') - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') + info = { + 'id': video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_description, + 'upload_date': unified_strdate( + microformat.get('uploadDate') + or search_meta('uploadDate')), + 'uploader': video_details['author'], + 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, + 'duration': duration, + 'view_count': int_or_none( + video_details.get('viewCount') + or microformat.get('viewCount') + or search_meta('interactionCount')), + 'average_rating': float_or_none(video_details.get('averageRating')), + 'age_limit': 18 if ( + microformat.get('isFamilySafe') is False + or search_meta('isFamilyFriendly') == 'false' + or search_meta('og:restrictions:age') == '18+') else 0, + 'webpage_url': webpage_url, + 'categories': [category] if category else None, + 'tags': keywords, + 'is_live': is_live, + } + + pctr = try_get( + player_response, + lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) + if pctr: + def process_language(container, base_url, caption, query): + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + query.update({ + 'fmt': fmt, + }) + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, query), + }) + subtitles[caption['languageCode']] = lang_subs + + subtitles = {} + for caption_track in pctr['captionTracks']: + base_url = caption_track['baseUrl'] + if caption_track.get('kind') != 'asr': + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + }), + }) + subtitles[caption_track['languageCode']] = lang_subs + continue + automatic_captions = {} + for translation_language in pctr['translationLanguages']: + translation_language_code = translation_language['languageCode'] + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + 'tlang': translation_language_code, + }), + }) + automatic_captions[translation_language_code] = lang_subs + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + for k, v in query.items(): + for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: + d_k += '_time' + if d_k not in info and k in s_ks: + info[d_k] = parse_duration(query[k][0]) - # Youtube Music Auto-generated description - release_date = release_year = None if video_description: mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) release_year = mobj.group('release_year') release_date = mobj.group('release_date') if release_date: release_date = release_date.replace('-', '') if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) + release_year = release_date[:4] + info.update({ + 'album': mobj.group('album'.strip()), + 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), + 'track': mobj.group('track').strip(), + 'release_date': release_date, + 'release_year': int(release_year), + }) - yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) - contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - for content in contents: - rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] - multiple_songs = False - for row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True - break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = try_get( - mrr, lambda x: x['title']['simpleText'], compat_str) - mrr_contents = try_get( - mrr, lambda x: x['contents'][0], dict) or {} - mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) - if not (mrr_title and mrr_contents_text): + initial_data = None + if webpage: + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, + 'yt initial data') + if not initial_data: + initial_data = self._call_api( + 'next', {'videoId': video_id}, video_id, fatal=False) + + if initial_data: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - if mrr_title == 'License': - video_license = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - album = mrr_contents_text - elif mrr_title == 'Artist': - artist = mrr_contents_text - elif mrr_title == 'Song': - track = mrr_contents_text - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - category = None - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - if not category: - category = try_get( - microformat, lambda x: x['category'], compat_str) - video_categories = None if category is None else [category] + chapters = [] + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + info['chapters'] = chapters - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - if not video_tags: - video_tags = try_get(video_details, lambda x: x['keywords'], list) + contents = try_get( + initial_data, + lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], + list) or [] + for content in contents: + vpir = content.get('videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: + info.update({ + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), + }) + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = content.get('videoSecondaryInfoRenderer') + if vsir: + info['channel'] = get_text(try_get( + vsir, + lambda x: x['owner']['videoOwnerRenderer']['title'], + compat_str)) + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = get_text(mrr['title']) + mrr_contents_text = get_text(mrr['contents'][0]) + if mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text - def _extract_count(count_name): - return str_to_int(self._search_regex( - (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), - r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), - video_webpage, count_name, default=None)) + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + v = info.get(s_k) + if v: + info[d_k] = v - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') + self.mark_watched(video_id, player_response) - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = None - ytcfg = self._extract_ytcfg(video_id, video_webpage) - if ytcfg: - xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) - if not xsrf_token: - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = None - if ytcfg: - xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) - if not xsrf_field_name: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df.get('fragment_base_url') or df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnails': thumbnails, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } + return info class YoutubeTabIE(YoutubeBaseInfoExtractor): From 9c724601ba234085dc5071ec9c1c3d98e6834817 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:08:50 +0100 Subject: [PATCH 170/860] [youtube] remove description chapters tests video description no longer contain yt.www.watch.player.seekTo function --- test/test_youtube_chapters.py | 275 -------------------------------- youtube_dl/extractor/youtube.py | 91 ++++++++--- 2 files changed, 67 insertions(+), 299 deletions(-) delete mode 100644 test/test_youtube_chapters.py diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', - }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5f6769878..edaca0658 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1223,6 +1223,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id + def _extract_chapters_from_json(self, data, video_id, duration): + chapters_list = try_get( + data, + lambda x: x['playerOverlays'] + ['playerOverlayRenderer'] + ['decoratedPlayerBarRenderer'] + ['decoratedPlayerBarRenderer'] + ['playerBar'] + ['chapteredPlayerBarRenderer'] + ['chapters'], + list) + if not chapters_list: + return + + def chapter_time(chapter): + return float_or_none( + try_get( + chapter, + lambda x: x['chapterRenderer']['timeRangeStartMillis'], + int), + scale=1000) + chapters = [] + for next_num, chapter in enumerate(chapters_list, start=1): + start_time = chapter_time(chapter) + if start_time is None: + continue + end_time = (chapter_time(chapters_list[next_num]) + if next_num < len(chapters_list) else duration) + if end_time is None: + continue + title = try_get( + chapter, lambda x: x['chapterRenderer']['title']['simpleText'], + compat_str) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': title, + }) + return chapters + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): return self._parse_json(self._search_regex( (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), @@ -1597,31 +1637,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'next', {'videoId': video_id}, video_id, fatal=False) if initial_data: - for engagment_pannel in (initial_data.get('engagementPanels') or []): - contents = try_get( - engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], - list) - if not contents: - continue - - def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) - - chapters = [] - for next_num, content in enumerate(contents, start=1): - mmlir = content.get('macroMarkersListItemRenderer') or {} - start_time = chapter_time(mmlir) - end_time = chapter_time(try_get( - contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ - if next_num < len(contents) else duration - if not (start_time and end_time): + chapters = self._extract_chapters_from_json( + initial_data, video_id, duration) + if not chapters: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': get_text(mmlir.get('title')), - }) + + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) + + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + if chapters: info['chapters'] = chapters contents = try_get( From b46483a6ec6a42889fc16d53afd76d147748785f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:35:07 +0100 Subject: [PATCH 171/860] [youtube/test_youtube_signature] fix test --- test/test_youtube_signature.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..b5a4d0d5f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -86,13 +86,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) From 159a3d48dfb2b4ed77dc691433e420506c9340c3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:36:19 +0100 Subject: [PATCH 172/860] [youtube] keep _formats array for format sorting tests --- youtube_dl/extractor/youtube.py | 105 ++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index edaca0658..ed844e2a3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,111 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, ] + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + } def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) From efef4ddf51c375c3a9eb12355a61a21d69aec33f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:49:52 +0100 Subject: [PATCH 173/860] [youtube] fix chapter extraction fallback --- youtube_dl/extractor/youtube.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ed844e2a3..65fa777e4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1753,22 +1753,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) + return parse_duration( + get_text(mmlir.get('timeDescription'))) + chapters = [] for next_num, content in enumerate(contents, start=1): mmlir = content.get('macroMarkersListItemRenderer') or {} start_time = chapter_time(mmlir) end_time = chapter_time(try_get( contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ if next_num < len(contents) else duration - if not (start_time and end_time): + if start_time is None or end_time is None: continue chapters.append({ 'start_time': start_time, 'end_time': end_time, 'title': get_text(mmlir.get('title')), }) + if chapters: + break if chapters: info['chapters'] = chapters From 65eee5a745f705a7904709accdba47efb852cc6a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 18:12:35 +0100 Subject: [PATCH 174/860] [youtube] improve subtitle extraction --- youtube_dl/extractor/youtube.py | 43 ++++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 65fa777e4..75a007353 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1664,7 +1664,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response, lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) if pctr: - def process_language(container, base_url, caption, query): + def process_language(container, base_url, lang_code, query): lang_subs = [] for fmt in self._SUBTITLE_FORMATS: query.update({ @@ -1674,35 +1674,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': fmt, 'url': update_url_query(base_url, query), }) - subtitles[caption['languageCode']] = lang_subs + container[lang_code] = lang_subs subtitles = {} - for caption_track in pctr['captionTracks']: - base_url = caption_track['baseUrl'] + for caption_track in (pctr.get('captionTracks') or []): + base_url = caption_track.get('baseUrl') + if not base_url: + continue if caption_track.get('kind') != 'asr': - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - }), - }) - subtitles[caption_track['languageCode']] = lang_subs + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, {}) continue automatic_captions = {} - for translation_language in pctr['translationLanguages']: - translation_language_code = translation_language['languageCode'] - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - 'tlang': translation_language_code, - }), - }) - automatic_captions[translation_language_code] = lang_subs + for translation_language in (pctr.get('translationLanguages') or []): + translation_language_code = translation_language.get('languageCode') + if not translation_language_code: + continue + process_language( + automatic_captions, base_url, translation_language_code, + {'tlang': translation_language_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles From 8fa7cc387d699899114f7430bcf61837d58557a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 21:35:18 +0100 Subject: [PATCH 175/860] [vidio] improve metadata extraction --- youtube_dl/extractor/vidio.py | 86 ++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r'<span[^>]+data-comment-vote-count=["\'](\d+)', - r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } From c11f7cf9bd6ef239f25e7fb9c54e092ae1490e2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 22:35:28 +0100 Subject: [PATCH 176/860] [vidzi] remove extractor(closes #12629) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/vidzi.py | 68 ------------------------------ 2 files changed, 69 deletions(-) delete mode 100644 youtube_dl/extractor/vidzi.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab8d6a5a5..97b0b4034 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1399,7 +1399,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py deleted file mode 100644 index 42ea4952c..000000000 --- a/youtube_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict From 0e3a9684795c6c53546dace9e917ed11c4ae72a5 Mon Sep 17 00:00:00 2001 From: Viren Rajput <virendra.rajput567@gmail.com> Date: Mon, 1 Feb 2021 04:56:33 +0000 Subject: [PATCH 177/860] [egghead] update API domain(closes #28038) --- youtube_dl/extractor/egghead.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index df11dc206..94dd75b9b 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': ['free', 'javascript'], }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] From b111a64135244b73b86a1720e9a5212e726afcbf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 2 Feb 2021 19:05:37 +0100 Subject: [PATCH 178/860] [egghead] fix typo --- youtube_dl/extractor/egghead.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index 94dd75b9b..aff9b88c0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -16,7 +16,7 @@ class EggheadBaseIE(InfoExtractor): def _call_api(self, path, video_id, resource, fatal=True): return self._download_json( 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource) + video_id, 'Downloading %s JSON' % resource, fatal=fatal) class EggheadCourseIE(EggheadBaseIE): @@ -79,7 +79,7 @@ class EggheadLessonIE(EggheadBaseIE): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['free', 'javascript'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, From 1e2575df8714ce9056e559058a187ec0ffd2d739 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 00:21:46 +0100 Subject: [PATCH 179/860] Credit @adrianheine for #27732 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file From 07f7aad81c47a11483a357e53380fae1ffbadea9 Mon Sep 17 00:00:00 2001 From: Guillem Vela <guillemglez@gmail.com> Date: Thu, 27 Feb 2020 22:18:47 +0100 Subject: [PATCH 180/860] [ccma] improve metadata extraction(closes #27994) - extract age_limit, alt_title, categories, series and episode_number - fix timestamp multiple subtitles extraction --- youtube_dl/extractor/ccma.py | 65 ++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 544647f92..4db51e650 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re from .common import InfoExtractor @@ -8,8 +9,8 @@ from ..utils import ( clean_html, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +25,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +37,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +90,27 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timestamp = datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +125,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +144,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } From ab25f3f43196ca56964ba34ba4674fcb2d08f69a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 17:15:31 +0100 Subject: [PATCH 181/860] [youtube] pass embed URL to get_video_info request --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75a007353..42b0f452c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1397,6 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, + 'eurl': 'https://www.youtube.com/embed/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 1b731ebcaa3ef2a1e52cf6968cf93e08d50fe0d4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 18:13:17 +0100 Subject: [PATCH 182/860] [bravotv] add support for oxygen.com(closes #13357)(closes #22500) --- youtube_dl/extractor/bravotv.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] From 83031d749b11f062b9ba97023c228329e771cbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 00:25:26 +0700 Subject: [PATCH 183/860] [pornhub:user] Add support for URLs unavailable via /videos page and improve paging (closes #27853) --- youtube_dl/extractor/pornhub.py | 56 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..67e3731c8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, url_or_none, ) @@ -405,6 +406,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -463,14 +468,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -488,17 +506,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num): + note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) From e22ff4e35681a600ed61918beab8ed316728ec39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:09:11 +0700 Subject: [PATCH 184/860] [pornhub] Add support for authentication (closes #18797, closes #21416, closes #24294) --- youtube_dl/extractor/pornhub.py | 106 +++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 67e3731c8..83307a233 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -23,6 +23,7 @@ from ..utils import ( remove_quotes, str_to_int, update_url_query, + urlencode_postdata, url_or_none, ) @@ -53,6 +54,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -164,12 +225,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -181,12 +250,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -427,26 +491,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -506,12 +550,14 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') + self._login(host) + page = self._extract_page(url) VIDEOS = '/videos' - def download_page(base_url, num): - note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) @@ -532,7 +578,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num) + webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: From 1f0910bc2742b16be8425841d5ed6a0fd96f82a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:17:45 +0700 Subject: [PATCH 185/860] [svtplay] Fix video id extraction (closes #28058) --- youtube_dl/extractor/svt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..4acc29fce 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -255,8 +255,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From 2adc0c51cdf38e039fba0ede11f65bbd9c71bde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:20:09 +0700 Subject: [PATCH 186/860] [pornhub] Add placeholder netrc machine --- youtube_dl/extractor/pornhub.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83307a233..83773aebb 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -29,6 +29,8 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) From 89c5a7d5aabd138a14c76453d79d5d66ef573bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:36:57 +0700 Subject: [PATCH 187/860] [pornhub] Implement lazy playlist extraction --- youtube_dl/extractor/pornhub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83773aebb..b7631e4e1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -547,13 +547,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): <button[^>]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - + def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' @@ -566,7 +560,6 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - entries = [] base_url = url has_page = page is not None first_page = page if has_page else 1 @@ -590,11 +583,19 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): From 3c07d007ca5376719a0cfe6b9c6627b38cbd3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:47:30 +0700 Subject: [PATCH 188/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7f2e0aad1..bd753d524 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,45 @@ +version <unreleased> + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + version 2021.01.24.1 Core From cfefb7d854f87e02c971170fcfa08f3ff2cb1bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:49:25 +0700 Subject: [PATCH 189/860] release 2021.02.04 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2dde97a2c..86e48bc4e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c520d1ee0..fa369b744 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 4aacd3bdc..806c7c58d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 91bbed506..1d1a36dda 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a0a2c989a..c19052a7a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index bd753d524..d5d9c00a2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04 Extractors * [pornhub] Implement lazy playlist extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13bac6e27..e1b85b1d1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -537,6 +537,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -1058,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1103,6 +1103,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c52f1d9ca..d898525c9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24.1' +__version__ = '2021.02.04' From fc88e8f0e3e66f17f787cbc1ea45c87fdc70781e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Thu, 4 Feb 2021 00:57:56 +0100 Subject: [PATCH 190/860] [azmedien] Fix extraction (#28064) --- youtube_dl/extractor/azmedien.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): From 7215691ab7cabc858b17c16928c372da3e35ec59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:07:43 +0700 Subject: [PATCH 191/860] [youtube] Prefer DASH formats (closes #28070) --- youtube_dl/extractor/youtube.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 42b0f452c..a3b10c094 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1549,16 +1549,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: + dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): - if f['format_id'] in itags: - continue filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - formats.append(f) + dash_formats.append(f) + # Until further investigation prefer DASH formats as non-DASH + # may not be available (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 + if dash_formats: + dash_formats_keys = [f['format_id'] for f in dash_formats] + formats = [f for f in formats if f['format_id'] not in dash_formats_keys] + formats.extend(dash_formats) if not formats: if streaming_data.get('licenseInfos'): From c7d407bca205d8eb248b94b611435187265b79da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:09:28 +0700 Subject: [PATCH 192/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5d9c00a2..4392a4e6f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version <unreleased> + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + version 2021.02.04 Extractors From a4bdc3112bf0e925afc2e512d5f23f9097f6bc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:11:33 +0700 Subject: [PATCH 193/860] release 2021.02.04.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 86e48bc4e..19b750f86 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index fa369b744..8acb80b60 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 806c7c58d..66edcf752 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 1d1a36dda..18203fb34 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c19052a7a..20df40cc5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4392a4e6f..784b73d8d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04.1 Extractors * [youtube] Prefer DASH formats (#28070) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d898525c9..425f15589 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04' +__version__ = '2021.02.04.1' From 1641b132323b544b9ae0dad06707425eba1f926b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 4 Feb 2021 13:05:35 +0100 Subject: [PATCH 194/860] [youtube] skip OTF formats(#28070) --- youtube_dl/extractor/youtube.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3b10c094..eb5f70763 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1477,6 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] + itag_qualities = {} player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} @@ -1486,6 +1487,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue + fmt_url = fmt.get('url') if not fmt_url: sc = compat_parse_qs(fmt.get('signatureCipher')) @@ -1505,10 +1516,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature - itag = str_or_none(fmt.get('itag')) if itag: itags.append(itag) - quality = fmt.get('quality') dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1549,22 +1558,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: - dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: + continue + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - dash_formats.append(f) - # Until further investigation prefer DASH formats as non-DASH - # may not be available (see [1]) - # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 - if dash_formats: - dash_formats_keys = [f['format_id'] for f in dash_formats] - formats = [f for f in formats if f['format_id'] not in dash_formats_keys] - formats.extend(dash_formats) + formats.append(f) if not formats: if streaming_data.get('licenseInfos'): From 0156ce95c5ba83de6c68a149d352ccecd983a294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 01:54:46 +0700 Subject: [PATCH 195/860] [youtube] Extract abr and vbr (closes #28100) --- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5f70763..b5e0f4eaa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -500,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', + 'abr': 129.495, }, 'params': { 'youtube_include_dash_manifest': True, @@ -1518,6 +1519,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if itag: itags.append(itag) + tbr = float_or_none( + fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1526,8 +1529,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fps': int_or_none(fmt.get('fps')), 'height': int_or_none(fmt.get('height')), 'quality': q(quality), - 'tbr': float_or_none(fmt.get( - 'averageBitrate') or fmt.get('bitrate'), 1000), + 'tbr': tbr, 'url': fmt_url, 'width': fmt.get('width'), } @@ -1538,7 +1540,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mobj: dct['ext'] = mimetype2ext(mobj.group(1)) dct.update(parse_codecs(mobj.group(2))) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + no_audio = dct.get('acodec') == 'none' + no_video = dct.get('vcodec') == 'none' + if no_audio: + dct['vbr'] = tbr + if no_video: + dct['abr'] = tbr + if no_audio or no_video: dct['downloader_options'] = { # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, From 0cf09c2b4168cb99800836d8c1ff0d6d8b16fb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 02:17:03 +0700 Subject: [PATCH 196/860] [youtube] Fix release date extraction (closes #28094) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b5e0f4eaa..c87e54e6b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', 'only_matching': True, }, + { + # https://github.com/ytdl-org/youtube-dl/pull/28094 + 'url': 'OtqTfy26tG0', + 'info_dict': { + 'id': 'OtqTfy26tG0', + 'ext': 'mp4', + 'title': 'Burn Out', + 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', + 'upload_date': '20141120', + 'uploader': 'The Cinematic Orchestra - Topic', + 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'artist': 'The Cinematic Orchestra', + 'track': 'Burn Out', + 'album': 'Every Day', + 'release_data': None, + 'release_year': None, + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1743,7 +1765,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), 'track': mobj.group('track').strip(), 'release_date': release_date, - 'release_year': int(release_year), + 'release_year': int_or_none(release_year), }) initial_data = None From 240585470539d31d9c3785a67861491fa3696451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Sat, 6 Feb 2021 20:46:05 +0100 Subject: [PATCH 197/860] [urplay] Fix extraction (closes #28073) (#28074) --- youtube_dl/extractor/urplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 10b817760..5452c7ca1 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -42,8 +42,8 @@ class URPlayIE(InfoExtractor): url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) urplayer_data = self._parse_json(self._html_search_regex( - r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['currentProduct'] + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From 7a9161578e42abe681c9d3352ecc9a18a9b8df6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 19:18:06 +0700 Subject: [PATCH 198/860] [cda] Detect geo restricted videos (refs #28106) --- youtube_dl/extractor/cda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 6429454fb..1b4362144 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -95,6 +95,9 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): + self.raise_geo_restricted() + need_confirm_age = False if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): From 5fc53690cbe6abb11941a3f4846b566a7472753e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Feb 2021 20:34:41 +0700 Subject: [PATCH 199/860] [archiveorg] Fix and improve extraction (closes #21330, closes #23586, closes #25277, closes #26780, closes #27109, closes #27236, closes #28063) --- youtube_dl/extractor/archiveorg.py | 54 +++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info From 99c68db0a8adc634e2e928ea2756a2ceee3ae863 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 8 Feb 2021 09:20:28 +0100 Subject: [PATCH 200/860] [youtube] add support phone/tablet JS player(closes #26424) --- test/test_youtube_signature.py | 31 ++++++++++--------------------- youtube_dl/extractor/youtube.py | 6 +++++- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index b5a4d0d5f..627d4cb92 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -19,55 +19,46 @@ from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 'js', 83, '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) @@ -78,6 +69,10 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -100,13 +95,13 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_input, expected_sig): +def make_tfunc(url, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): - basename = 'player-%s.%s' % (test_id, stype) + basename = 'player-%s.js' % test_id fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): @@ -114,22 +109,16 @@ def make_tfunc(url, stype, sig_input, expected_sig): ydl = FakeYDL() ie = YoutubeIE(ydl) - if stype == 'js': - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - else: - assert stype == 'swf' - with open(fn, 'rb') as testf: - swfcode = testf.read() - func = ie._parse_sig_swf(swfcode) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) - test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + test_func.__name__ = str('test_signature_js_' + test_id) setattr(TestSignature, test_func.__name__, test_func) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c87e54e6b..346311d9b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -398,7 +398,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _PLAYER_INFO_RE = ( - r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$', + r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', + r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') @@ -1237,6 +1238,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns From 311ebdd9a57e72116136a464fbc0fa8cad32db42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 8 Feb 2021 15:46:32 +0700 Subject: [PATCH 201/860] [xhamster] Extract formats from xplayer settings and extract filesizes (closes #28114) --- youtube_dl/extractor/xhamster.py | 80 +++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 76aeaf9a4..f73b9778f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): From 7f8b8bc418b8831ea1c2ae8de64e3bf0e8b707f8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 8 Feb 2021 15:56:42 +0100 Subject: [PATCH 202/860] [ign] fix extraction(closes #24771) --- youtube_dl/extractor/extractors.py | 4 +- youtube_dl/extractor/ign.py | 371 +++++++++++++++-------------- 2 files changed, 200 insertions(+), 175 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 97b0b4034..84998316c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -470,8 +470,8 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, ) from .iheart import ( IHeartRadioIE, diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index a96ea8010..0d9f50ed2 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,230 +3,255 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + HEADRequest, + determine_ext, int_or_none, parse_iso8601, + strip_or_none, + try_get, ) -class IGNIE(InfoExtractor): +class IGNBaseIE(InfoExtractor): + def _call_api(self, slug): + return self._download_json( + 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + + +class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r'<object id="vid_(.+?)"', - r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', - r'videoId"\s*:\s*"(.+?)"', - r'videoId["\']\s*:\s*["\']([^"\']+?)["\']', - ] - return self._search_regex(res_id, webpage, 'video id', default=None) + _TESTS = [{ + 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + 'md5': 'd2e1586d9987d40fad7867bf96a018ea', + 'info_dict': { + 'id': '8f862beef863986b2785559b9e1aa599', + 'ext': 'mp4', + 'title': 'The Last of Us Review', + 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', + 'tags': 'count:9', + } + }, { + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:817a20299de610bd56f13175386da6fa', + 'timestamp': 1420571160, + 'upload_date': '20150106', + 'tags': 'count:4', + } + }, { + 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name_or_id = mobj.group('name_or_id') - page_type = mobj.group('type') - webpage = self._download_webpage(url, name_or_id) - if page_type != 'video': - multiple_urls = re.findall( - r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' - IE_NAME = 'pcmag' - - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' - +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) From a4c7ed6b1e9100be8ef65c44e7e6e43b9314ff5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:28:58 +0700 Subject: [PATCH 203/860] [youtube:tab] Improve grid continuation extraction (closes #28130) --- youtube_dl/extractor/youtube.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 346311d9b..c78996629 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2374,9 +2374,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation - contents = renderer.get('contents') - if not isinstance(contents, list): - return + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: if not isinstance(content, dict): continue @@ -2509,6 +2509,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue + renderer = continuation_item.get('gridVideoRenderer') + if renderer: + grid_renderer = {'items': continuation_items} + for entry in self._grid_entries(grid_renderer): + yield entry + continuation = self._extract_continuation(grid_renderer) + continue renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} From cd493c5adcb526cdfa2a9d5194269b671a0dc343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:32:25 +0700 Subject: [PATCH 204/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index 784b73d8d..5951372b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version <unreleased> + +Extractors +* [youtube:tab] Improve grid continuation extraction (#28130) +* [ign] Fix extraction (#24771) ++ [xhamster] Extract format filesize ++ [xhamster] Extract formats from xplayer settings (#28114) ++ [youtube] Add support phone/tablet JS player (#26424) +* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, + #27109, #27236, #28063) ++ [cda] Detect geo restricted videos (#28106) +* [urplay] Fix extraction (#28073, #28074) +* [youtube] Fix release date extraction (#28094) ++ [youtube] Extract abr and vbr (#28100) +* [youtube] Skip OTF formats (#28070) + + version 2021.02.04.1 Extractors From 360d5f0daac879a1371c6a45e0d3310ced60e352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Feb 2021 22:34:47 +0700 Subject: [PATCH 205/860] release 2021.02.10 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 19b750f86..ea0a59dca 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8acb80b60..d24855c72 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 66edcf752..8b96a2883 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 18203fb34..e46971047 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20df40cc5..a9ca379ca 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.10. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 5951372b3..384bd19c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.10 Extractors * [youtube:tab] Improve grid continuation extraction (#28130) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1b85b1d1..1373cc4f6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,6 +1,5 @@ # Supported sites - **1tv**: Первый канал - - **1up.com** - **20min** - **220.ro** - **23video** @@ -376,6 +375,8 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IGNArticle** + - **IGNVideo** - **IHeartRadio** - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers @@ -676,7 +677,6 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **pcmag** - **PearVideo** - **PeerTube** - **People** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 425f15589..79d2be625 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04.1' +__version__ = '2021.02.10' From f28f1b4d6ed053f1dbfbc7fc992162c4ea4f2ce7 Mon Sep 17 00:00:00 2001 From: Kevin Velghe <kevin@paretje.be> Date: Thu, 11 Feb 2021 09:04:16 +0100 Subject: [PATCH 206/860] [canvas] Add new extractor for Dagelijkse Kost (#28119) --- youtube_dl/extractor/canvas.py | 56 ++++++++++++++++++++++++++++-- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8b76a0200..eefbab241 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,19 +7,21 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE): 'display_id': display_id, 'season_number': int_or_none(page.get('episode_season')), }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 84998316c..e4c475fd8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -163,6 +163,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, From f94d76499362017f673520286bc3848916735275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Feb 2021 05:03:15 +0700 Subject: [PATCH 207/860] [ard] Improve formats extraction (closes #28155) --- youtube_dl/extractor/ard.py | 44 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf5b3f13..143fc51e9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) From 6d32c6c6d3b0588b193eaeb4178592219c3b4df8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 14 Feb 2021 16:22:45 +0100 Subject: [PATCH 208/860] [xboxclips] fix extraction(closes #27151) --- youtube_dl/extractor/xboxclips.py | 45 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - video_url = self._html_search_regex( - r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'<title>XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info From d8085580f63ad3b146a31712ff76cf41d5a4558a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 19:48:26 +0100 Subject: [PATCH 209/860] [kakao] improve info extraction and detect geo restriction(closes #26577) --- youtube_dl/extractor/kakao.py | 64 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), From be2e9b76eea73b073f00871ea831ee3f4a1000b3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 22:10:06 +0100 Subject: [PATCH 210/860] [videopress] add support for video.wordpress.com --- youtube_dl/extractor/videopress.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index e5f964d39..6376ff096 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r']+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) From 4b5410c5c841b826965ea76d62bea82a26a9e1b8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Feb 2021 13:06:54 +0100 Subject: [PATCH 211/860] [ccma] fix timestamp parsing in python 2 --- youtube_dl/extractor/ccma.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 4db51e650..e6ae49352 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, parse_resolution, @@ -97,8 +99,9 @@ class CCMAIE(InfoExtractor): timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) try: - timestamp = datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) except TypeError: pass From 07eb8f19169c58bce0e784607ea350ae16ed5363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Feb 2021 05:57:53 +0700 Subject: [PATCH 212/860] [youtube] Fix controversial videos when authenticated with cookies (closes #28174) --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c78996629..7db4503e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1042,6 +1042,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # controversial video, only works with bpctr when authenticated with cookies + 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', + 'only_matching': True, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1405,7 +1410,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = self._match_id(url) base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id - webpage = self._download_webpage(webpage_url, video_id, fatal=False) + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999', video_id, fatal=False) player_response = None if webpage: From 56c63c8c02d7b9aabbced8d150badb6b520825d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Feb 2021 10:08:43 +0100 Subject: [PATCH 213/860] [zhihu] Add new extractor(closes #28177) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zhihu.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/zhihu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e4c475fd8..4347f1b74 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1603,5 +1603,6 @@ from .zattoo import ( ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE +from .zhihu import ZhihuIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zhihu.py b/youtube_dl/extractor/zhihu.py new file mode 100644 index 000000000..d1ed55be3 --- /dev/null +++ b/youtube_dl/extractor/zhihu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none + + +class ZhihuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.zhihu.com/zvideo/1342930761977176064', + 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464', + 'info_dict': { + 'id': '1342930761977176064', + 'ext': 'mp4', + 'title': '写春联也太难了吧!', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': '桥半舫', + 'timestamp': 1612959715, + 'upload_date': '20210210', + 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365', + 'duration': 146.333, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + zvideo = self._download_json( + 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id) + title = zvideo['title'] + video = zvideo.get('video') or {} + + formats = [] + for format_id, q in (video.get('playlist') or {}).items(): + play_url = q.get('url') or q.get('play_url') + if not play_url: + continue + formats.append({ + 'asr': int_or_none(q.get('sample_rate')), + 'filesize': int_or_none(q.get('size')), + 'format_id': format_id, + 'fps': int_or_none(q.get('fps')), + 'height': int_or_none(q.get('height')), + 'tbr': float_or_none(q.get('bitrate')), + 'url': play_url, + 'width': int_or_none(q.get('width')), + }) + self._sort_formats(formats) + + author = zvideo.get('author') or {} + url_token = author.get('url_token') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'), + 'uploader': author.get('name'), + 'timestamp': int_or_none(zvideo.get('published_at')), + 'uploader_id': author.get('id'), + 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None, + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(zvideo.get('play_count')), + 'like_count': int_or_none(zvideo.get('liked_count')), + 'comment_count': int_or_none(zvideo.get('comment_count')), + } From 844e4cbc547f2a2f76053786522bdd6b53bf9ae1 Mon Sep 17 00:00:00 2001 From: Stephen Stair Date: Sun, 16 Aug 2020 17:07:14 -0700 Subject: [PATCH 214/860] [storyfire] Add new extractor(closes #25628)(closes #26349) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/storyfire.py | 151 +++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 youtube_dl/extractor/storyfire.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4347f1b74..51f6d38e9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1141,6 +1141,11 @@ from .srgssr import ( from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py new file mode 100644 index 000000000..9c698626f --- /dev/null +++ b/youtube_dl/extractor/storyfire.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools + +from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + + uploader_id = video.get('hostID') + + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P[0-9a-f]{24})' + _TEST = { + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', + 'info_dict': { + 'id': '378954662', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', + 'uploader': 'whang!', + 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api( + 'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) + + +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P[^/]+)/video' + _TEST = { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + }, + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 + + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) + + def _real_extract(self, url): + user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) + + +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13, + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0, + }] + + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) + + def _real_extract(self, url): + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) From 646052e416577cc805b7ba169c49158716541570 Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 16 Feb 2021 15:22:51 -0500 Subject: [PATCH 215/860] [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase (#28112) --- youtube_dl/postprocessor/embedthumbnail.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 5a3359588..3990908b6 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): + atomicparsley = next((x + for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + + if atomicparsley is None: raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - cmd = [encodeFilename('AtomicParsley', True), + cmd = [encodeFilename(atomicparsley, True), encodeFilename(filename, True), encodeArgument('--artwork'), encodeFilename(thumbnail_filename, True), From a363fb5d28da7c1b651e6de98b9e799544a4df73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Feb 2021 04:03:54 +0700 Subject: [PATCH 216/860] [yandexmusic:playlist] Request missing tracks in chunks (closes #27355, closes #28184) --- youtube_dl/extractor/yandexmusic.py | 35 +++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7893f363e..84969f8e1 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import hashlib +import itertools +import re from .common import InfoExtractor from ..compat import compat_str @@ -209,17 +210,27 @@ class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): missing_track_ids = [ track_id for track_id in track_ids if track_id not in present_track_ids] - missing_tracks = self._call_api( - 'track-entries', tld, url, item_id, - 'Downloading missing tracks JSON', { - 'entries': ','.join(missing_track_ids), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) + # Request missing tracks in chunks to avoid exceeding max HTTP header size, + # see https://github.com/ytdl-org/youtube-dl/issues/27355 + _TRACKS_PER_CHUNK = 250 + for chunk_num in itertools.count(0): + start = chunk_num * _TRACKS_PER_CHUNK + end = start + _TRACKS_PER_CHUNK + missing_track_ids_req = missing_track_ids[start:end] + assert missing_track_ids_req + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), { + 'entries': ','.join(missing_track_ids_req), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + if end >= len(missing_track_ids): + break return tracks From 8980f53b4227bc213048fce52c634830dd25e4bb Mon Sep 17 00:00:00 2001 From: PrinceOfPuppers Date: Tue, 2 Feb 2021 01:46:39 -0500 Subject: [PATCH 217/860] [youtube] Fix uploader extraction in flat playlist mode (#28045) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7db4503e0..e0b15f859 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -308,7 +308,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'^([\d,]+)', re.sub(r'\s', '', view_count_text), 'view count', default=None)) uploader = try_get( - renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + renderer, + (lambda x: x['ownerText']['runs'][0]['text'], + lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { '_type': 'url_transparent', 'ie_key': YoutubeIE.ie_key(), From 70baa7bfae8890c8274af7f3c7e2a704d300a326 Mon Sep 17 00:00:00 2001 From: PrinceOfPuppers Date: Mon, 15 Feb 2021 14:38:41 -0500 Subject: [PATCH 218/860] [test_youtube_lists] Actualize youtube flat playlist test (closes #28045) --- test/test_youtube_lists.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c4f0abbea..cf2fdf14f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -12,6 +12,7 @@ from test.helper import FakeYDL from youtube_dl.extractor import ( YoutubePlaylistIE, + YoutubeTabIE, YoutubeIE, ) @@ -57,14 +58,22 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) - def test_youtube_flat_playlist_titles(self): + def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() dl.params['extract_flat'] = True - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') + ie = YoutubeTabIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc') self.assertIsPlaylist(result) - for entry in result['entries']: - self.assertTrue(entry.get('title')) + entries = list(result['entries']) + self.assertTrue(len(entries) == 1) + video = entries[0] + self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['ie_key'], 'Youtube') + self.assertEqual(video['id'], 'BaW_jenozKc') + self.assertEqual(video['url'], 'BaW_jenozKc') + self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐') + self.assertEqual(video['duration'], 10) + self.assertEqual(video['uploader'], 'Philipp Hagemeister') if __name__ == '__main__': From e20ec43094c09c41d71cef512c882a9d66163cd2 Mon Sep 17 00:00:00 2001 From: dmsummers Date: Thu, 20 Feb 2020 14:33:05 -0600 Subject: [PATCH 219/860] [simplecast] Add new extractor(closes #24107) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/generic.py | 16 +++ youtube_dl/extractor/simplecast.py | 160 +++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+) create mode 100644 youtube_dl/extractor/simplecast.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 51f6d38e9..60c032c7d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1057,6 +1057,11 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) from .sina import SinaIE from .sixplay import SixPlayIE from .skyit import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 09e680c96..c2b1b3bdf 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -129,6 +129,7 @@ from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE +from .simplecast import SimplecastIE class GenericIE(InfoExtractor): @@ -2238,6 +2239,15 @@ class GenericIE(InfoExtractor): 'duration': 159, }, }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, ] def report_following_redirect(self, new_url): @@ -2792,6 +2802,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') + # Look for Simplecast embeds + simplecast_urls = SimplecastIE._extract_urls(webpage) + if simplecast_urls: + return self.playlist_from_matches( + simplecast_urls, video_id, video_title) + # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: diff --git a/youtube_dl/extractor/simplecast.py b/youtube_dl/extractor/simplecast.py new file mode 100644 index 000000000..2d0b3c06d --- /dev/null +++ b/youtube_dl/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'''(?x)]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) From a7356dffe90ed68958d839da073f1321f87a4feb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Feb 2021 18:33:33 +0100 Subject: [PATCH 220/860] [dplay] Add support for discoveryplus.com (closes #24698) --- youtube_dl/extractor/dplay.py | 123 ++++++++++++++++++++++------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 99 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 47501dbe6..540505719 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -151,56 +152,79 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, }) disco_base = 'https://%s/' % disco_host - token = self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] headers = { 'Referer': url, - 'Authorization': 'Bearer ' + token, } - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise video_id = video['data']['id'] info = video['data']['attributes'] title = info['name'].strip() formats = [] try: - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - display_id, headers=headers)['data']['attributes']['streaming'] + streaming = self._download_video_playback_info( + disco_base, video_id, headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code == 'access.denied.missingpackage': - self.raise_login_required() - raise ExtractorError(info['errors'][0]['detail'], expected=True) + self._process_errors(e, geo_countries) raise - for format_id, format_dict in streaming.items(): + for format_dict in streaming: if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') if not format_url: continue + format_id = format_dict.get('type') ext = determine_ext(format_url) if format_id == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -268,3 +292,46 @@ class DPlayIE(InfoExtractor): host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) + + +class DiscoveryPlusIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + 'wisteriaProperties': { + 'platform': 'desktop', + }, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 60c032c7d..acf8cf73b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -288,7 +288,10 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE From 3997efb65ef16dbd8c4792e79e797cbcab0fbec1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Feb 2021 19:50:04 +0100 Subject: [PATCH 221/860] [dplay] add support for de.hgtv.com (closes #28182) --- youtube_dl/extractor/dplay.py | 37 +++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 540505719..0f0632f26 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -11,11 +11,13 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + strip_or_none, unified_timestamp, ) class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P[^/]+/[^/?#]+)' _VALID_URL = r'''(?x)https?:// (?P (?:www\.)?(?Pd @@ -25,7 +27,7 @@ class DPlayIE(InfoExtractor): ) )| (?Pes|it)\.dplay\.com - )/[^/]+/(?P[^/]+/[^/?#]+)''' + )/[^/]+''' + _PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -272,7 +274,7 @@ class DPlayIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': info.get('description'), + 'description': strip_or_none(info.get('description')), 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, @@ -295,7 +297,7 @@ class DPlayIE(InfoExtractor): class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -335,3 +337,32 @@ class DiscoveryPlusIE(DPlayIE): display_id = self._match_id(url) return self._get_disco_api_info( url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') + + +class HGTVDeIE(DPlayIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', + 'info_dict': { + 'id': '151205', + 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', + 'ext': 'mp4', + 'title': 'Wer braucht schon eine Toilette', + 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', + 'duration': 1177.024, + 'timestamp': 1595705400, + 'upload_date': '20200725', + 'creator': 'HGTV', + 'series': 'Tiny House - klein, aber oho', + 'season_number': 3, + 'episode_number': 3, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index acf8cf73b..62819ddcf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -291,6 +291,7 @@ from .douyutv import ( from .dplay import ( DPlayIE, DiscoveryPlusIE, + HGTVDeIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE From 08c2fbb8443de3949af205d08015e5c6048d2e86 Mon Sep 17 00:00:00 2001 From: bopol Date: Wed, 17 Feb 2021 22:29:32 +0100 Subject: [PATCH 222/860] [youtube] Add support for redirect.invidious.io (#28193) Co-authored-by: Sergey M --- youtube_dl/extractor/youtube.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e0b15f859..f9e554ca9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -335,8 +335,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| - # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances + # invidious-redirect websites + (?:www\.)?redirect\.invidious\.io/| (?:(?:www|dev)\.)?invidio\.us/| + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md (?:(?:www|no)\.)?invidiou\.sh/| (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| @@ -906,6 +908,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, }, + { + 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m + 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA', + 'only_matching': True, + }, { # DRM protected 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', From 9fc5eafb8e384453a49f7cfe73147be491f0b19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Feb 2021 04:59:56 +0700 Subject: [PATCH 223/860] [youtube] Improve _VALID_URL (refs #28193) --- youtube_dl/extractor/youtube.py | 99 ++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f9e554ca9..ff32758df 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -326,54 +326,57 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + ) _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - # invidious-redirect websites - (?:www\.)?redirect\.invidious\.io/| - (?:(?:www|dev)\.)?invidio\.us/| - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi)\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.mastodon\.host/| - (?:www\.)?invidious\.zapashcanon\.fr/| - (?:www\.)?invidious\.kavin\.rocks/| - (?:www\.)?invidious\.tube/| - (?:www\.)?invidiou\.site/| - (?:www\.)?invidious\.site/| - (?:www\.)?invidious\.xyz/| - (?:www\.)?invidious\.nixnet\.xyz/| - (?:www\.)?invidious\.drycat\.fr/| - (?:www\.)?tube\.poal\.co/| - (?:www\.)?tube\.connect\.cafe/| - (?:www\.)?vid\.wxzm\.sx/| - (?:www\.)?vid\.mint\.lgbt/| - (?:www\.)?yewtu\.be/| - (?:www\.)?yt\.elukerio\.org/| - (?:www\.)?yt\.lelux\.fi/| - (?:www\.)?invidious\.ggc-project\.de/| - (?:www\.)?yt\.maisputain\.ovh/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.toot\.koeln/| - (?:www\.)?invidious\.fdn\.fr/| - (?:www\.)?watch\.nettohikari\.com/| - (?:www\.)?kgg2m7yk5aybusll\.onion/| - (?:www\.)?qklhadlycap4cnod\.onion/| - (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| - (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| - (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| - (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| - (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| - (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| + (?:www\.)?deturl\.com/www\.youtube\.com| + (?:www\.)?pwnyoutube\.com| + (?:www\.)?hooktube\.com| + (?:www\.)?yourepeat\.com| + tube\.majestyc\.net| + %(invidious)s| + youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ @@ -388,6 +391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): youtu\.be| # just youtu.be/xxxx vid\.plus| # or vid.plus/xxxx zwearz\.com/watch| # or zwearz.com/watch/xxxx + %(invidious)s )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) @@ -400,7 +404,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) ) (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + $""" % { + 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, + 'invidious': '|'.join(_INVIDIOUS_SITES), + } _PLAYER_INFO_RE = ( r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', From 40edffae3d9f86ca696dda6c8a4c9c0497cb6d76 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 11:55:14 +0100 Subject: [PATCH 224/860] [ninegag] unscape title(#28201) --- youtube_dl/extractor/ninegag.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 440f865bc..14390823b 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -2,10 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, try_get, + unescapeHTML, url_or_none, ) @@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor): IE_NAME = '9gag' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', @@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, } - } + }, { + # HTML escaped title + 'url': 'https://9gag.com/gag/av5nvyb', + 'only_matching': True, + }] def _real_extract(self, url): post_id = self._match_id(url) @@ -43,7 +48,7 @@ class NineGagIE(InfoExtractor): 'The given url does not contain a video', expected=True) - title = post['title'] + title = unescapeHTML(post['title']) duration = None formats = [] From b92bb0e02a09930cad3c4f6a406eb503c941af61 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 16:00:22 +0100 Subject: [PATCH 225/860] [viki] improve extraction(closes #26522)(closes #28203) - extract uploader_url and episode_number - report login required error - extract 480p formats - fix API v4 calls --- youtube_dl/extractor/viki.py | 69 +++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a311f21ef..2e9cbf148 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -21,6 +21,7 @@ from ..utils import ( parse_iso8601, sanitized_Request, std_headers, + try_get, ) @@ -30,7 +31,7 @@ class VikiBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' - _APP_VERSION = '2.2.5.1428709186' + _APP_VERSION = '6.0.0' _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False @@ -41,7 +42,7 @@ class VikiBaseIE(InfoExtractor): _ERRORS = { 'geo': 'Sorry, this content is not available in your region.', 'upcoming': 'Sorry, this content is not yet available.', - # 'paywall': 'paywall', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } def _prepare_call(self, path, timestamp=None, post_data=None): @@ -62,7 +63,8 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, + headers={'x-viki-app-ver': self._APP_VERSION}) error = resp.get('error') if error: @@ -82,11 +84,13 @@ class VikiBaseIE(InfoExtractor): expected=True) def _check_errors(self, data): - for reason, status in data.get('blocking', {}).items(): + for reason, status in (data.get('blocking') or {}).items(): if status and reason in self._ERRORS: message = self._ERRORS[reason] if reason == 'geo': self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + self.raise_login_required(message) raise ExtractorError('%s said: %s' % ( self.IE_NAME, message), expected=True) @@ -131,13 +135,19 @@ class VikiIE(VikiBaseIE): 'info_dict': { 'id': '1023585v', 'ext': 'mp4', - 'title': 'Heirs Episode 14', - 'uploader': 'SBS', - 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, 'upload_date': '20131121', 'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'Blocked in the US', + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -153,7 +163,8 @@ class VikiIE(VikiBaseIE): 'uploader': 'Arirang TV', 'like_count': int, 'age_limit': 0, - } + }, + 'skip': 'Sorry. There was an error loading this video', }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { @@ -171,7 +182,7 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '94e0e34fd58f169f40c184f232356cfe', + 'md5': '0a53dc252e6e690feccd756861495a8c', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -183,6 +194,10 @@ class VikiIE(VikiBaseIE): 'uploader': 'group8', 'like_count': int, 'age_limit': 13, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', }, 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { @@ -209,7 +224,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': 'adf9e321a0ae5d0aace349efaaff7691', + 'md5': '41faaba0de90483fb4848952af7c7d0d', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -220,6 +235,10 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): @@ -229,36 +248,33 @@ class VikiIE(VikiBaseIE): 'https://www.viki.com/api/videos/' + video_id, video_id, 'Downloading video JSON', headers={ 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '4.0.57', + 'x-viki-app-ver': '3.0.0', }) video = resp['video'] self._check_errors(video) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + episode_number = int_or_none(video.get('number')) if not title: - title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles', {}) + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') title = '%s - %s' % (container_title, title) description = self.dict_selection(video.get('descriptions', {}), 'en') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('created_at')) - uploader = video.get('author') - like_count = int_or_none(video.get('likes', {}).get('count')) - age_limit = parse_age_limit(video.get('rating')) + like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) thumbnails = [] - for thumbnail_id, thumbnail in video.get('images', {}).items(): + for thumbnail_id, thumbnail in (video.get('images') or {}).items(): thumbnails.append({ 'id': thumbnail_id, 'url': thumbnail.get('url'), }) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): subtitles[subtitle_lang] = [{ 'ext': subtitles_format, 'url': self._prepare_call( @@ -269,13 +285,15 @@ class VikiIE(VikiBaseIE): 'id': video_id, 'title': title, 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), 'like_count': like_count, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('rating')), 'thumbnails': thumbnails, 'subtitles': subtitles, + 'episode_number': episode_number, } formats = [] @@ -360,7 +378,7 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', }, 'playlist_mincount': 71, }, { @@ -371,6 +389,7 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + 'skip': 'Page not found', }, { 'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'only_matching': True, From cf2dbec6301177a1fddf72862de05fa912d9869d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 21:13:56 +0100 Subject: [PATCH 226/860] [vimeo] add support for unlisted video source format extraction --- youtube_dl/extractor/vimeo.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 15cd06268..bd2663fe0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -226,10 +226,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'is_live': is_live, } - def _extract_original_format(self, url, video_id): + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, - query={'action': 'load_download_config'}, + url, video_id, fatal=False, query=query, headers={'X-Requested-With': 'XMLHttpRequest'}) if download_data: source_file = download_data.get('source_file') @@ -509,6 +511,11 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header @@ -673,7 +680,8 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id, headers) - vod = config.get('video', {}).get('vod', {}) + video = config.get('video') or {} + vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: @@ -733,7 +741,7 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id) + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) if source_format: formats.append(source_format) From 21e872b19ada61337770160a124c4387d6c77e08 Mon Sep 17 00:00:00 2001 From: Isaac-the-Man Date: Sun, 10 Jan 2021 10:37:54 -0500 Subject: [PATCH 227/860] [samplefocus] Add new extractor(closes #27763) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/samplefocus.py | 100 ++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/samplefocus.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 62819ddcf..1a39c25c5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1029,6 +1029,7 @@ from .safari import ( SafariApiIE, SafariCourseIE, ) +from .samplefocus import SampleFocusIE from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE diff --git a/youtube_dl/extractor/samplefocus.py b/youtube_dl/extractor/samplefocus.py new file mode 100644 index 000000000..806c3c354 --- /dev/null +++ b/youtube_dl/extractor/samplefocus.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'

    (.+?)

    ', webpage, 'title') + + mp3_url = self._search_regex( + r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r']+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

    ', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } From 3037ab00c7ddbe4bedaff51420e4ea1e8d0ccccb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Feb 2021 10:24:58 +0100 Subject: [PATCH 228/860] [youtube] fixup m4a_dash formats(closes #28165) --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ff32758df..72d9fbbc6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1603,6 +1603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, } + if dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' formats.append(dct) hls_manifest_url = streaming_data.get('hlsManifestUrl') From f90d825a6be852b6a3fa39b0948cc9b94154963e Mon Sep 17 00:00:00 2001 From: SirCipherz Date: Sun, 21 Feb 2021 16:05:33 +0000 Subject: [PATCH 229/860] [peertube] Add support for canard.tube (#28190) --- youtube_dl/extractor/peertube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index c2ca71c71..32ff51653 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -413,7 +413,8 @@ class PeerTubeIE(InfoExtractor): peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| - peertube\.cpy\.re + peertube\.cpy\.re| + canard\.tube )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' From 696183e1333aa8f2f1241e149759edf410f94c79 Mon Sep 17 00:00:00 2001 From: piplongrun Date: Sun, 21 Feb 2021 17:19:37 +0100 Subject: [PATCH 230/860] [youporn] Extract duration (#28019) Co-authored-by: Sergey M --- youtube_dl/extractor/youporn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 534270bac..2b5771828 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -25,6 +25,7 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, @@ -153,6 +154,8 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', @@ -194,6 +197,7 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'average_rating': average_rating, From 919d7646004ad8480016b9dec0f6033759244520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Feb 2021 23:21:38 +0700 Subject: [PATCH 231/860] [youporn] Skip test --- youtube_dl/extractor/youporn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 2b5771828..33114363d 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -55,6 +55,7 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, From 36abc16c3cf050a3368367038d40cce27504c28a Mon Sep 17 00:00:00 2001 From: Adrian Heine Date: Sat, 9 Jan 2021 22:06:24 +0100 Subject: [PATCH 232/860] [apa] Fix extraction --- youtube_dl/extractor/apa.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 98ccdaa4a..1dd35dd9c 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, - url_or_none, ) @@ -17,14 +16,10 @@ class APAIE(InfoExtractor): 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -48,7 +43,7 @@ class APAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -59,18 +54,12 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( - r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + sources = self._parse_json("{" + self._search_regex( + r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') + + "}", video_id, transform_source=js_to_json) formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) - if not source_url: - continue + for (format, source_url) in sources.items(): ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -83,7 +72,7 @@ class APAIE(InfoExtractor): self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='url') return { From aa9118a373a6e9cfb9fda24533df86286eccc468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:29:50 +0700 Subject: [PATCH 233/860] [apa] Improve extraction (closes #27750) --- youtube_dl/extractor/apa.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 1dd35dd9c..cbc1c0ecb 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -6,12 +6,13 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, + url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?Phttps?://[^/]+\.apa\.at)/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -41,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -54,30 +57,39 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json("{" + self._search_regex( - r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') - + "}", video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for (format, source_url) in sources.items(): + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) + if not source_url: + continue ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } From 44b2d5f5fc80a291b093c8bf20e2ad7ac58b3536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:40:00 +0700 Subject: [PATCH 234/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/ChangeLog b/ChangeLog index 384bd19c2..69ce51890 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +version + +Core ++ [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase + (#28112) + +Extractors +* [apa] Fix and improve extraction (#27750) ++ [youporn] Extract duration (#28019) ++ [peertube] Add support for canard.tube (#28190) +* [youtube] Fixup m4a_dash formats (#28165) ++ [samplefocus] Add support for samplefocus.com (#27763) ++ [vimeo] Add support for unlisted video source format extraction +* [viki] Improve extraction (#26522, #28203) + * Extract uploader URL and episode number + * Report login required error + + Extract 480p formats + * Fix API v4 calls +* [ninegag] Unescape title (#28201) +* [youtube] Improve URL regular expression (#28193) ++ [youtube] Add support for redirect.invidious.io (#28193) ++ [dplay] Add support for de.hgtv.com (#28182) ++ [dplay] Add support for discoveryplus.com (#24698) ++ [simplecast] Add support for simplecast.com (#24107) +* [youtube] Fix uploader extraction in flat playlist mode (#28045) +* [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184) ++ [storyfire] Add support for storyfire.com (#25628, #26349) ++ [zhihu] Add support for zhihu.com (#28177) +* [youtube] Fix controversial videos when authenticated with cookies (#28174) +* [ccma] Fix timestamp parsing in python 2 ++ [videopress] Add support for video.wordpress.com +* [kakao] Improve info extraction and detect geo restriction (#26577) +* [xboxclips] Fix extraction (#27151) +* [ard] Improve formats extraction (#28155) ++ [canvas] Add support for dagelijksekost.een.be (#28119) + + version 2021.02.10 Extractors From 0a04e03a0245d78593844e7b7930920051b9cc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:42:16 +0700 Subject: [PATCH 235/860] release 2021.02.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 11 +++++++++++ youtube_dl/version.py | 2 +- 8 files changed, 25 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index ea0a59dca..60879f0ac 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 2021.02.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d24855c72..b38d39ab4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 8b96a2883..3235de44b 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index e46971047..a3255623a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 2021.02.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a9ca379ca..124b020c3 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 69ce51890..2912d776c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.02.22 Core + [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1373cc4f6..2452c1f7f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -212,6 +212,7 @@ - **curiositystream** - **curiositystream:collection** - **CWTV** + - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion** - **dailymotion:playlist** @@ -233,6 +234,7 @@ - **DiscoveryGo** - **DiscoveryGoPlaylist** - **DiscoveryNetworksDe** + - **DiscoveryPlus** - **DiscoveryVR** - **Disney** - **dlive:stream** @@ -353,6 +355,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HGTVDe** - **HiDive** - **HistoricFilms** - **history:player** @@ -803,6 +806,7 @@ - **safari:course**: safaribooksonline.com online courses - **SAKTV** - **SaltTV** + - **SampleFocus** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -825,6 +829,9 @@ - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** + - **simplecast** + - **simplecast:episode** + - **simplecast:podcast** - **Sina** - **sky.it** - **sky:news** @@ -877,6 +884,9 @@ - **Steam** - **Stitcher** - **StitcherShow** + - **StoryFire** + - **StoryFireSeries** + - **StoryFireUser** - **Streamable** - **streamcloud.eu** - **StreamCZ** @@ -1198,5 +1208,6 @@ - **ZattooLive** - **ZDF** - **ZDFChannel** + - **Zhihu** - **zingmp3**: mp3.zing.vn - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 79d2be625..f89530293 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.10' +__version__ = '2021.02.22' From 2090dbdc8c51d18760957e248f5ff152209f9236 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Feb 2021 23:08:40 +0100 Subject: [PATCH 236/860] [youtube] fix get_video_info request --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 72d9fbbc6..2496d27f1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1450,7 +1450,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, - 'eurl': 'https://www.youtube.com/embed/' + video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 7422a2194fcbc179083c6927a2fcca278fed39c5 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 15 Oct 2020 14:24:17 +0200 Subject: [PATCH 237/860] [gedidigital] Add new extractor(closes #7347)(closes #26946) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gedidigital.py | 161 ++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 youtube_dl/extractor/gedidigital.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a39c25c5..dc6a06771 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -421,6 +421,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py new file mode 100644 index 000000000..1b47a4e27 --- /dev/null +++ b/youtube_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. + (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}/(?P\d+)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P[^']+)',\s*'(?P[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } From d81421af4b4c3f8f6e197ad4a06fcdb948484c24 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Feb 2021 23:02:15 +0100 Subject: [PATCH 238/860] [gedidigital] improve asset id matching --- youtube_dl/extractor/gedidigital.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 1b47a4e27..6c4153b40 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -32,7 +32,7 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,3}/(?P\d+)''' + )\.it(?:/[^/]+){2,3}?/(?P\d+)(?:[/?&#]|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', From 8cb4b71909e720a758a17dd519d198e77884a14a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 23 Feb 2021 18:37:06 +0700 Subject: [PATCH 239/860] [tmz] Fix and improve extraction (closes #24603, closes #24687, closes #28211) --- youtube_dl/extractor/tmz.py | 101 ++++++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py index 419f9d92e..3d1bf75ff 100644 --- a/youtube_dl/extractor/tmz.py +++ b/youtube_dl/extractor/tmz.py @@ -2,55 +2,110 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from .kaltura import KalturaIE +from ..utils import ( + int_or_none, + unified_timestamp, +) class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '4d22a51ef205b6c06395d8394f72d560', - 'info_dict': { - 'id': '0_okj015ty', - 'ext': 'mp4', - 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', - 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'timestamp': 1394747163, - 'uploader_id': 'batchUser', - 'upload_date': '20140313', - } - }, { 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'md5': '31f9223e20eef55954973359afa61a20', + 'info_dict': { + 'id': 'P6YjLBLk', + 'ext': 'mp4', + 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + 'description': 'md5:b714359fc18607715ebccbd2da8ff488', + 'timestamp': 1467831837, + 'upload_date': '20160706', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, { + 'url': 'http://www.tmz.com/videos/0_okj015ty/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).replace('-', '_') - return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + tmz_video_id = self._search_regex( + r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', + webpage, 'video id', default=None) + video = self._download_json( + 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, + fatal=False) + if video: + message = video['message'] + info = { + '_type': 'url_transparent', + 'title': message.get('title'), + 'description': message.get('description'), + 'timestamp': unified_timestamp(message.get('published_at')), + 'duration': int_or_none(message.get('duration')), + } + jwplatform_id = message.get('jwplayer_media_id') + if jwplatform_id: + info.update({ + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': JWPlatformIE.ie_key(), + }) + else: + kaltura_entry_id = message.get('kaltura_entry_id') or video_id + kaltura_partner_id = message.get('kaltura_partner_id') or '591531' + info.update({ + 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), + 'ie_key': KalturaIE.ie_key(), + }) + return info + + return self.url_result( + 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/]+)/?' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { - 'id': '0_6snoelag', - 'ext': 'mov', + 'id': 'PAKZa97W', + 'ext': 'mp4', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - 'timestamp': 1429467813, + 'timestamp': 1429466400, 'upload_date': '20150419', - 'uploader_id': 'batchUser', - } + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + tmz_url = self._search_regex( + r'clickLink\s*\(\s*["\'](?P%s)' % TMZIE._VALID_URL, webpage, + 'video id', default=None, group='url') + if tmz_url: + return self.url_result(tmz_url, ie=TMZIE.ie_key()) + embedded_video_info = self._parse_json(self._html_search_regex( r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], + ie=TMZIE.ie_key()) From 295860ff00c5d8caf94badd4f04671f6a631fcae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 23 Feb 2021 12:39:46 +0100 Subject: [PATCH 240/860] [tf1] improve extraction(closes #27980)(closes #28040) --- youtube_dl/extractor/tf1.py | 127 +++++++++++++++++------------------- youtube_dl/extractor/wat.py | 95 ++++++++++++--------------- 2 files changed, 101 insertions(+), 121 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 55e2a0721..23c2808a1 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,92 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) class TF1IE(InfoExtractor): - """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P[^/]+)/videos/(?P[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', - 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', - 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'skip': 'HTTP Error 410: Gone', - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }, { 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { 'id': '13641379', 'ext': 'mp4', 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - webpage = self._download_webpage(url, video_id) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - wat_id = None + decoration = video.get('decoration') or {} - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|)', webpage, - 'data', default='{}'), video_id, fatal=False) + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - if data: - try: - wat_id = next( - video.get('streamId') - for key, video in data.items() - if isinstance(video, dict) - and video.get('slug') == video_id) - if not isinstance(wat_id, compat_str) or not wat_id.isdigit(): - wat_id = None - except StopIteration: - pass - - if not wat_id: - wat_id = self._html_search_regex( - (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P\d+)\2'), - webpage, 'wat id', group='id') - - return self.url_result('wat:%s' % wat_id, 'Wat') + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f6940b371..147931d73 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - unified_strdate, - HEADRequest, + ExtractorError, int_or_none, + try_get, + unified_strdate, ) @@ -29,6 +30,7 @@ class WatIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', @@ -40,8 +42,10 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', }, ] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -49,71 +53,52 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc) - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] - - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info - - title = first_chapter['title'] - - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None + title = video_info['title'] formats = [] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - formats.extend(self._extract_mpd_formats( - mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } From 1631fca1ee1c3312027c702854d741bbb8025dcd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 23 Feb 2021 13:50:18 +0100 Subject: [PATCH 241/860] [wat] detect DRM protected videos(closes #27958) --- youtube_dl/extractor/wat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 147931d73..f1bccc2d6 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -64,7 +64,7 @@ class WatIE(InfoExtractor): if error_desc: if video_info.get('error_code') == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) - raise ExtractorError(error_desc) + raise ExtractorError(error_desc, expected=True) title = video_info['title'] @@ -86,6 +86,8 @@ class WatIE(InfoExtractor): delivery = video_data.get('delivery') or {} extract_formats({delivery.get('format'): delivery.get('url')}) if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) if manifest_urls: From 44603290e5002153f3ebad6230cc73aef42cc2cd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Feb 2021 18:34:28 +0100 Subject: [PATCH 242/860] [dplay] Extract Ad-Free uplynk URLs(#28160) --- youtube_dl/extractor/dplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 0f0632f26..bbb199094 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -330,6 +330,7 @@ class DiscoveryPlusIE(DPlayIE): 'videoId': video_id, 'wisteriaProperties': { 'platform': 'desktop', + 'product': 'dplus_us', }, }).encode('utf-8'))['data']['attributes']['streaming'] From 9662e4964b8d1b8d23c79f90d91b9be87d10029f Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 24 Feb 2021 22:17:29 +0100 Subject: [PATCH 243/860] [vvvvid] extract series sublists playlist_title (#27601) (#27618) --- youtube_dl/extractor/vvvvid.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 778ce8b76..d62404cf3 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -247,9 +247,13 @@ class VVVVIDShowIE(VVVVIDIE): show_info = self._download_info( show_id, 'info/', show_title, fatal=False) + if not show_title: + base_url += "/title" + entries = [] for season in (seasons or []): episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') for episode in episodes: if episode.get('playable') is False: continue @@ -259,12 +263,13 @@ class VVVVIDShowIE(VVVVIDIE): continue info = self._extract_common_video_info(episode) info.update({ - '_type': 'url', + '_type': 'url_transparent', 'ie_key': VVVVIDIE.ie_key(), 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), 'season_id': season_id, + 'playlist_title': playlist_title, }) entries.append(info) From ef28e33249f650b3f8d40c3e62b9df2c6103b360 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Feb 2021 22:29:35 +0100 Subject: [PATCH 244/860] [vvvvid] reduce season request payload size --- youtube_dl/extractor/vvvvid.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index d62404cf3..7c94c4ee2 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -75,12 +75,15 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _download_info(self, show_id, path, video_id, fatal=True): + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }, fatal=fatal) + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) if not (response or fatal): return if response.get('result') == 'error': @@ -98,7 +101,8 @@ class VVVVIDIE(InfoExtractor): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, 'season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( From 3c58f9e0b9d8471212406e012727374db084932b Mon Sep 17 00:00:00 2001 From: Alexander Seiler Date: Sat, 11 Nov 2017 19:30:10 +0100 Subject: [PATCH 245/860] [srgssr] improve extraction - extract subtitle - fix extraction for new videos - update srf download domains closes #14717 closes #14725 closes #27231 closes #28238 --- youtube_dl/extractor/rts.py | 15 ++- youtube_dl/extractor/srgssr.py | 206 +++++++++++++++++++++------------ 2 files changed, 143 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, + int_or_none, parse_iso8601, qualities, + try_get, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P + video|audio + ): + (?P + [0-9a-f\-]{36}|\d+ + ) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor): 'LEGAL': 'The video cannot be transmitted for legal reasons.', 'STARTDATE': 'This video is not yet available. Please try again later.', } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } def _get_tokenized_src(self, url, video_id, format_id): - sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + 'http://tp.srgssr.ch/akahd/token?acl=*', video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = token.get('token', {}).get('authparams') + auth_params = try_get(token, lambda x: x['token']['authparams']) if auth_params: - url += '?' + auth_params + url += ('?' if '?' not in url else '&') + auth_params return url - def get_media_data(self, bu, media_type, media_id): - media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] - media_data = self.get_media_data(bu, media_type, media_id) - - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') - timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - - preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': asset_url, - 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, - }) + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + formats.extend(self._extract_akamai_formats( + format_url, media_id)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio medias the podcast url is usually + # always included, even if is only an audio segment and not the + # whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) self._sort_formats(formats) + subtitles = {} + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + return { 'id': media_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, 'formats': formats, } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '6db2226ba97f62ad42ce09783680046c', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From 678d46f6bbcc8426723d48c49eb25cf202753245 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Feb 2021 10:42:41 +0100 Subject: [PATCH 246/860] [bandaichannel] Add new extractor(closes #21404) --- youtube_dl/extractor/bandaichannel.py | 37 +++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/bandaichannel.py diff --git a/youtube_dl/extractor/bandaichannel.py b/youtube_dl/extractor/bandaichannel.py new file mode 100644 index 000000000..d67285913 --- /dev/null +++ b/youtube_dl/extractor/bandaichannel.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from ..utils import extract_attributes + + +class BandaiChannelIE(BrightcoveNewIE): + IE_NAME = 'bandaichannel' + _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P\d+/\d+)' + _TESTS = [{ + 'url': 'https://www.b-ch.com/titles/514/001', + 'md5': 'a0f2d787baa5729bed71108257f613a4', + 'info_dict': { + 'id': '6128044564001', + 'ext': 'mp4', + 'title': 'メタルファイターMIKU 第1話', + 'timestamp': 1580354056, + 'uploader_id': '5797077852001', + 'upload_date': '20200130', + 'duration': 1387.733, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + attrs = extract_attributes(self._search_regex( + r'(]+\bid="bcplayer"[^>]*>)', webpage, 'player')) + bc = self._download_json( + 'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'], + video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc'] + return self._parse_brightcove_metadata(bc, bc['id']) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dc6a06771..07a8af055 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -90,6 +90,7 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE +from .bandaichannel import BandaiChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, From 38fe5e239ad602b32c111f40ad7c51b3e029be3c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Feb 2021 12:31:18 +0100 Subject: [PATCH 247/860] [urplay] fix episode data extraction(closes #28292) --- youtube_dl/extractor/urplay.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 5452c7ca1..d6c79147e 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -21,6 +21,11 @@ class URPlayIE(InfoExtractor): 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', 'timestamp': 1513292400, 'upload_date': '20171214', + 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', + 'duration': 2269, + 'categories': ['Kultur & historia'], + 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], + 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -31,6 +36,10 @@ class URPlayIE(InfoExtractor): 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', 'timestamp': 1440086400, 'upload_date': '20150820', + 'series': 'Tripp, Trapp, Träd', + 'duration': 865, + 'tags': ['Sova'], + 'episode': 'Sovkudde', }, }, { 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', @@ -41,9 +50,11 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._html_search_regex( + vid = int(video_id) + accessible_episodes = self._parse_json(self._html_search_regex( r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From bee618268014480bb3dd7887986b456c8e9c0236 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 1 Mar 2021 14:00:03 +0100 Subject: [PATCH 248/860] [stretchinternet] Fix extraction(closes #28297) --- youtube_dl/extractor/stretchinternet.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 4dbead2ba..ec08eae55 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none class StretchInternetIE(InfoExtractor): @@ -11,22 +10,28 @@ class StretchInternetIE(InfoExtractor): 'info_dict': { 'id': '573272', 'ext': 'mp4', - 'title': 'University of Mary Wrestling vs. Upper Iowa', - 'timestamp': 1575668361, - 'upload_date': '20191206', + 'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA', + # 'timestamp': 1575668361, + # 'upload_date': '20191206', + 'uploader_id': '99997', } } def _real_extract(self, url): video_id = self._match_id(url) + media_url = self._download_json( + 'https://core.stretchlive.com/trinity/event/tcg/' + video_id, + video_id)[0]['media'][0]['url'] event = self._download_json( - 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, - video_id)[0] + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={'eventID': video_id, 'token': 'asdf'})['event'] return { 'id': video_id, 'title': event['title'], - 'timestamp': int_or_none(event.get('dateCreated'), 1000), - 'url': 'https://' + event['media'][0]['url'], + # TODO: parse US timezone abbreviations + # 'timestamp': event.get('dateTimeString'), + 'url': 'https://' + media_url, + 'uploader_id': event.get('ownerID'), } From 3fb14cd214fdadfae195745b26498e012f78be8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:03:17 +0700 Subject: [PATCH 249/860] [zdf] Rework extractors (closes #11606, closes #13473, closes #17354, closes #21185, closes #26711, closes #27068, closes #27930, closes #28198, closes #28199, closes #28274) * Generalize unique video ids for zdf based extractors * Improve extraction * Fix 3sat and phoenix --- youtube_dl/extractor/dreisat.py | 220 +++++--------------------------- youtube_dl/extractor/phoenix.py | 149 ++++++++++++++++----- youtube_dl/extractor/zdf.py | 192 ++++++++++++++++++---------- 3 files changed, 276 insertions(+), 285 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..5a07c18f4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,43 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) +from .zdf import ZDFIE -class DreiSatIE(InfoExtractor): +class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html + 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'info_dict': { + 'id': '140913_sendung_schweizweit', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'timestamp': 1410623100, + 'upload_date': '20140913' }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, + 'params': { + 'skip_download': True, } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) + }, { + # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html + 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e1..dbbfce983 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,45 +1,128 @@ +# coding: utf-8 from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re + +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + unified_timestamp, + xpath_text, +) -class PhoenixIE(DreiSatIE): +class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' - _TESTS = [ - { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613906100, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'channel': 'corona nachgehakt', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + article_id = self._match_id(url) - internal_id = self._search_regex( - r'
    {.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') - _TESTS = [{ - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] + def _call_api(self, url, video_id, item, api_token=None, referrer=None): + headers = {} + if api_token: + headers['Api-Auth'] = 'Bearer %s' % api_token + if referrer: + headers['Referer'] = referrer + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, headers=headers) @staticmethod def _extract_subtitles(src): @@ -109,20 +79,11 @@ class ZDFIE(ZDFBaseIE): }) formats.append(f) - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') - + def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( - urljoin(url, ptmd_path), player, url, video_id, 'metadata') + ptmd_url, video_id, 'metadata', api_token, referrer) + + content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] formats = [] track_uris = set() @@ -140,7 +101,7 @@ class ZDFIE(ZDFBaseIE): continue for track in tracks: self._extract_format( - video_id, formats, track_uris, { + content_id, formats, track_uris, { 'url': track.get('uri'), 'type': f.get('type'), 'mimeType': f.get('mimeType'), @@ -149,6 +110,103 @@ class ZDFIE(ZDFBaseIE): }) self._sort_formats(formats) + duration = float_or_none(try_get( + ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) + + return { + 'extractor_key': ZDFIE.ie_key(), + 'id': content_id, + 'duration': duration, + 'formats': formats, + 'subtitles': self._extract_subtitles(ptmd), + } + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': '151025_magie_farben2_tex', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html + 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'ngplayer_2_4') + + info = self._extract_ptmd( + urljoin(url, ptmd_path), video_id, player['apiToken'], url) + thumbnails = [] layouts = try_get( content, lambda x: x['teaserImageRef']['layouts'], dict) @@ -169,33 +227,33 @@ class ZDFIE(ZDFBaseIE): }) thumbnails.append(thumbnail) - return { - 'id': video_id, + return merge_dicts(info, { 'title': title, 'description': content.get('leadParagraph') or content.get('teasertext'), 'duration': int_or_none(t.get('duration')), 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(ptmd), - 'formats': formats, - } + }) def _extract_regular(self, url, player, video_id): content = self._call_api( - player['content'], player, url, video_id, 'content') + player['content'], video_id, 'content', player['apiToken'], url) return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): - document = self._download_json( + video = self._download_json( 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id)['document'] + video_id) + + document = video['document'] title = document['titel'] + content_id = document['basename'] formats = [] format_urls = set() for f in document['formitaeten']: - self._extract_format(video_id, formats, format_urls, f) + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -213,12 +271,12 @@ class ZDFIE(ZDFBaseIE): }) return { - 'id': video_id, + 'id': content_id, 'title': title, 'description': document.get('beschreibung'), 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(try_get( - document, lambda x: x['meta']['editorialDate'], compat_str)), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), 'thumbnails': thumbnails, 'subtitles': self._extract_subtitles(document), 'formats': formats, From 0002888627b40264994e8a37fc3a17cbd3551af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:16:41 +0700 Subject: [PATCH 250/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2912d776c..07f26f2cf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Extractors +* [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, + #27930, #28198, #28199, #28274) + * Generalize cross-extractor video ids for zdf based extractors + * Improve extraction + * Fix 3sat and phoenix +* [stretchinternet] Fix extraction (#28297) +* [urplay] Fix episode data extraction (#28292) ++ [bandaichannel] Add support for b-ch.com (#21404) +* [srgssr] Improve extraction (#14717, #14725, #27231, #28238) + + Extract subtitle + * Fix extraction for new videos + * Update srf download domains +* [vvvvid] Reduce season request payload size ++ [vvvvid] Extract series sublists playlist title (#27601, #27618) ++ [dplay] Extract Ad-Free uplynk URLs (#28160) ++ [wat] Detect DRM protected videos (#27958) +* [tf1] Improve extraction (#27980, #28040) +* [tmz] Fix and improve extraction (#24603, #24687, 28211) ++ [gedidigital] Add support for Gedi group sites (#7347, #26946) +* [youtube] Fix get_video_info request + + version 2021.02.22 Core From 7c06216abff092d43e47b584699d435c40a8115e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:19:42 +0700 Subject: [PATCH 251/860] release 2021.03.02 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 60879f0ac..9544eaa6c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.22 + [debug] youtube-dl version 2021.03.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index b38d39ab4..c32ebdf56 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 3235de44b..2b5e0f08f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a3255623a..13a54982e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.22 + [debug] youtube-dl version 2021.03.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 124b020c3..dbca582ee 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 07f26f2cf..fbf97a582 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.02 Extractors * [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2452c1f7f..2c00ec406 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -82,6 +82,7 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 + - **bandaichannel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -330,6 +331,7 @@ - **Gaskrank** - **Gazeta** - **GDCVault** + - **GediDigital** - **generic**: Generic downloader that works on some sites - **Gfycat** - **GiantBomb** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f89530293..bfe98aa9f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.22' +__version__ = '2021.03.02' From e465b25c1fb0e72b97a032220399d4a959662095 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 24 Feb 2021 11:52:30 +0000 Subject: [PATCH 252/860] [bbc] add support for BBC Reel videos(closes #21870, closes #23660, closes #28268) --- youtube_dl/extractor/bbc.py | 59 ++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b4daee54e..a0c557929 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -5,10 +5,15 @@ import itertools import re from .common import InfoExtractor +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_urlparse, +) from ..utils import ( + ExtractorError, clean_html, dict_get, - ExtractorError, float_or_none, get_element_by_class, int_or_none, @@ -21,11 +26,6 @@ from ..utils import ( urlencode_postdata, urljoin, ) -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_urlparse, -) class BBCCoUkIE(InfoExtractor): @@ -793,6 +793,20 @@ class BBCIE(BBCCoUkIE): 'description': 'Learn English words and phrases from this story', }, 'add_ie': [BBCCoUkIE.ie_key()], + }, { + # BBC Reel + 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', + 'info_dict': { + 'id': 'p07c6sb9', + 'ext': 'mp4', + 'title': 'How positive thinking is harming your happiness', + 'alt_title': 'The downsides of positive thinking', + 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'duration': 235, + 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'upload_date': '20190604', + 'categories': ['Psychology'], + }, }] @classmethod @@ -980,6 +994,37 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) + initial_data = self._parse_json(self._html_search_regex( + r']+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P(?:(?!\2).)+)', + webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False) + if initial_data: + init_data = try_get( + initial_data, lambda x: x['initData']['items'][0], dict) or {} + smp_data = init_data.get('smpData') or {} + clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {} + version_id = clip_data.get('versionID') + if version_id: + title = smp_data['title'] + formats, subtitles = self._download_media_selector(version_id) + self._sort_formats(formats) + image_url = smp_data.get('holdingImageURL') + display_date = init_data.get('displayDate') + topic_title = init_data.get('topicTitle') + + return { + 'id': version_id, + 'title': title, + 'formats': formats, + 'alt_title': init_data.get('shortTitle'), + 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None, + 'description': smp_data.get('summary') or init_data.get('shortSummary'), + 'upload_date': display_date.replace('-', '') if display_date else None, + 'subtitles': subtitles, + 'duration': int_or_none(clip_data.get('duration')), + 'categories': [topic_title] if topic_title else None, + } + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # There are several setPayload calls may be present but the video # seems to be always related to the first one @@ -1041,7 +1086,7 @@ class BBCIE(BBCCoUkIE): thumbnail = None image_url = current_programme.get('image_url') if image_url: - thumbnail = image_url.replace('{recipe}', '1920x1920') + thumbnail = image_url.replace('{recipe}', 'raw') return { 'id': programme_id, 'title': title, From e1adb3ed4fc911a8177280fe87109e7b54a52fa2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Mar 2021 11:21:49 +0100 Subject: [PATCH 253/860] [bbc] correct catched exception type --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a0c557929..92e6f1bea 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -943,7 +943,7 @@ class BBCIE(BBCCoUkIE): else: entry['title'] = info['title'] entry['formats'].extend(info['formats']) - except Exception as e: + except ExtractorError as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) From 8f56907afa693290a6b2e05fb7ffc2f15dca33e2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Mar 2021 12:04:31 +0100 Subject: [PATCH 254/860] [9c9media] fix extraction for videos with multiple ContentPackages(closes #28309) --- youtube_dl/extractor/ninecninemedia.py | 4 +--- youtube_dl/extractor/rds.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index a569c889e..cfc220314 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -23,11 +23,9 @@ class NineCNineMediaIE(InfoExtractor): destination_code, content_id = re.match(self._VALID_URL, url).groups() api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) content = self._download_json(api_base_url, content_id, query={ - '$include': '[Media,Season,ContentPackages]', + '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]', }) title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 8c016a77d..0c497856e 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -15,17 +15,17 @@ class RDSIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ - 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + # has two 9c9media ContentPackages, the web player selects the first ContentPackage + 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', 'info_dict': { - 'id': '604333', - 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'id': '2083309', + 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', 'ext': 'flv', - 'title': 'Fowler Jr. prend la direction de Jacksonville', - 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', - 'timestamp': 1430397346, - 'upload_date': '20150430', - 'duration': 154.354, - 'age_limit': 0, + 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', + 'description': 'md5:83fa38ecc4a79b19e433433254077f25', + 'timestamp': 1606129030, + 'upload_date': '20201123', + 'duration': 773.039, } }, { 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', From 061c03013311eff75ac381cb4060204ce91b2510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:42:59 +0700 Subject: [PATCH 255/860] [youtube:tab] Switch continuation to browse API (closes #28289, closes #28327) Until further investigation. --- youtube_dl/extractor/youtube.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2496d27f1..eb5a58807 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2478,24 +2478,37 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): headers = { 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20201112.04.01', + 'content-type': 'application/json', } if identity_token: headers['x-youtube-identity-token'] = identity_token + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + } + for page_num in itertools.count(1): if not continuation: break + data['continuation'] = continuation['continuation'] + data['clickTracking'] = { + 'clickTrackingParams': continuation['itct'] + } count = 0 retries = 3 while count <= retries: try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d%s' - % (page_num, ' (retry #%d)' % count if count else ''), - headers=headers, query=continuation) + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, data=json.dumps(data).encode('utf8')) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -2503,9 +2516,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if count <= retries: continue raise - if not browse: - break - response = try_get(browse, lambda x: x[1]['response'], dict) if not response: break From 8c9766f4bf78ca777e8de9d4809584d8e88098ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:44:49 +0700 Subject: [PATCH 256/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index fbf97a582..366d322f5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version + +Extractors +* [youtube:tab] Switch continuation to browse API (#28289, #28327) +* [9c9media] Fix extraction for videos with multiple ContentPackages (#28309) ++ [bbc] Add support for BBC Reel videos (#21870, #23660, #28268) + + version 2021.03.02 Extractors From f68692b004f1c65f08a9a7d9c2ee4ab2ec255ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:47:34 +0700 Subject: [PATCH 257/860] release 2021.03.03 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 9544eaa6c..a8eba3214 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.02 + [debug] youtube-dl version 2021.03.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c32ebdf56..7d59a9f2d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 2b5e0f08f..523408f03 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 13a54982e..6e9e094e4 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.02 + [debug] youtube-dl version 2021.03.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbca582ee..46af4e420 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 366d322f5..238ca3965 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.03 Extractors * [youtube:tab] Switch continuation to browse API (#28289, #28327) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bfe98aa9f..a1c68e384 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.02' +__version__ = '2021.03.03' From ec64ec9651848e9173ec033a9a27809e4b5063bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Mar 2021 12:41:49 +0100 Subject: [PATCH 258/860] [voxmedia] fix volume embed extraction(closes #28338) --- youtube_dl/extractor/voxmedia.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + try_get, + unified_timestamp, ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE): setup = self._parse_json(self._search_regex( r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) - video_data = setup.get('video') or {} + player_setup = setup.get('player_setup') or setup + video_data = player_setup.get('video') or {} + formatted_metadata = video_data.get('formatted_metadata') or {} info = { 'id': video_id, - 'title': video_data.get('title_short'), + 'title': player_setup.get('title') or video_data.get('title_short'), 'description': video_data.get('description_long') or video_data.get('description_short'), - 'thumbnail': video_data.get('brightcove_thumbnail') + 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), + 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')), } - asset = setup.get('asset') or setup.get('params') or {} + asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {} formats = [] hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE): if formats: self._sort_formats(formats) info['formats'] = formats + info['duration'] = int_or_none(asset.get('duration')) return info for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor): }, { # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68', 'info_dict': { 'id': 'Gy8Md3Eky38', 'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'TheVerge', 'upload_date': '20141021', 'uploader': 'The Verge', + 'timestamp': 1413907200, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor): # Volume embed, Youtube 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': '22986359b', 'ext': 'mp4', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', 'upload_date': '20150915', - 'uploader': 'Vox', + 'timestamp': 1442332800, + 'duration': 285, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + 'timestamp': 1402938000, + 'upload_date': '20140616', + 'duration': 4114, }, 'add_ie': ['VoxMediaVolume'], }] From b8b622fbebb158db95edb05a8cc248668194b430 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Mar 2021 17:57:16 +0100 Subject: [PATCH 259/860] [trovo] Add Origin header to VOD formats(closes #28346) --- youtube_dl/extractor/trovo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py index 43745213d..de0107aa9 100644 --- a/youtube_dl/extractor/trovo.py +++ b/youtube_dl/extractor/trovo.py @@ -153,6 +153,7 @@ class TrovoVodIE(TrovoBaseIE): 'protocol': 'm3u8_native', 'tbr': int_or_none(play_info.get('bitrate')), 'url': play_url, + 'http_headers': {'Origin': 'https://trovo.live'}, }) self._sort_formats(formats) From 7f064d50db957d551dccde73ab73318f53ab3b17 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 7 Mar 2021 08:32:37 +0100 Subject: [PATCH 260/860] [cbs] add support for Paramount+ (closes #28342) --- youtube_dl/extractor/cbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 4a19a73d2..c79e55a75 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE): }, { 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, }] def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): From c6a14755bb9629967fb12536ee8660ca67ff4345 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 8 Mar 2021 16:53:50 +0100 Subject: [PATCH 261/860] [bilibili] fix video info extraction(closes #28341) --- youtube_dl/extractor/bilibili.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4dc597e16..589fdc1ce 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -156,6 +156,7 @@ class BiliBiliIE(InfoExtractor): cid = js['result']['cid'] headers = { + 'Accept': 'application/json', 'Referer': url } headers.update(self.geo_verification_headers()) From 7dc513487fb0babb5257fa72df87c3f24967f2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 02:54:10 +0700 Subject: [PATCH 262/860] [pornhub] Extract formats from get_media end point (#28395) --- youtube_dl/extractor/pornhub.py | 40 +++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b7631e4e1..fdf8b1b0d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -394,6 +394,21 @@ class PornHubIE(PornHubBaseIE): upload_date = None formats = [] + + def add_format(format_url, height=None): + tbr = None + mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( @@ -410,18 +425,19 @@ class PornHubIE(PornHubBaseIE): video_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - 'tbr': tbr, - }) + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) self._sort_formats(formats) video_uploader = self._html_search_regex( From 1a1ccd9a6e8e9e90ab129e89c2524ab3eb9ed2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 02:56:01 +0700 Subject: [PATCH 263/860] [pornhub] Detect flagged videos --- youtube_dl/extractor/pornhub.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fdf8b1b0d..2a7818e41 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -265,7 +266,8 @@ class PornHubIE(PornHubBaseIE): webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)
    ', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) From 477bff69065872fff6bab5c3a1b0512018fbb6eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:36:31 +0700 Subject: [PATCH 264/860] Introduce release_timestamp meta field (refs #28386) --- youtube_dl/YoutubeDL.py | 20 ++++++++++++-------- youtube_dl/extractor/common.py | 4 +++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ecac31f7a..8f65c6499 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1511,14 +1511,18 @@ class YoutubeDL(object): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8eb110f4e..d3b6724df 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -230,8 +230,10 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video became available + (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. From 15c24b0346e3951b43dbf29631bfe65292f53ac5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:40:56 +0700 Subject: [PATCH 265/860] [lbry] Extract release_timestamp (closes #28386) --- youtube_dl/extractor/lbry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 413215a99..95782366b 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -60,6 +60,7 @@ class LBRYBaseIE(InfoExtractor): 'description': stream_value.get('description'), 'license': stream_value.get('license'), 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), 'tags': stream_value.get('tags'), 'duration': int_or_none(media.get('duration')), 'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -92,6 +93,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', 'width': 1280, 'height': 720, } @@ -106,6 +109,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', 'tags': list, 'duration': 2570, 'channel': 'The LBRY Foundation', From bae7dbf78be3a03d8454d1b17bfdbf1bfa0de715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:41:21 +0700 Subject: [PATCH 266/860] [bandcamp] Extract release_timestamp --- youtube_dl/extractor/bandcamp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, From 64ed3af328929f83b02ed57df5cb4f863fdd0389 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 10 Mar 2021 11:45:30 +0100 Subject: [PATCH 267/860] [lbry] add support for channel filters(closes #28385) --- youtube_dl/extractor/lbry.py | 46 ++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 95782366b..ae43d56ea 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -6,8 +6,10 @@ import json from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, @@ -186,17 +188,18 @@ class LBRYChannelIE(LBRYBaseIE): }] _PAGE_SIZE = 50 - def _fetch_page(self, claim_id, url, page): + def _fetch_page(self, claim_id, url, params, page): page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) result = self._call_api_proxy( - 'claim_search', claim_id, { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - 'stream_types': self._SUPPORTED_STREAM_TYPES, - }, 'page %d' % page) + 'claim_search', claim_id, page_params, 'page %d' % page) for item in (result.get('items') or []): stream_claim_name = item.get('name') stream_claim_id = item.get('claim_id') @@ -217,8 +220,31 @@ class LBRYChannelIE(LBRYBaseIE): result = self._resolve_url( 'lbry://' + display_id, display_id, 'channel') claim_id = result['claim_id'] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url), + functools.partial(self._fetch_page, claim_id, url, params), self._PAGE_SIZE) result_value = result.get('value') or {} return self.playlist_result( From fc2c6d53239d4b4a6bac5383441152117ccf3c6f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 10 Mar 2021 13:16:21 +0100 Subject: [PATCH 268/860] [shahid] fix format extraction(closes #28383) --- youtube_dl/extractor/shahid.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 5c2a6206b..b5e093bd2 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', 'info_dict': { - 'id': '275286', + 'id': '816924', 'ext': 'mp4', - 'title': 'مجلس الشباب الموسم 1 كليب 1', - 'timestamp': 1506988800, - 'upload_date': '20171003', + 'title': 'متحف الدحيح الموسم 1 كليب 1', + 'timestamp': 1602806400, + 'upload_date': '20201016', + 'description': 'برومو', + 'duration': 22, + 'categories': ['كوميديا'], }, 'params': { # m3u8 download @@ -109,12 +112,15 @@ class ShahidIE(ShahidBaseIE): page_type = 'episode' playout = self._call_api( - 'playout/url/' + video_id, video_id)['playout'] + 'playout/new/url/' + video_id, video_id)['playout'] if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) - formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + formats = self._extract_m3u8_formats(re.sub( + # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html + r'aws\.manifestfilter=[\w:;,-]+&?', + '', playout['url']), video_id, 'mp4') self._sort_formats(formats) # video = self._call_api( From 9c644a641922e5ac3b5b4a1c9386fa599973e885 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 09:51:01 +0100 Subject: [PATCH 269/860] [fujitv] fix HLS formats extension(closes #28416) --- youtube_dl/extractor/fujitv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/youtube_dl/extractor/fujitv.py +++ b/youtube_dl/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: From 43d986acd8bf7247725fc9de34648c0eda560daf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:14:28 +0100 Subject: [PATCH 270/860] [tver] improve title extraction(closes #28418) --- youtube_dl/extractor/tver.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 931d4d650..a54f49319 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, + strip_or_none, try_get, ) @@ -25,6 +26,10 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/episode/79622438', 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -47,8 +52,12 @@ class TVerIE(InfoExtractor): } if service == 'cx': + title = main['title'] + subtitle = strip_or_none(main.get('subtitle')) + if subtitle: + title += ' - ' + subtitle info.update({ - 'title': main.get('subtitle') or main['title'], + 'title': title, 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), 'ie_key': 'FujiTVFODPlus7', }) From ef414343e5fa2bc4fddae3097ecde5a8e32c2d4c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:48:58 +0100 Subject: [PATCH 271/860] [peertube] improve thumbnail extraction(closes #28419) --- youtube_dl/extractor/peertube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 32ff51653..d9b13adc2 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -599,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -621,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } From 1182f9567b86f2af747cdb8769ab87649c8ce4c2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 18:11:11 +0100 Subject: [PATCH 272/860] [pinterest] reduce the number of HLS format requests --- youtube_dl/extractor/pinterest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index b249c9eda..42528d746 100644 --- a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': From 60845121ca2f49172e7cd941c0cb43363cb86e46 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Mar 2021 15:19:24 +0100 Subject: [PATCH 273/860] [sportdeutschland] fix extraction(closes #21856)(closes #28425) --- youtube_dl/extractor/sportdeutschland.py | 147 +++++++++++++---------- 1 file changed, 85 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -1,82 +1,105 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + clean_html, + float_or_none, + int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, + try_get, ) class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' + _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': '5318cac0275701382770543d7edaf0a0', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], - 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', + 'duration': 16106.36, }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sport_id = mobj.group('sport') - - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( - sport_id, video_id) - req = sanitized_Request(api_url, headers={ - 'Accept': 'application/vnd.vidibus.v2.html+json', - 'Referer': url, - }) - data = self._download_json(req, video_id) - + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) - - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) - else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info From 1860d0f41cf50b1a0876174c4e1ee7adbbd4a0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:26:54 +0700 Subject: [PATCH 274/860] [southpark] Fix extraction and add support for southparkstudios.com (closes #26763, closes #28413) --- youtube_dl/extractor/southpark.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index da75a43a7..0774da06e 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, }] + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' From b509d24b2fef8d5994b7d925db51befcbbf996fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:36:11 +0700 Subject: [PATCH 275/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 238ca3965..924d202b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ Introduce release_timestamp meta field (#28386) + +Extractors ++ [southpark] Add support for southparkstudios.com (#28413) +* [southpark] Fix extraction (#26763, #28413) +* [sportdeutschland] Fix extraction (#21856, #28425) +* [pinterest] Reduce the number of HLS format requests +* [peertube] Improve thumbnail extraction (#28419) +* [tver] Improve title extraction (#28418) +* [fujitv] Fix HLS formats extension (#28416) +* [shahid] Fix format extraction (#28383) ++ [lbry] Add support for channel filters (#28385) ++ [bandcamp] Extract release timestamp ++ [lbry] Extract release timestamp (#28386) +* [pornhub] Detect flagged videos ++ [pornhub] Extract formats from get_media end point (#28395) +* [bilibili] Fix video info extraction (#28341) ++ [cbs] Add support for Paramount+ (#28342) ++ [trovo] Add Origin header to VOD formats (#28346) +* [voxmedia] Fix volume embed extraction (#28338) + + version 2021.03.03 Extractors From ebfd66c4b1d6ffabd8a5bc52737f2bacac341d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:38:16 +0700 Subject: [PATCH 276/860] release 2021.03.14 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index a8eba3214..9cc79eff4 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7d59a9f2d..3296e44a8 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 523408f03..f74c29736 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 6e9e094e4..ae9e273ef 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 46af4e420..04fbea2f6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 924d202b0..73fe316b9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.14 Core + Introduce release_timestamp meta field (#28386) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a1c68e384..5a540119c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.03' +__version__ = '2021.03.14' From 9955bb4a2704f98b74a448c82dfd690ec6775b8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 14:49:23 +0100 Subject: [PATCH 277/860] [rtve] improve extraction - extract all formats - fix RTVE Infantil extraction(closes #24851) - extract is_live and series --- youtube_dl/extractor/rtve.py | 232 ++++++++++++++++------------------- 1 file changed, 104 insertions(+), 128 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' not in video_url: - video_url = video_url.replace('resources/', 'auth/resources/') - video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': video_url, - }) - self._sort_formats(formats) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) subtitles = None - if info.get('sbtFile') is not None: - subtitles = self.extract_subtitles(video_id, info['sbtFile']) + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': info.get('image'), - 'page_url': url, 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), scale=1000), + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), } def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P[^/]*)/video/(?P[^/]*)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '915319587b33720b8e0357caaa6617e6', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', 'info_dict': { 'id': '3040283', 'ext': 'mp4', 'title': 'Maneras de vivir', - 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', 'duration': 357.958, }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }] - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - webpage = self._download_webpage(url, video_id) - vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - - return { - 'id': video_id, - 'ext': 'mp4', - 'title': info['title'], - 'url': video_url, - 'thumbnail': info.get('image'), - 'duration': float_or_none(info.get('duration'), scale=1000), - } - - -class RTVELiveIE(InfoExtractor): +class RTVELiveIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor): 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', }, 'params': { 'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - start_time = time.gmtime() video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') title = remove_start(title, 'Estoy viendo ') - title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( (r'playerId=player([0-9]+)', r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', r'data-id=["\'](\d+)'), webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - m3u8_url = _decrypt_url(png) - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'formats': formats, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } From 3be098010f667b14075e3dfad1e74e5e2becc8ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 20:08:46 +0100 Subject: [PATCH 278/860] [applepodcasts] fix extraction(closes #28445) --- youtube_dl/extractor/applepodcasts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} From 357bfe251d7f4f8bb9319bc6531a3813b5a355a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Mar 2021 20:42:20 +0700 Subject: [PATCH 279/860] [svtplay] Improve extraction (closes #28448) --- youtube_dl/extractor/svt.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 4acc29fce..aba9bb447 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) + (?:.*?modalId=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ - 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'url': 'https://www.svtplay.se/video/30479064', 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': 'jNwpV9P', + 'id': '8zVbDPA', 'ext': 'mp4', - 'title': 'Det här är himlen', - 'timestamp': 1586044800, - 'upload_date': '20200405', - 'duration': 3515, + 'title': 'Designdrömmar i Stenungsund', + 'timestamp': 1615770000, + 'upload_date': '20210315', + 'duration': 3519, 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { @@ -173,6 +174,9 @@ class SVTPlayIE(SVTPlayBaseIE): # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B 'skip_download': True, }, + }, { + 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -219,7 +223,8 @@ class SVTPlayIE(SVTPlayBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id, svt_id = mobj.group('id', 'svt_id') + video_id = mobj.group('id') + svt_id = mobj.group('svt_id') or mobj.group('modal_id') if svt_id: return self._extract_by_video_id(svt_id) @@ -254,6 +259,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From f912d6c8cf5a68d576abe4426e12554c3404a7dd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Mar 2021 21:43:53 +0100 Subject: [PATCH 280/860] [mlb] fix video extracion(#21241) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/mlb.py | 189 +++++++++++++++++++++++++---- 2 files changed, 172 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 07a8af055..c2f67323b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,7 +673,10 @@ from .mixcloud import ( MixcloudUserIE, MixcloudPlaylistIE, ) -from .mlb import MLBIE +from .mlb import ( + MLBIE, + MLBVideoIE, +) from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index b907f6b49..b69301d97 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,15 +1,91 @@ from __future__ import unicode_literals -from .nhl import NHLBaseIE +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + try_get, +) -class MLBIE(NHLBaseIE): +class MLBBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + video = self._download_video_data(display_id) + video_id = video['id'] + title = video['title'] + feed = self._get_feed(video) + + formats = [] + for playback in (feed.get('playbacks') or []): + playback_url = playback.get('url') + if not playback_url: + continue + name = playback.get('name') + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=name, fatal=False)) + else: + f = { + 'format_id': name, + 'url': playback_url, + } + mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name) + if mobj: + f.update({ + 'height': int(mobj.group(3)), + 'tbr': int(mobj.group(1)), + 'width': int(mobj.group(2)), + }) + mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url) + if mobj: + f.update({ + 'fps': int(mobj.group(3)), + 'height': int(mobj.group(2)), + 'tbr': int(mobj.group(4)), + 'width': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnails = [] + for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): + src = cut.get('src') + if not src: + continue + thumbnails.append({ + 'height': int_or_none(cut.get('height')), + 'url': src, + 'width': int_or_none(cut.get('width')), + }) + + language = (video.get('language') or 'EN').lower() + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': video.get('description'), + 'duration': parse_duration(feed.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)), + 'subtitles': self._extract_mlb_subtitles(feed, language), + } + + +class MLBIE(MLBBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*(?Pmlb)\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:[^/]+/)*c-| + (?:[^/]+/)*video/[^/]+/c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE): (?P\d+) ) ''' - _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -76,18 +151,6 @@ class MLBIE(NHLBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - { - 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', - 'md5': 'e09e37b552351fddbf4d9e699c924d68', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429139220, - 'upload_date': '20150415', - } - }, { 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', 'only_matching': True, @@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE): 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, - { - 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', - 'only_matching': True, - } ] + _TIMESTAMP_KEY = 'date' + + @staticmethod + def _get_feed(video): + return video + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for keyword in (feed.get('keywordsAll') or []): + keyword_type = keyword.get('type') + if keyword_type and keyword_type.startswith('closed_captions_location_'): + cc_location = keyword.get('value') + if cc_location: + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + return subtitles + + def _download_video_data(self, display_id): + return self._download_json( + 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + display_id) + + +class MLBVideoIE(MLBBaseIE): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', + 'info_dict': { + 'id': 'c04a8863-f569-42e6-9f87-992393657614', + 'ext': 'mp4', + 'title': "Ackley's spectacular catch", + 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', + 'duration': 66, + 'timestamp': 1405995000, + 'upload_date': '20140722', + 'thumbnail': r're:^https?://.+', + }, + } + _TIMESTAMP_KEY = 'timestamp' + + @classmethod + def suitable(cls, url): + return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + + @staticmethod + def _get_feed(video): + return video['feeds'][0] + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for cc_location in (feed.get('closedCaptions') or []): + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + + def _download_video_data(self, display_id): + # https://www.mlb.com/data-service/en/videos/[SLUG] + return self._download_json( + 'https://fastball-gateway.mlb.com/graphql', + display_id, query={ + 'query': '''{ + mediaPlayback(ids: "%s") { + description + feeds(types: CMS) { + closedCaptions + duration + image { + cuts { + width + height + src + } + } + playbacks { + name + url + } + } + id + timestamp + title + } +}''' % display_id, + })['data']['mediaPlayback'][0] From fa6bf0a7112e83d36567072985d56440bb34de72 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Mar 2021 12:37:22 +0100 Subject: [PATCH 281/860] [vvvvid] fix kenc format extraction(closes #28473) --- youtube_dl/extractor/vvvvid.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 7c94c4ee2..bc196f8a0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -182,17 +182,20 @@ class VVVVIDIE(InfoExtractor): if not embed_code: continue embed_code = ds(embed_code) - if video_type in ('video/rcs', 'video/kenc'): - if video_type == 'video/kenc': - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) + if video_type == 'video/kenc': + embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' + ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': formats.extend(self._extract_akamai_formats(embed_code, video_id)) elif video_type == 'video/youtube': info.update({ From 7e79ba7dd6e6649dd2ce3a74004b2044f2182881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Mar 2021 05:45:36 +0700 Subject: [PATCH 282/860] [vimeo:album] Fix extraction for albums with number of videos multiple to page size (closes #28486) --- youtube_dl/extractor/vimeo.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd2663fe0..955651bec 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -939,11 +939,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): } if hashed_pass: query['_hashed_pass'] = hashed_pass - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return for video in videos: link = video.get('link') if not link: From 21ccd0d7f46002acc61eb21bd0d4e492064c7fe1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Mar 2021 09:10:38 +0100 Subject: [PATCH 283/860] [tiktok] detect private videos(closes #28453) --- youtube_dl/extractor/tiktok.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index ea1beb8af..4faa6de54 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( + page_props = self._parse_json(self._search_regex( r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s* Date: Mon, 22 Mar 2021 14:56:58 +0100 Subject: [PATCH 284/860] [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) Co-authored-by: Sergey M --- youtube_dl/extractor/vgtv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index fe7a26b62..22e99e8f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -23,6 +23,8 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect 'tv.aftonbladet.se/abtv': 'abtv', 'www.aftonbladet.se/tv': 'abtv', } @@ -140,6 +142,10 @@ class VGTVIE(XstreamIE): 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', 'only_matching': True, }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, From 8117d613acdd0a2874e52bfa52c3574f46e3a4fb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Mar 2021 15:58:44 +0100 Subject: [PATCH 285/860] [zingmp3] fix extraction(closes #11589, closes #16409, closes #16968, closes #27205) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/zingmp3.py | 212 ++++++++++++++++------------- 2 files changed, 119 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c2f67323b..8b55947f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1624,5 +1624,8 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zhihu import ZhihuIE -from .zingmp3 import ZingMp3IE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, +) from .zype import ZypeIE diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index adfdcaabf..207c04f5e 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,93 +1,94 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - update_url_query, ) -class ZingMp3BaseInfoExtractor(InfoExtractor): +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P\w+)\.html' + _GEO_COUNTRIES = ['VN'] - def _extract_item(self, item, page_type, fatal=True): - error_message = item.get('msg') - if error_message: - if not fatal: - return - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] formats = [] - for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): - if not source_url or source_url == 'require vip': + for k, v in (item.get('source') or {}).items(): + if not v: continue - if not re.match(r'https?://', source_url): - source_url = '//' + source_url - source_url = self._proto_relative_url(source_url, 'http:') - quality_num = int_or_none(quality) - f = { - 'format_id': quality, - 'url': source_url, - } - if page_type == 'video': - f.update({ - 'height': quality_num, - 'ext': 'mp4', - }) + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) else: - f.update({ - 'abr': quality_num, + formats.append({ 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', }) - formats.append(f) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(msg, expected=True) + self._sort_formats(formats) - cover = item.get('cover') + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } + + album = item.get('album') or {} return { - 'title': (item.get('name') or item.get('title')).strip(), + 'id': item_id, + 'title': title, 'formats': formats, - 'thumbnail': 'http:/' + cover if cover else None, - 'artist': item.get('artist'), + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), } - def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): - player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') - items = player_json['data'] - if 'item' in items: - items = items['item'] - - if len(items) == 1: - # one single song - data = self._extract_item(items[0], page_type) - data['id'] = id - - return data - else: - # playlist of songs - entries = [] - - for i, item in enumerate(items, 1): - entry = self._extract_item(item, page_type, fatal=False) - if not entry: - continue - entry['id'] = '%s-%d' % (id, i) - entries.append(entry) - - return { - '_type': 'playlist', - 'id': id, - 'title': playlist_title, - 'entries': entries, - } + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) -class ZingMp3IE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P\w+)\.html' +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', }, }, { - 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', - 'md5': '870295a9cd8045c0e15663565902618d', + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', 'info_dict': { - 'id': 'ZW6BAEA0', - 'title': 'Let It Go (Frozen OST)', + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', }, }, { - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', - }, - 'playlist_count': 10, - 'skip': 'removed at the request of the owner', - }, { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'only_matching': True, }] IE_NAME = 'zingmp3' IE_DESC = 'mp3.zing.vn' - def _real_extract(self, url): - page_id = self._match_id(url) + def _process_data(self, data): + return self._extract_item(data, True) - webpage = self._download_webpage(url, page_id) - player_json_url = self._search_regex([ - r'data-xml="([^"]+)', - r'&xmlURL=([^&]+)&' - ], webpage, 'player xml url') +class ZingMp3AlbumIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' + _TESTS = [{ + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái', + }, + 'playlist_count': 10, + }, { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + }, { + 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3:album' - playlist_title = None - page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') - if page_type == 'video': - player_json_url = update_url_query(player_json_url, {'format': 'json'}) - else: - player_json_url = player_json_url.replace('/xml/', '/html5xml/') - if page_type == 'album': - playlist_title = self._og_search_title(webpage) - - return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) From 5208ae92fc3e2916cdccae45c6b9a516be3d5796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 24 Mar 2021 02:57:35 +0700 Subject: [PATCH 286/860] [youtube] Fix default value for youtube_include_dash_manifest (closes #28523) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5a58807..badca3977 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1617,7 +1617,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_id'] = itag formats.append(f) - if self._downloader.params.get('youtube_include_dash_manifest'): + if self._downloader.params.get('youtube_include_dash_manifest', True): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats( From a40002444e64957594a1305bb2740fddb477beeb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 15:10:19 +0100 Subject: [PATCH 287/860] [bbc] fix BBC IPlayer Episodes/Group extraction(closes #28360) --- youtube_dl/extractor/bbc.py | 205 ++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 3 +- 2 files changed, 192 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 92e6f1bea..e8d000bbb 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,17 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor from ..compat import ( compat_etree_Element, compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, dict_get, float_or_none, @@ -811,7 +816,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -1338,21 +1343,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1361,14 +1494,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'

    ([^<]+)

    ', webpage, 'title', fatal=False) - description = self._search_regex( - r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

    ', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b55947f6..e0fd0b648 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -95,7 +95,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, - BBCCoUkIPlayerPlaylistIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, BBCIE, ) From eafcadea261dba64c44c5c17ea8a47ac17256617 Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Wed, 24 Mar 2021 23:33:19 +0900 Subject: [PATCH 288/860] [extractor] escape forgotten dot for hostnames in regular expression (#28530) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/mtv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2b1b3bdf..f99d887ca 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2953,7 +2953,7 @@ class GenericIE(InfoExtractor): webpage) if not mobj: mobj = re.search( - r'data-video-link=["\'](?Phttp://m.mlb.com/video/[^"\']+)', + r'data-video-link=["\'](?Phttp://m\.mlb\.com/video/[^"\']+)', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f5e30d22d..600cf2d89 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -320,7 +320,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') From d1069d33b4ad3987acc2452756459065ce635d68 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Sun, 12 Apr 2020 23:27:58 +0200 Subject: [PATCH 289/860] [zoom] Add new extractor(closes #16597, closes #27002, closes #28531) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zoom.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/zoom.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e0fd0b648..b2b39e4dd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1629,4 +1629,5 @@ from .zingmp3 import ( ZingMp3IE, ZingMp3AlbumIE, ) +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py new file mode 100644 index 000000000..db073d91d --- /dev/null +++ b/youtube_dl/extractor/zoom.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?Phttps?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self._downloader.params.get('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } From c2fbfb49da2657002fafcf4c609f8d91030a6ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 00:03:00 +0700 Subject: [PATCH 290/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 73fe316b9..1c3313280 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors ++ [zoom] Add support for zoom.us (#16597, #27002, #28531) +* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) +* [youtube] Fix default value for youtube_include_dash_manifest (#28523) +* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) ++ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) ++ [tiktok] Detect private videos (#28453) +* [vimeo:album] Fix extraction for albums with number of videos multiple + to page size (#28486) +* [vvvvid] Fix kenc format extraction (#28473) +* [mlb] Fix video extraction (#21241) +* [svtplay] Improve extraction (#28448) +* [applepodcasts] Fix extraction (#28445) +* [rtve] Improve extraction + + Extract all formats + * Fix RTVE Infantil extraction (#24851) + + Extract is_live and series + + version 2021.03.14 Core From 76da1c954aebba4af8def73dd2319fab2e27e50a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 00:04:10 +0700 Subject: [PATCH 291/860] release 2021.03.25 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 8 files changed, 19 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 9cc79eff4..7feb0298c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 3296e44a8..49e18173d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index f74c29736..a1486b133 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ae9e273ef..7eaf5a202 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 04fbea2f6..20042d98c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1c3313280..1b49e411a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.25 Extractors + [zoom] Add support for zoom.us (#16597, #27002, #28531) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2c00ec406..d2ad937a4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -90,7 +90,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV** - **Beatport** @@ -522,6 +523,7 @@ - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBVideo** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -1212,4 +1214,6 @@ - **ZDFChannel** - **Zhihu** - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5a540119c..e87b820fa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.14' +__version__ = '2021.03.25' From 8562218350a79d4709da8593bb0c538aa0824acf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 19:28:51 +0100 Subject: [PATCH 292/860] [ard] improve clip id extraction(#22724)(closes #28528) --- youtube_dl/extractor/ard.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 143fc51e9..d57c5ba0f 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -335,7 +335,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?PY3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -365,22 +365,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -410,7 +410,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -435,7 +435,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), From 605e7b5e47c60c3ed7c2ca71df4d6bbd49fa8a77 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 12:53:18 +0100 Subject: [PATCH 293/860] [youtube:tab] fix playlist/comunity continuation items extraction(closes #28266) --- youtube_dl/extractor/youtube.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index badca3977..faf3a344e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,6 +24,7 @@ from ..jsinterp import JSInterpreter from ..utils import ( ExtractorError, clean_html, + dict_get, float_or_none, int_or_none, mimetype2ext, @@ -2541,13 +2542,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation = self._extract_continuation(continuation_renderer) continue + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) continuation_items = try_get( - response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) if continuation_items: continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('gridVideoRenderer') + renderer = self._extract_grid_item_renderer(continuation_item) if renderer: grid_renderer = {'items': continuation_items} for entry in self._grid_entries(grid_renderer): @@ -2561,6 +2563,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(video_list_renderer) continue + renderer = continuation_item.get('backstagePostThreadRenderer') + if renderer: + continuation_renderer = {'contents': continuation_items} + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue break From c78591187080e7316c0042309fe956bfd0d38d30 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 17:06:57 +0100 Subject: [PATCH 294/860] [vimeo] fix unlisted video extraction(closes #28414) --- youtube_dl/extractor/vimeo.py | 39 ++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 955651bec..5800962ea 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -24,6 +24,7 @@ from ..utils import ( merge_dicts, OnDemandPagedList, parse_filesize, + parse_iso8601, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -278,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P[0-9]+) - (?:/[\da-f]+)? + (?:/(?P[\da-f]{10}))? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -577,11 +578,37 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - # Extract ID from URL - video_id = self._match_id(url) + video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + if unlisted_hash: + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + video = self._download_json( + 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), + video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + orig_url = url is_pro = 'vimeopro.com/' in url is_player = '://player.vimeo.com/video/' in url @@ -756,6 +783,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { From cc777dcaa0f3331626f33a7e4c61d804c43f4b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 17:37:45 +0100 Subject: [PATCH 295/860] [picarto] fix live stream extraction(closes #28532) --- youtube_dl/extractor/picarto.py | 96 ++++++++++++--------------------- 1 file changed, 35 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 8099ef1d6..e6c51e16b 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,22 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time - from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, - try_get, - update_url_query, - urlencode_postdata, ) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)(?:/(?P[a-zA-Z0-9]+))?' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + channel_id = self._match_id(url) - metadata = self._download_json( - 'https://api.picarto.tv/v1/channel/name/' + channel_id, - channel_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] - if metadata.get('online') is False: + if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] cdn_data = self._download_json( - 'https://picarto.tv/process/channel', channel_id, - data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Downloading load balancing info') + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') - token = mobj.group('token') or 'public' - params = { - 'con': int(time.time() * 1000), - 'token': token, - } - - prefered_edge = cdn_data.get('preferedEdge') formats = [] - - for edge in cdn_data['edges']: - edge_ep = edge.get('ep') - if not edge_ep or not isinstance(edge_ep, compat_str): + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: continue - edge_id = edge.get('id') - for tech in cdn_data['techs']: - tech_label = tech.get('label') - tech_type = tech.get('type') - preference = 0 - if edge_id == prefered_edge: - preference += 1 - format_id = [] - if edge_id: - format_id.append(edge_id) - if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': - format_id.append('hls') - formats.extend(self._extract_m3u8_formats( - update_url_query( - 'https://%s/hls/%s/index.m3u8' - % (edge_ep, channel_id), params), - channel_id, 'mp4', preference=preference, - m3u8_id='-'.join(format_id), fatal=False)) - continue - elif tech_type == 'video/mp4' or tech_label == 'MP4': - format_id.append('mp4') - formats.append({ - 'url': update_url_query( - 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), - params), - 'format_id': '-'.join(format_id), - 'preference': preference, - }) - else: - # rtmp format does not seem to work - continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) self._sort_formats(formats) mature = metadata.get('adult') @@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(metadata.get('title') or channel_id), + 'title': self._live_title(title.strip()), 'is_live': True, - 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), 'channel': channel_id, + 'channel_id': metadata.get('id'), 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, From 49fc0a567febda65709cc5154ff046684a3b8427 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 19:11:41 +0100 Subject: [PATCH 296/860] [youtube] fix video's channel extraction(closes #28562) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index faf3a344e..e48c5a7d2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1896,7 +1896,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['channel'] = get_text(try_get( vsir, lambda x: x['owner']['videoOwnerRenderer']['title'], - compat_str)) + dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], From 87a8bde7775ebc31175ebb111015b4052b50b7db Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Mar 2021 08:46:33 +0100 Subject: [PATCH 297/860] [sbs] add support for ondemand watch URLs(closes #28566) --- youtube_dl/extractor/sbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index f722528cd..0a806ee4e 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -43,6 +43,9 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', + 'only_matching': True, }] def _real_extract(self, url): From da762c4e329f6158956ddd51ac294e9183e5ce89 Mon Sep 17 00:00:00 2001 From: Chris Hranj Date: Mon, 29 Mar 2021 15:05:19 -0400 Subject: [PATCH 298/860] [instagram] Improve title extraction and extract duration (#28469) Co-authored-by: Sergey M. --- youtube_dl/extractor/instagram.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1eeddc3b6..12e10143c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, @@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by britneyspears', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') + title = media.get('title') thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') @@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor): continue entries.append({ 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, + 'title': node.get('title') or 'Video %d' % edge_num, 'url': node_video_url, 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 'view_count': int_or_none(node.get('video_view_count')), @@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor): 'id': video_id, 'formats': formats, 'ext': 'mp4', - 'title': 'Video by %s' % uploader_id, + 'title': title or 'Video by %s' % uploader_id, 'description': description, + 'duration': duration, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader_id': uploader_id, From 287e50b56b4c71da8fd0c3ffdeca9bff5ab0b005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Mar 2021 03:37:43 +0700 Subject: [PATCH 299/860] [francetvinfo] Improve video id extraction (closes #28584) --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3ca415077..7cc88bf18 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -399,7 +399,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), webpage, 'video id') return self._make_url_result(video_id) From 955894e72fd8d4fdce5d85fc006d548278e6d9eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 10:00:40 +0100 Subject: [PATCH 300/860] [vlive] fix inkey request(closes #28589) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e2f5d81b8..42da34d44 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR'} + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} if fields: query['fields'] = fields try: From 207bc35d348efdfdfe2bd7119e004a1acf0ab3d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:58:01 +0700 Subject: [PATCH 301/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1b49e411a..1297c19f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Extractors +* [vlive] Fix inkey request (#28589) +* [francetvinfo] Improve video id extraction (#28584) ++ [instagram] Extract duration (#28469) +* [instagram] Improve title extraction (#28469) ++ [sbs] Add support for ondemand watch URLs (#28566) +* [youtube] Fix video's channel extraction (#28562) +* [picarto] Fix live stream extraction (#28532) +* [vimeo] Fix unlisted video extraction (#28414) +* [youtube:tab] Fix playlist/community continuation items extraction (#28266) +* [ard] Improve clip id extraction (#22724, #28528) + + version 2021.03.25 Extractors From 8f493de9fb3a7f123bdf887163efa06ce9d6b051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:59:07 +0700 Subject: [PATCH 302/860] release 2021.03.31 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 7feb0298c..2ac4df8db 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 49e18173d..5ad5590bf 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index a1486b133..ea96c4c20 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7eaf5a202..ed3abd45c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20042d98c..c1067ee1f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1297c19f7..4c094b771 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.31 Extractors * [vlive] Fix inkey request (#28589) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e87b820fa..bcfdae23d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.25' +__version__ = '2021.03.31' From 28bab774a0df2c80b689c277390da8617131db35 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 21:44:41 +0100 Subject: [PATCH 303/860] [youtube] imporve age-restricted video extraction(#28578) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e48c5a7d2..6a92938a5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1432,7 +1432,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999', video_id, fatal=False) + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None if webpage: From b97fb2edac25182ff3dcf4cb8537517a1ec9e4de Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 31 Mar 2021 20:07:13 +0100 Subject: [PATCH 304/860] [vimeo] fix password protected review extraction(closes #27591) --- youtube_dl/extractor/vimeo.py | 64 ++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5800962ea..a90cf0630 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -75,25 +75,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') - def _verify_video_password(self, url, video_id, webpage): + def _get_video_password(self): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + return password + + def _verify_video_password(self, url, video_id, password, token, vuid): if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) self._set_vimeo_cookie('vuid', vuid) return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + url + '/password', video_id, 'Verifying the password', + 'Wrong password', data=urlencode_postdata({ + 'password': password, + 'token': token, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -332,9 +335,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '54469442', 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', - 'uploader': 'The BLN & Business of Software', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', - 'uploader_id': 'theblnbusinessofsoftware', + 'uploader': 'Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', + 'uploader_id': 'businessofsoftware', 'duration': 3610, 'description': None, }, @@ -469,6 +472,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Unable to download JSON metadata'], + 'skip': 'this page is no longer available.', }, { 'url': 'http://player.vimeo.com/video/68375962', @@ -551,9 +555,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return urls[0] if urls else None def _verify_player_video_password(self, url, video_id, headers): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + password = self._get_video_password() data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -697,7 +699,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if re.search(r']+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') - self._verify_video_password(redirect_url, video_id, webpage) + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) return self._real_extract( smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: @@ -1091,10 +1096,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - clip_data = self._download_json( - page_url.replace('/review/', '/review/data/'), - video_id)['clipData'] - config_url = clip_data['configUrl'] + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( From 14f29f087e6097feb46bdb84878924bc410a57eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:05:10 +0700 Subject: [PATCH 305/860] [youtube] Setup CONSENT cookie when needed (closes #28604) --- youtube_dl/extractor/youtube.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6a92938a5..b940c0bad 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -249,7 +249,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _real_initialize(self): + self._initialize_consent() if self._downloader is None: return if not self._login(): From e789bb1aa4cb627d3d7ca79a5e5daa8d2f58cda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:43:08 +0700 Subject: [PATCH 306/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4c094b771..ee2dc88bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version + +Extractors +* [youtube] Setup CONSENT cookie when needed (#28604) +* [vimeo] Fix password protected review extraction (#27591) +* [youtube] Improve age-restricted video extraction (#28578) + + version 2021.03.31 Extractors From ca304beb1538e54c5a18fdd50846ed2259d63b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:47:11 +0700 Subject: [PATCH 307/860] release 2021.04.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2ac4df8db..98ec799e8 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5ad5590bf..5387a6cd1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index ea96c4c20..945c80366 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ed3abd45c..0acc8b679 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c1067ee1f..42c3126a3 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index ee2dc88bc..4304ecd9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.01 Extractors * [youtube] Setup CONSENT cookie when needed (#28604) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bcfdae23d..0457d1a15 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.31' +__version__ = '2021.04.01' From 37488630703944b4f2bda84a26391ae61d29e15b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 11:50:30 +0100 Subject: [PATCH 308/860] [youtube:tab] Add support for hashtag videos extraction(closes #28308) --- youtube_dl/extractor/youtube.py | 137 ++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b940c0bad..1f5497e24 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1959,7 +1959,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): invidio\.us )/ (?: - (?:channel|c|user|feed)/| + (?:channel|c|user|feed|hashtag)/| (?:playlist|watch)\?.*?\blist=| (?!(?:watch|embed|v|e)\b) ) @@ -2245,6 +2245,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, }] @classmethod @@ -2392,6 +2399,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for entry in self._post_thread_entries(renderer): yield entry + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + @staticmethod def _build_continuation_query(continuation, ctp=None): query = { @@ -2442,55 +2457,60 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not tab_content: return slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if not slr_renderer: - return - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): + if slr_renderer: + is_channels_tab = tab.get('title') == 'Channels' + continuation = None + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer, not is_channels_tab): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation: + continuation = self._extract_continuation(is_renderer) if not continuation: - continuation = self._extract_continuation(is_renderer) - - if not continuation: - continuation = self._extract_continuation(slr_renderer) + continuation = self._extract_continuation(slr_renderer) + else: + rich_grid_renderer = tab_content.get('richGridRenderer') + if not rich_grid_renderer: + return + for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): + yield entry + continuation = self._extract_continuation(rich_grid_renderer) headers = { 'x-youtube-client-name': '1', @@ -2586,6 +2606,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(continuation_renderer) continue + renderer = continuation_item.get('richItemRenderer') + if renderer: + for entry in self._rich_grid_entries(continuation_items): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @@ -2642,7 +2668,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None + playlist_id = item_id + title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -2651,12 +2678,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title += ' - %s' % tab_title description = renderer.get('description') playlist_id = renderer.get('externalId') - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - description = None - playlist_id = item_id + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + else: + renderer = try_get( + data, lambda x: x['header']['hashtagHeaderRenderer'], dict) + if renderer: + title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( self._entries(selected_tab, identity_token), playlist_id=playlist_id, playlist_title=title, From c5aa8f36bf636c3db81afd556d0e95d91b72b9c7 Mon Sep 17 00:00:00 2001 From: Vid Date: Thu, 18 Mar 2021 18:53:06 +0100 Subject: [PATCH 309/860] [arnes] Add new extractor(closes #28483) --- youtube_dl/extractor/arnes.py | 101 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/arnes.py diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2b39e4dd..8cf348772 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -72,6 +72,7 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arnes import ArnesIE from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE, From 392c467f95cbf89114235038e1938c72d97144d9 Mon Sep 17 00:00:00 2001 From: Allan Daemon Date: Mon, 15 May 2017 00:04:39 -0300 Subject: [PATCH 310/860] [palcomp3] Add new extractor(closes #13120) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/palcomp3.py | 148 +++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 youtube_dl/extractor/palcomp3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8cf348772..65fefabe8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -879,6 +879,11 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py new file mode 100644 index 000000000..fb29d83f9 --- /dev/null +++ b/youtube_dl/extractor/palcomp3.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') + if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = re.match(self._VALID_URL, url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @ classmethod + def suitable(cls, url): + return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) From 04d4a3b136060158438c3f2c1b31c884c6961712 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 19:03:45 +0100 Subject: [PATCH 311/860] [screencastomatic] fix extraction(closes #11976, closes #24489) --- youtube_dl/extractor/screencastomatic.py | 48 +++++++++++++++--------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index b5e76c9af..0afdc1715 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + get_element_by_class, + int_or_none, + remove_start, + strip_or_none, + unified_strdate, +) class ScreencastOMaticIE(InfoExtractor): - _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', 'md5': '483583cb80d92588f15ccbedd90f0c18', 'info_dict': { @@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', - 'duration': 369.163, + 'duration': 369, + 'upload_date': '20141216', } - } + }, { + 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl', + 'only_matching': True, + }, { + 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - jwplayer_data = self._parse_json( - self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), - video_id, transform_source=js_to_json) - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict.update({ - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/' + video_id, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': get_element_by_class('overlayTitle', webpage), + 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None, + 'duration': int_or_none(self._search_regex( + r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};', + webpage, 'duration', default=None)), + 'upload_date': unified_strdate(remove_start( + get_element_by_class('overlayPublished', webpage), 'Published: ')), }) - return info_dict + return info From 1df2596f81695bf452ffbfd89596d115d9b2daf5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 07:54:02 +0100 Subject: [PATCH 312/860] [extractor/common] fix _get_cookies method for python 2(#20673, #23256, #20326, closes #28640) --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d3b6724df..fcbf18ee6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2899,7 +2899,10 @@ class InfoExtractor(object): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) + cookie = req.get_header('Cookie') + if cookie and sys.version_info[0] == 2: + cookie = str(cookie) + return compat_cookies.SimpleCookie(cookie) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 654b4f4ff2718f38b3182c1188c5d569c14cc70a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 08:23:35 +0100 Subject: [PATCH 313/860] [youtube] prioritize information from YoutubeIE for playlist entries(closes #28619, closes #28636) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f5497e24..2e027528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,7 +329,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (lambda x: x['ownerText']['runs'][0]['text'], lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { - '_type': 'url_transparent', + '_type': 'url', 'ie_key': YoutubeIE.ie_key(), 'id': video_id, 'url': video_id, From aee6feb02adaa316455ea9497e92cc82b720f231 Mon Sep 17 00:00:00 2001 From: RomanEmelyanov Date: Sun, 4 Apr 2021 11:14:37 +0300 Subject: [PATCH 314/860] [youku] Update ccode(closes #17852, closes #28447, closes #28460) (#28648) --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 61d1ab209..880c89687 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0590', + 'ccode': '0532', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From e165f5641fdf62975d3b6a40132a475c9cbaea2a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 15:52:14 +0100 Subject: [PATCH 315/860] [extractor/common] fix JSON-LD VideoObject author extraction --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fcbf18ee6..8ef22779a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,6 +70,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1282,7 +1283,7 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 3ae9c0f410b1d4f63e8bada67dd62a8d2852be32 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 16:26:56 +0100 Subject: [PATCH 316/860] [vimeo] improve extraction(closes #28591) --- youtube_dl/extractor/vimeo.py | 239 ++++++++++++++-------------------- 1 file changed, 100 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a90cf0630..102687b82 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,15 +16,14 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, OnDemandPagedList, parse_filesize, parse_iso8601, - RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, @@ -127,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue @@ -146,7 +145,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -192,17 +191,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): f['preference'] = -40 subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -322,6 +319,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', }, 'params': { 'format': 'best[protocol=https]', @@ -400,6 +398,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, + 'subtitles': { + 'de': [{'ext': 'vtt'}], + 'en': [{'ext': 'vtt'}], + 'es': [{'ext': 'vtt'}], + 'fr': [{'ext': 'vtt'}], + }, } }, { @@ -572,6 +576,37 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) headers = std_headers.copy() @@ -580,48 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - # Extract ID from URL - video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') if unlisted_hash: - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest' - })['token'] - video = self._download_json( - 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), - video_id, headers={ - 'Authorization': 'jwt ' + token, - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) - get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) - info.update({ - 'description': video.get('description'), - 'license': video.get('license'), - 'release_timestamp': get_timestamp('release'), - 'timestamp': get_timestamp('created'), - 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), - }) - connections = try_get( - video, lambda x: x['metadata']['connections'], dict) or {} - for k in ('comment', 'like'): - info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) - return info + return self._extract_from_api(video_id, unlisted_hash) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -641,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) + if '://player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r']+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -657,70 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r']+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, + 'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) video = config.get('video') or {} vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -735,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description - if not video_description: - video_description = self._html_search_regex( - r'(?s)]*>(.*?)', - webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -751,25 +734,14 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: self._downloader.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r']+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None - formats = [] source_format = self._extract_original_format( @@ -788,31 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } + }) - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) - - return info_dict + return merge_dicts(info_dict, info_dict_config, json_ld) class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', From 6beb1ac65b03415764c487fd139298f22e1e0313 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:16:17 +0100 Subject: [PATCH 317/860] [extractor/common] keep support for non standard JSON-LD VideoObject author values --- youtube_dl/extractor/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ef22779a..78ff5b6d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,7 +70,6 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1276,6 +1275,7 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), @@ -1283,7 +1283,11 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 162bf9e10a4e6a08f5ed156a68054ef9b4d2b60e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:49:24 +0100 Subject: [PATCH 318/860] [compat] add compat_SimpleCookie --- youtube_dl/compat.py | 9 +++++++++ youtube_dl/extractor/common.py | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6c3d49d45..8bbebebcf 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -73,6 +73,15 @@ try: except ImportError: # Python 2 import Cookie as compat_cookies +if sys.version_info[0] == 2: + class compat_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + if isinstance(rawdata, unicode): + rawdata = str(rawdata) + return super(compat_SimpleCookie, self).load(rawdata) +else: + compat_SimpleCookie = compat_cookies.SimpleCookie + try: import html.entities as compat_html_entities except ImportError: # Python 2 diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78ff5b6d0..af289d705 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, + compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,13 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + """ Return a compat_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - cookie = req.get_header('Cookie') - if cookie and sys.version_info[0] == 2: - cookie = str(cookie) - return compat_cookies.SimpleCookie(cookie) + return compat_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 760c911299aa607ca967d6d4be2985528bacf29f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 5 Apr 2021 07:16:50 +0100 Subject: [PATCH 319/860] [compat] add compat_SimpleCookie to __all__ array --- youtube_dl/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8bbebebcf..8a5262dc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3002,6 +3002,7 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', From 25b1287323f5836c9416a8183096adc63809d5ce Mon Sep 17 00:00:00 2001 From: guredora Date: Sun, 4 Apr 2021 22:12:07 +0900 Subject: [PATCH 320/860] [line] add support live.line.me (closes #17205)(closes #28658) --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/line.py | 142 ++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 65fefabe8..d5cd364e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -595,7 +595,11 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) -from .line import LineTVIE +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 7f5fa446e..2526daa77 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + str_or_none, +) class LineTVIE(InfoExtractor): @@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor): for thumbnail in video_info.get('thumbnails', {}).get('list', [])], 'view_count': video_info.get('meta', {}).get('count'), } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) From 6b315d96bc0b07ddc3abaa7318583775828cce30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:15:13 +0700 Subject: [PATCH 321/860] [compat] flake8 --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8a5262dc8..566e9d5ec 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -76,7 +76,7 @@ except ImportError: # Python 2 if sys.version_info[0] == 2: class compat_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): - if isinstance(rawdata, unicode): + if isinstance(rawdata, compat_str): rawdata = str(rawdata) return super(compat_SimpleCookie, self).load(rawdata) else: From 70d0d4f9beba0e5b6d95ee50ad62ae7ab5be9be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:22:28 +0700 Subject: [PATCH 322/860] [compat] Use more conventional name for compat SimpleCookie --- youtube_dl/compat.py | 8 ++++---- youtube_dl/extractor/common.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 566e9d5ec..9e45c454b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -74,13 +74,13 @@ except ImportError: # Python 2 import Cookie as compat_cookies if sys.version_info[0] == 2: - class compat_SimpleCookie(compat_cookies.SimpleCookie): + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): if isinstance(rawdata, compat_str): rawdata = str(rawdata) - return super(compat_SimpleCookie, self).load(rawdata) + return super(compat_cookies_SimpleCookie, self).load(rawdata) else: - compat_SimpleCookie = compat_cookies.SimpleCookie + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie try: import html.entities as compat_html_entities @@ -3002,7 +3002,6 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', @@ -3010,6 +3009,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', + 'compat_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', 'compat_etree_fromstring', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index af289d705..797c35fd5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, - compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,10 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_SimpleCookie with the cookies for the url """ + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_SimpleCookie(req.get_header('Cookie')) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 6b116f0c03ac0b1aff01cd08bbe1d5cb87dff853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:34:43 +0700 Subject: [PATCH 323/860] [youtube] Fix videos with restricted location (closes #28685) --- youtube_dl/extractor/youtube.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e027528d..6b4c7912c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1084,6 +1084,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', 'only_matching': True, }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1485,7 +1502,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def get_text(x): if not x: return - return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) + text = x.get('simpleText') + if text and isinstance(text, compat_str): + return text + runs = x.get('runs') + if not isinstance(runs, list): + return + return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) search_meta = ( lambda x: self._html_search_meta(x, webpage, default=None)) \ From 445db582a27c44cb02d57ac9171d58651cafbd76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:35:25 +0700 Subject: [PATCH 324/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4304ecd9e..e5e546744 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version + +Core +* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies ++ [compat] Introduce compat_cookies_SimpleCookie +* [extractor/common] Improve JSON-LD author extraction +* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, + #28640) + +Extractors +* [youtube] Fix extraction of videos with restricted location (#28685) ++ [line] Add support for live.line.me (#17205, #28658) +* [vimeo] Improve extraction (#28591) +* [youku] Update ccode (#17852, #28447, #28460, #28648) +* [youtube] Prefer direct entry metadata over entry metadata from playlist + (#28619, #28636) +* [screencastomatic] Fix extraction (#11976, #24489) ++ [palcomp3] Add support for palcomp3.com (#13120) ++ [arnes] Add support for video.arnes.si (#28483) ++ [youtube:tab] Add support for hashtags (#28308) + + version 2021.04.01 Extractors From 72a2c0a9ede04c6b82235e453b1a933faf072a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:42:24 +0700 Subject: [PATCH 325/860] release 2021.04.07 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++++++ youtube_dl/version.py | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 98ec799e8..febbd2344 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5387a6cd1..d7296d0a9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 945c80366..92e616a1a 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 0acc8b679..b55739f6c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 42c3126a3..dbdb8356a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index e5e546744..22b4fa67d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.07 Core * [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d2ad937a4..ff9177a2c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -463,6 +463,8 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** - **LineTV** - **linkedin:learning** - **linkedin:learning:course** @@ -679,6 +681,9 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -1059,6 +1064,7 @@ - **Vidbit** - **Viddler** - **Videa** + - **video.arnes.si**: Arnes Video - **video.google:search**: Google Video search - **video.sky.it** - **video.sky.it:live** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0457d1a15..a6b1b8dce 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.01' +__version__ = '2021.04.07' From c0c5134c5771dd2a1caeeaee62dcd207d169e981 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 09:27:05 +0100 Subject: [PATCH 326/860] [curiositystream] fix format extraction(closes #26845, closes #28668) --- youtube_dl/extractor/curiositystream.py | 103 +++++++++++++----------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index e4a7fca6c..ae64a07d7 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, From 281b8e34432d8dba9902be2c1eb77d3e6371cd73 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 10:41:06 +0100 Subject: [PATCH 327/860] [jamendo] fix track extraction(closes #28686) --- youtube_dl/extractor/jamendo.py | 74 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 490efa8fb..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Maya Filipič - Stories from Emona I', - 'artist': 'Maya Filipič', + # 'title': 'Maya Filipič - Stories from Emona I', + 'title': 'Stories from Emona I', + # 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1217438117, 'upload_date': '20080730', + 'license': 'by-nc-nd', + 'view_count': int, + 'like_count': int, + 'average_rating': int, + 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage( - 'https://www.jamendo.com/track/' + track_id, track_id) - models = self._parse_json(self._html_search_regex( - r"data-bundled-models='([^']+)", - webpage, 'bundled models'), track_id) - track = models['track']['models'][0] + # webpage = self._download_webpage( + # 'https://www.jamendo.com/track/' + track_id, track_id) + # models = self._parse_json(self._html_search_regex( + # r"data-bundled-models='([^']+)", + # webpage, 'bundled models'), track_id) + # track = models['track']['models'][0] + track = self._call_api('track', track_id) title = track_name = track['name'] - get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} - artist = get_model('artist') - artist_name = artist.get('name') - if artist_name: - title = '%s - %s' % (artist_name, title) - album = get_model('album') + # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + # artist = get_model('artist') + # artist_name = artist.get('name') + # if artist_name: + # title = '%s - %s' % (artist_name, title) + # album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor): urls = [] thumbnails = [] - for _, covers in track.get('cover', {}).items(): + for covers in (track.get('cover') or {}).values(): for cover_id, cover_url in covers.items(): if not cover_url or cover_url in urls: continue @@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor): }) tags = [] - for tag in track.get('tags', []): + for tag in (track.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue tags.append(tag_name) stats = track.get('stats') or {} + license = track.get('licenseCC') or [] return { 'id': track_id, @@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - 'artist': artist_name, + # 'artist': artist_name, 'track': track_name, - 'album': album.get('name'), + # 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(track.get('licenseCC', [])) or None, + 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', @@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 } - } - - def _call_api(self, resource, resource_id): - path = '/api/%ss' % resource - rand = compat_str(random.random()) - return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ - 'id[]': resource_id, - }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) - })[0] + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor): album_name = album.get('name') entries = [] - for track in album.get('tracks', []): + for track in (album.get('tracks') or []): track_id = track.get('id') if not track_id: continue From 006eea564d55130bb2e2ea7feb3a0e286d75d91f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 14:01:48 +0100 Subject: [PATCH 328/860] [cbssports] fix extraction(closes #28682) --- youtube_dl/extractor/cbssports.py | 125 +++++++++++++++++++++++------ youtube_dl/extractor/extractors.py | 6 +- 2 files changed, 105 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re + +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' - +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? + (?: + ids%3D(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + iframe_url = self._search_regex( + r']+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d5cd364e8..5ff9110b4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -191,7 +191,11 @@ from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) -from .cbssports import CBSSportsIE +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) from .ccc import ( CCCIE, CCCPlaylistIE, From 545d6cb9d06a8bf32bcd24463c0fd25e650bb2c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Apr 2021 15:32:59 +0700 Subject: [PATCH 329/860] [pornhub] Extract DASH and HLS formats from get_media end point (closes #28698) --- youtube_dl/extractor/pornhub.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2a7818e41..031454600 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -398,6 +398,16 @@ class PornHubIE(PornHubBaseIE): formats = [] def add_format(format_url, height=None): + ext = determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return tbr = None mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) if mobj: @@ -417,16 +427,6 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue if '/video/get_media' in video_url: medias = self._download_json(video_url, video_id, fatal=False) if isinstance(medias, list): From 27e5a4464d1d4c418d4937492e18a9d47d30fc50 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Apr 2021 18:53:36 +0100 Subject: [PATCH 330/860] [mtv] Fix Viacom A/B Testing Video Player extraction(closes #28703) --- youtube_dl/extractor/mtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 600cf2d89..5a5205c0e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -255,7 +255,9 @@ class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _extract_child_with_type(parent, t): - return next(c for c in parent['children'] if c.get('type') == t) + for c in parent['children']: + if c.get('type') == t: + return c def _extract_mgid(self, webpage): try: @@ -286,7 +288,8 @@ class MTVServicesInfoExtractor(InfoExtractor): data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = self._extract_child_with_type(data, 'MainContainer') - video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') mgid = video_player['props']['media']['video']['config']['uri'] return mgid From 1b0a13f33cfb3644cc718d35951ea85bb1905459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 9 Apr 2021 02:09:52 +0700 Subject: [PATCH 331/860] [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (closes #28702) --- youtube_dl/extractor/youtube.py | 42 +++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6b4c7912c..79e47c919 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -306,7 +306,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._parse_json( self._search_regex( r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) + default='{}'), video_id, fatal=False) or {} def _extract_video(self, renderer): video_id = renderer['videoId'] @@ -2475,7 +2475,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) - def _entries(self, tab, identity_token): + def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -2535,26 +2535,37 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(rich_grid_renderer) + ytcfg = self._extract_ytcfg(item_id, webpage) + client_version = try_get( + ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' + headers = { 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201112.04.01', + 'x-youtube-client-version': client_version, 'content-type': 'application/json', } + + context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { + 'client': { + 'clientName': 'WEB', + 'clientVersion': client_version, + } + } + visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) + + identity_token = self._extract_identity_token(ytcfg, webpage) if identity_token: headers['x-youtube-identity-token'] = identity_token data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, + 'context': context, } for page_num in itertools.count(1): if not continuation: break + if visitor_data: + headers['x-goog-visitor-id'] = visitor_data data['continuation'] = continuation['continuation'] data['clickTracking'] = { 'clickTrackingParams': continuation['itct'] @@ -2579,6 +2590,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not response: break + visitor_data = try_get( + response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data + continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) if continuation_contents: @@ -2687,7 +2701,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): alerts.append(text) return '\n'.join(alerts) - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) @@ -2712,7 +2726,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if renderer: title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( - self._entries(selected_tab, identity_token), + self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) @@ -2736,8 +2750,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) - def _extract_identity_token(self, webpage, item_id): - ytcfg = self._extract_ytcfg(item_id, webpage) + def _extract_identity_token(self, ytcfg, webpage): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: @@ -2760,12 +2773,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) - identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + return self._extract_from_tabs(item_id, webpage, data, tabs) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: From 4fb25ff5a3be5206bb72e5c4046715b1529fb2c7 Mon Sep 17 00:00:00 2001 From: Aaron Lipinski Date: Thu, 8 Apr 2021 19:59:36 +1200 Subject: [PATCH 332/860] [maoritv] Add new extractor(closes #24552) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/maoritv.py | 31 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 youtube_dl/extractor/maoritv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ff9110b4..ac33cd996 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -640,6 +640,7 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE from .markiza import ( MarkizaIE, MarkizaPageIE, diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py new file mode 100644 index 000000000..0d23fec75 --- /dev/null +++ b/youtube_dl/extractor/maoritv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MaoriTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54', + 'md5': '5ade8ef53851b6a132c051b1cd858899', + 'info_dict': { + 'id': '4774724855001', + 'ext': 'mp4', + 'title': 'Kōrero Mai, Series 1 Episode 54', + 'upload_date': '20160226', + 'timestamp': 1456455018, + 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb', + 'uploader_id': '1614493167001', + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) From 06159135ef148a6ddc632d0c89b90c937d5bb021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 00:07:32 +0700 Subject: [PATCH 333/860] [youtube] Improve URL to extractor routing (closes #27572, closes #28335, closes #28742) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 79e47c919..4d7f3f837 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,6 +46,10 @@ from ..utils import ( ) +def parse_qs(url): + return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -413,16 +417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" % { - 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, 'invidious': '|'.join(_INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( @@ -1208,6 +1205,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + @classmethod + def suitable(cls, url): + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) + def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} @@ -2275,6 +2279,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '#cctv9', }, 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, }] @classmethod @@ -2764,7 +2771,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # Handle both video/playlist URLs - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] if video_id and playlist_id: @@ -2860,12 +2867,16 @@ class YoutubePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) + if YoutubeTabIE.suitable(url): + return False + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) if not qs: qs = {'list': playlist_id} return self.url_result( From 79e4ccfc4b395127bb3e5e957b20b04e75cba355 Mon Sep 17 00:00:00 2001 From: quyleanh Date: Sat, 17 Apr 2021 00:30:10 +0700 Subject: [PATCH 334/860] [pluralsight] Extend anti-throttling timeout (#28712) --- youtube_dl/extractor/pluralsight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index abd08bc28..2d63855df 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -393,7 +393,7 @@ query viewClip { # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. self._sleep( - random.randint(2, 5), display_id, + random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: From d01e261a15abd779decae6e0858d8586f7a71621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20=C3=81vila?= Date: Fri, 16 Apr 2021 14:31:34 -0300 Subject: [PATCH 335/860] [youtube] Add more invidious instances (#28706) --- youtube_dl/extractor/youtube.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4d7f3f837..7fa9b473a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -359,21 +359,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', r'(?:www\.)?yewtu\.be', r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', From ea87ed8394127c4bf824688b8780eaf5a804e7a3 Mon Sep 17 00:00:00 2001 From: zraktvor <=> Date: Sat, 10 Apr 2021 15:11:35 +0200 Subject: [PATCH 336/860] [youtube:tab] Detect series playlist on playlists page (closes #28723) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fa9b473a..581687d96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2019,6 +2019,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, + }, { + # playlists, series + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', @@ -2311,7 +2320,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel'): + for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): renderer = item.get('grid%sRenderer' % item_kind) if renderer: return renderer @@ -2344,6 +2353,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) + # show + if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice + show_playlist_url = try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + if show_playlist_url: + playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, + 'playlist id', default=None) + if playlist_id: + title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + "https://www.youtube.com/playlist?list=%s" % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 7c5239547928079513b65f62e4c84aea21ce76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 01:05:44 +0700 Subject: [PATCH 337/860] [youtube:tab] Improve grid extraction (closes #28725) --- youtube_dl/extractor/youtube.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 581687d96..b6945570f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2320,10 +2320,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): - renderer = item.get('grid%sRenderer' % item_kind) - if renderer: - return renderer + assert isinstance(item, dict) + for key, renderer in item.items(): + if not key.startswith('grid') or not key.endswith('Renderer'): + continue + if not isinstance(renderer, dict): + continue + return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: @@ -2333,7 +2336,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not isinstance(renderer, dict): continue title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2341,10 +2345,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + continue # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) + continue # channel channel_id = renderer.get('channelId') if channel_id: @@ -2353,19 +2359,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - # show - if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice - show_playlist_url = try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) - if show_playlist_url: - playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, - 'playlist id', default=None) - if playlist_id: - title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if ep_url: + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): yield self.url_result( - "https://www.youtube.com/playlist?list=%s" % playlist_id, - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 54558e0baa4d62a94af105cd1d7f8abcbd16b468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 02:27:54 +0700 Subject: [PATCH 338/860] [youtube] Improve stretch extraction and fix stretched ratio calculation (closes #28769) --- youtube_dl/extractor/youtube.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b6945570f..75751d5a6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -812,6 +812,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This video does not exist.', }, + { + # Video with incomplete 'yt:stretch=16:' + 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', + 'only_matching': True, + }, { # Video licensed under Creative Commons 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', @@ -1717,13 +1722,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for keyword in keywords: if keyword.startswith('yt:stretch='): - w, h = keyword.split('=')[1].split(':') - w, h = int(w), int(h) - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break thumbnails = [] for container in (video_details, microformat): From a00a7e0cad3308d999599bf17df5d3e6aba502d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:22:13 +0700 Subject: [PATCH 339/860] [utils] Add support for support for experimental HTTP response status code 308 Permanent Redirect (refs #27877, refs #28768) --- youtube_dl/utils.py | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9..538cc2b63 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ import zlib from .compat import ( compat_HTMLParseError, compat_HTMLParser, + compat_HTTPError, compat_basestring, compat_chr, compat_cookiejar, @@ -2879,12 +2880,61 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - if sys.version_info[0] < 3: - def redirect_request(self, req, fp, code, msg, headers, newurl): - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request(newurl, + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From 30a3a4c70fdcad10ef1dc6c3402457a95fe1ae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:23:47 +0700 Subject: [PATCH 340/860] [lbry] Add support for HLS videos (closes #27877, closes #28768) --- youtube_dl/extractor/lbry.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index ae43d56ea..cfd6b8393 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -120,6 +120,26 @@ class LBRYIE(LBRYBaseIE): 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -163,10 +183,18 @@ class LBRYIE(LBRYBaseIE): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url info.update({ 'id': claim_id, 'title': title, - 'url': streaming_url, }) return info From cfee2dfe83c5593d46bd0c8e8ce6a3d8c6e42db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:32:04 +0700 Subject: [PATCH 341/860] [utils] PEP 8 --- youtube_dl/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 538cc2b63..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2908,7 +2908,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ m = req.get_method() if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): + or code in (301, 302, 303) and m == "POST")): raise compat_HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation @@ -2930,11 +2930,10 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): CONTENT_HEADERS = ("content-length", "content-type") # NB: don't use dict comprehension for python 2.6 compatibility newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request(newurl, - headers=newheaders, - origin_req_host=req.origin_req_host, - unverifiable=True) + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From f20b505b46e6654464635ca4afb0f37e1f14c57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:47:00 +0700 Subject: [PATCH 342/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22b4fa67d..c249412e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ [utils] Add support for experimental HTTP response status code + 308 Permanent Redirect (#27877, #28768) + +Extractors ++ [lbry] Add support for HLS videos (#27877, #28768) +* [youtube] Fix stretched ratio calculation +* [youtube] Improve stretch extraction (#28769) +* [youtube:tab] Improve grid extraction (#28725) ++ [youtube:tab] Detect series playlist on playlists page (#28723) ++ [youtube] Add more invidious instances (#28706) +* [pluralsight] Extend anti-throttling timeout (#28712) +* [youtube] Improve URL to extractor routing (#27572, #28335, #28742) ++ [maoritv] Add support for maoritelevision.com (#24552) ++ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with + continuation requests (#28702) +* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) ++ [pornhub] Extract DASH and HLS formats from get_media end point (#28698) +* [cbssports] Fix extraction (#28682) +* [jamendo] Fix track extraction (#28686) +* [curiositystream] Fix format extraction (#26845, #28668) + + version 2021.04.07 Core From 596b26606cfe20aa9f776ac0658b6bfb1ea95397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:50:09 +0700 Subject: [PATCH 343/860] release 2021.04.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index febbd2344..e2a89d5c2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d7296d0a9..4d7abd775 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 92e616a1a..d8dce6fd4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b55739f6c..d95ee291a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbdb8356a..ac5dd2f27 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c249412e2..45d5c2ebf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.17 Core + [utils] Add support for experimental HTTP response status code diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ff9177a2c..a23da1a31 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,6 +3,7 @@ - **20min** - **220.ro** - **23video** + - **247sports** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -160,7 +161,8 @@ - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - - **CBSSports** + - **cbssports** + - **cbssports:embed** - **CCMA** - **CCTV**: 央视网 - **CDA** @@ -490,6 +492,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **MaoriTV** - **Markiza** - **MarkizaPage** - **massengeschmack.tv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a6b1b8dce..2b041d593 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.07' +__version__ = '2021.04.17' From 9f6c03a00602eb1119e43a522cf50682f6d6a6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 05:05:31 +0700 Subject: [PATCH 344/860] [cbsnews] Fix extraction for python <3.6 (closes #23359) --- youtube_dl/extractor/cbsnews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') From 41920fc80e4fe4a8996aeb31a04826a5a2534814 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 20 Apr 2021 20:51:55 +0100 Subject: [PATCH 345/860] [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) --- youtube_dl/extractor/bbc.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e8d000bbb..71ea25881 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -25,8 +26,10 @@ from ..utils import ( js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = try_get( + media, + lambda x: x['summary']['blocks'][0]['model']['text'], + compat_str) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') From dab83a25972e0dbcc69583bf78d2a992f581563d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:00:56 +0700 Subject: [PATCH 346/860] [bbc] Extract full description from __INITIAL_DATA__ (refs #28774) --- youtube_dl/extractor/bbc.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 71ea25881..247d982ce 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -770,7 +770,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xzws1', 'ext': 'mp4', 'title': "Pluto may have 'nitrogen glaciers'", - 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', @@ -1176,10 +1176,16 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) - item_desc = try_get( - media, - lambda x: x['summary']['blocks'][0]['model']['text'], - compat_str) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) item_time = None for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: if try_get(meta, lambda x: x['label']) == 'Published': From 32290307a45260885b2210aa3c2a57e64abf8c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:56:04 +0700 Subject: [PATCH 347/860] [youtube] Fix lazy extractors (closes #28780) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75751d5a6..c16dc7ab8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1219,6 +1219,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: return False @@ -2910,6 +2913,9 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False From 5ad69d3d0e7d1d8d15a1f2497d602b1b91fcc74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:45:13 +0700 Subject: [PATCH 348/860] [test_youtube_misc] Move YoutubeIE.extract_id test into separate module --- test/test_all_urls.py | 9 --------- test/test_youtube_misc.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 test/test_youtube_misc.py diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..365b66bad 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -70,15 +70,6 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..e18e71101 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() From c4a451bcdd2b743fdb96fcbae261c86ed91022ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:47:29 +0700 Subject: [PATCH 349/860] [test_execution] Add test for lazy extractors (refs #28780) --- test/test_execution.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_execution.py b/test/test_execution.py index 11661bb68..32948d93e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('youtube_dl/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() From ac19c3ac8035fbf9369fd4bd336c9045d4eeafa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:35:39 +0700 Subject: [PATCH 350/860] [go] Improve video id extraction (closes #25207, closes #25216, closes #26058) --- youtube_dl/extractor/go.py | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 0d731e90a..878ba14e6 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, + try_get, urlencode_postdata, ExtractorError, ) @@ -116,6 +118,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -149,14 +163,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From 7e8b3f9439ebefb3a3a4e5da9c0bd2b595976438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:37:51 +0700 Subject: [PATCH 351/860] [youtube] Remove unused code --- youtube_dl/extractor/youtube.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c16dc7ab8..0c52e5a8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -65,11 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. From 0db79d8181c1c3ebf74b9b6d38262c8dcaaf0f4f Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Sat, 24 Apr 2021 20:58:03 +0900 Subject: [PATCH 352/860] [tver] Redirect all downloads to Brightcove (#28849) --- youtube_dl/extractor/tver.py | 37 +++++++++++------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index a54f49319..a4a30b1e6 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, - strip_or_none, try_get, ) @@ -45,32 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - title = main['title'] - subtitle = strip_or_none(main.get('subtitle')) - if subtitle: - title += ' - ' + subtitle - info.update({ - 'title': title, - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info From c6ab79299034492c5af9dd29b0c49585e4efc4cd Mon Sep 17 00:00:00 2001 From: catboy <79282513+catboy-oss@users.noreply.github.com> Date: Sat, 24 Apr 2021 12:10:35 +0000 Subject: [PATCH 353/860] [medaltv] Fix extraction (#28807) numeric clip ids are no longer used by medal, and integer user ids are now sent as strings. --- youtube_dl/extractor/medaltv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 1603b55f6..ef2283dea 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,32 +15,32 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } }] From 999329cf6b29878d054c6ccdd24573489a2886d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Apr 2021 23:52:16 +0700 Subject: [PATCH 354/860] [workflows/ci.yml] Fix install nose for Jython --- .github/workflows/ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9dc47a71..97b50aedc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,7 +53,14 @@ jobs: java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: From 57eaaff5cf1ae63d4e3ae89f301c0f9b3e86bb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Apr 2021 22:52:28 +0700 Subject: [PATCH 355/860] [francetvinfo] Improve video id extraction (closes #28792) --- youtube_dl/extractor/francetv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7cc88bf18..e4ec2e200 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "
    ]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + r'(?:data-id| Date: Sun, 25 Apr 2021 19:32:47 +0200 Subject: [PATCH 356/860] [xfileshare] Add support for wolfstream.tv (#28858) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod From 346dd3b5e87503c52fc6800d9f73cd6bdbce71bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:29:50 +0700 Subject: [PATCH 357/860] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d5c2ebf..c59984d63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version + +Extractors ++ [xfileshare] Add support for wolfstream.tv (#28858) +* [francetvinfo] Improve video id extraction (#28792) +* [medaltv] Fix extraction (#28807) +* [tver] Redirect all downloads to Brightcove (#28849) +* [go] Improve video id extraction (#25207, #25216, #26058) +* [youtube] Fix lazy extractors (#28780) ++ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) +* [cbsnews] Fix extraction for python <3.6 (#23359) + + version 2021.04.17 Core From 273964d190fb048477e71114c4734fcb819c5c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:33:30 +0700 Subject: [PATCH 358/860] release 2021.04.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2a89d5c2..6ece3e031 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 4d7abd775..f923b2d5f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d8dce6fd4..97d605653 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d95ee291a..73a806833 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index ac5dd2f27..ee19a75f5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c59984d63..f15c84225 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.26 Extractors + [xfileshare] Add support for wolfstream.tv (#28858) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a23da1a31..88d474de4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1162,7 +1162,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b041d593..576f721db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.17' +__version__ = '2021.04.26' From 94520568b399d5b4f35a9708f5643d8b16c6c4ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 02:16:47 +0700 Subject: [PATCH 359/860] [workflows/ci.yml] Update link to jython-installer --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97b50aedc..90bd63c32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,7 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose From e33dfb445c547f210a7060e8b7abd592dbe42808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 17:53:27 +0700 Subject: [PATCH 360/860] [tv2dk] Fix extraction (closes #28888) --- youtube_dl/extractor/tv2dk.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) From d2f72c40db0d1fe1102c98c017682b283579ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:09:32 +0700 Subject: [PATCH 361/860] [svtplay] Improve extraction (closes #28507, closes #28876) --- youtube_dl/extractor/svt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index aba9bb447..a5bb6daa7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) - (?:.*?modalId=(?P[\da-zA-Z-]+))? + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ @@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From ff04d43c469e4cf8c14ba3e2e79da0d35ef3c7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:33:05 +0700 Subject: [PATCH 362/860] [xtube] Fix formats extraction (closes #28870) --- youtube_dl/extractor/xtube.py | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 18969058f..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) From d1b9a5e2eff1c075b38815a3d2b25eb8b3f626bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 19:00:39 +0700 Subject: [PATCH 363/860] [twitter] Improve formats extraction from vmap URL (closes #28909) --- youtube_dl/extractor/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ed495f297..cfa7a7326 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f] def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] urls = [] From a0df8a06178e530a1097f177a1faf1d2c609ac99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 22:53:30 +0700 Subject: [PATCH 364/860] [cda] Improve extraction (closes #28709, closes #28937) --- youtube_dl/extractor/cda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -133,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) From 0204838163bd4068fe23b40414573d1307d817ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 01:57:02 +0700 Subject: [PATCH 365/860] [kaltura] Make embed code alternatives actually work --- youtube_dl/extractor/kaltura.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460d..5d0ff0418 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,8 +142,8 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -151,7 +151,7 @@ class KalturaIE(InfoExtractor): [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: From fe05191b8c59538a48b6cbc95f4fe54fc7e6a0ac Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Tue, 4 May 2021 14:14:35 -0500 Subject: [PATCH 366/860] [kaltura] Improve iframe extraction (#28969) Co-authored-by: Sergey M. --- youtube_dl/extractor/gdcvault.py | 15 +++++++++++++++ youtube_dl/extractor/kaltura.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 2f555c1d4..5ad40ee23 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -102,6 +102,21 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, ] def _login(self, webpage_url, display_id): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 5d0ff0418..c731612c4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -145,7 +145,7 @@ class KalturaIE(InfoExtractor): ''', webpage)) or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) From b8645c1f5885522ec8bb77649f49ce842e947c25 Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Sat, 17 Apr 2021 23:15:10 -0500 Subject: [PATCH 367/860] [dispeak] Improve FLV extraction (closes #13513) --- youtube_dl/extractor/dispeak.py | 50 ++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c345e0274..e776ac00c 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -32,6 +32,14 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624 + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, }] def _parse_mp4(self, metadata): @@ -84,26 +92,28 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) + slide_video_path = xpath_text(metadata, './slideVideo') + if slide_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xpath_text(metadata, './speakerVideo') + if speaker_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) return formats def _real_extract(self, url): From 1786cd3fe4e555b83bdd3eea77ade3477293330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:30:42 +0700 Subject: [PATCH 368/860] [dispeak] DRY and update tests (closes #28970) --- youtube_dl/extractor/dispeak.py | 34 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index e776ac00c..276fd4b09 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -33,13 +33,17 @@ class DigitallySpeakingIE(InfoExtractor): 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, }, { - # From https://gdcvault.com/play/1016624 + # From https://gdcvault.com/play/1016624, empty speakerVideo 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', 'info_dict': { 'id': '201210-822101_1349794556671DDDD', 'ext': 'flv', 'title': 'Pre-launch - Preparing to Take the Plunge', }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -92,27 +96,19 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo') - if slide_video_path: + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), + 'play_path': remove_end(video_path, '.flv'), 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo') - if speaker_video_path: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'preference': preference, + 'format_id': format_id, }) return formats From 504e4d804df0ee666d80ba6796017cf97e026c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:44:29 +0700 Subject: [PATCH 369/860] [gdcvault] Add support for HTML5 videos --- youtube_dl/extractor/gdcvault.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 5ad40ee23..acc6478b8 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -117,6 +118,11 @@ class GDCVaultIE(InfoExtractor): 'skip_download': True, }, }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -190,7 +196,18 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex( r'