Fix tests

* use vanilla UA for ITVIE media
* settle on best-effort geo-restriction handling
* handle news articles
This commit is contained in:
dirkf 2023-01-29 13:09:45 +00:00 committed by GitHub
commit cacc07ab51
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -24,6 +24,7 @@ from ..utils import (
merge_dicts, merge_dicts,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
remove_start,
smuggle_url, smuggle_url,
strip_or_none, strip_or_none,
traverse_obj, traverse_obj,
@ -56,7 +57,12 @@ class ITVBaseIE(InfoExtractor):
self._downloader.report_warning(errmsg) self._downloader.report_warning(errmsg)
return False return False
@staticmethod
def _vanilla_ua_header():
return {'User-agent': 'Mozilla/5.0'}
def _download_webpage_handle(self, url, video_id, *args, **kwargs): def _download_webpage_handle(self, url, video_id, *args, **kwargs):
# specialised to (a) use vanilla UA (b) detect geo-block
params = self._downloader.params params = self._downloader.params
nkwargs = {} nkwargs = {}
if ( if (
@ -66,7 +72,7 @@ class ITVBaseIE(InfoExtractor):
and 'User-agent' not in (kwargs.get('headers') or {})): and 'User-agent' not in (kwargs.get('headers') or {})):
kwargs.setdefault('headers', {}) kwargs.setdefault('headers', {})
kwargs['headers']['User-agent'] = 'Mozilla/5.0' kwargs['headers'] = self._vanilla_ua_header()
nkwargs = kwargs nkwargs = kwargs
if kwargs.get('expected_status') is not None: if kwargs.get('expected_status') is not None:
exp = kwargs['expected_status'] exp = kwargs['expected_status']
@ -93,9 +99,7 @@ class ITVBaseIE(InfoExtractor):
# '{\n "Message" : "Request Originated Outside Of Allowed Geographic Region",\ # '{\n "Message" : "Request Originated Outside Of Allowed Geographic Region",\
# \n "TransactionId" : "oas-magni-475082-xbYF0W"\n}' # \n "TransactionId" : "oas-magni-475082-xbYF0W"\n}'
if '"Request Originated Outside Of Allowed Geographic Region"' in webpage: if '"Request Originated Outside Of Allowed Geographic Region"' in webpage:
self.raise_geo_restricted( self.raise_geo_restricted(countries=['GB'])
msg='This video is not available from your location due to geo restriction: try --geo-ip-block "193.113.0.0/16"',
countries=['GB'])
ret = self.__handle_request_webpage_error( ret = self.__handle_request_webpage_error(
compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh), compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh),
fatal=kwargs.get('fatal')) fatal=kwargs.get('fatal'))
@ -107,22 +111,23 @@ class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)' _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
_IE_DESC = 'ITVX' _IE_DESC = 'ITVX'
_TESTS = [{ _TESTS = [{
# while it redirects to ITVX 'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012', 'url': 'https://www.itv.com/hub/liar/2a4547a0012',
'only_matching': True, 'only_matching': True,
}, { }, {
# unavailable via data-playlist-url 'note': 'Hub page unavailable via data-playlist-url (404 now)',
'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
'only_matching': True, 'only_matching': True,
}, { }, {
# InvalidVodcrid 'note': 'Hub page with InvalidVodcrid (404 now)',
'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
'only_matching': True, 'only_matching': True,
}, { }, {
# ContentUnavailable 'note': 'Hub page with ContentUnavailable (404 now)',
'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
'only_matching': True, 'only_matching': True,
}, { }, {
'note': 'ITVX, or itvX, show',
'url': 'https://www.itv.com/watch/vera/1a7314/1a7314a0014', 'url': 'https://www.itv.com/watch/vera/1a7314/1a7314a0014',
'md5': 'bd0ad666b2c058fffe7d036785880064', 'md5': 'bd0ad666b2c058fffe7d036785880064',
'info_dict': { 'info_dict': {
@ -144,12 +149,12 @@ class ITVIE(ITVBaseIE):
'categories': list, 'categories': list,
}, },
'params': { 'params': {
'geo_bypass_ip_block': '193.113.0.0/16',
# m3u8 download # m3u8 download
'skip_download': True, # 'skip_download': True,
}, },
'skip': 'only available in UK',
}, { }, {
# Latest ITV news bulletin: details change daily 'note': 'Latest ITV news bulletin: details change daily',
'url': 'https://www.itv.com/watch/news/varies-but-is-not-checked/6js5d0f', 'url': 'https://www.itv.com/watch/news/varies-but-is-not-checked/6js5d0f',
'info_dict': { 'info_dict': {
'id': '6js5d0f', 'id': '6js5d0f',
@ -164,10 +169,10 @@ class ITVIE(ITVBaseIE):
'age_limit': None, 'age_limit': None,
}, },
'params': { 'params': {
'geo_bypass_ip_block': '193.113.0.0/16',
# variable download # variable download
'skip_download': True, # 'skip_download': True,
}, },
'skip': 'only available in UK',
} }
] ]
@ -251,6 +256,9 @@ class ITVIE(ITVBaseIE):
'url': href, 'url': href,
}) })
self._sort_formats(formats) self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {})
f['http_headers'].update(self._vanilla_ua_header())
subtitles = {} subtitles = {}
for sub in traverse_obj(video_data, 'Subtitles', expected_type=list) or []: for sub in traverse_obj(video_data, 'Subtitles', expected_type=list) or []:
@ -315,16 +323,33 @@ class ITVIE(ITVBaseIE):
class ITVBTCCIE(ITVBaseIE): class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
_TEST = { _IE_DESC = 'ITV articles: News, British Touring Car Championship'
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', _TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',
'info_dict': { 'info_dict': {
'id': 'btcc-2018-all-the-action-from-brands-hatch', 'id': 'btcc-2018-all-the-action-from-brands-hatch',
'title': 'BTCC 2018: All the action from Brands Hatch', 'title': 'BTCC 2018: All the action from Brands Hatch',
}, },
'playlist_mincount': 9, 'playlist_mincount': 9,
} }, {
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/HkiHLnNRx_default/index.html?videoId=%s' 'note': 'redirects to /btcc/articles/...',
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
'only_matching': True,
}, {
'note': 'news article',
'url': 'https://www.itv.com/news/wales/2020-07-23/sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'info_dict': {
'id': 'sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'title': '''Sean Fletcher on why Wales' coastline should be your 'staycation' destination | ITV News''',
},
'playlist_mincount': 1,
}]
# should really be a class variable of the Brightcove IE
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
BRIGHTCOVE_ACCOUNT = '1582188683001' BRIGHTCOVE_ACCOUNT = '1582188683001'
BRIGHTCOVE_PLAYER = 'HkiHLnNRx'
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)
@ -333,13 +358,16 @@ class ITVBTCCIE(ITVBaseIE):
link = compat_urlparse.urlparse(urlh.geturl()).path.strip('/') link = compat_urlparse.urlparse(urlh.geturl()).path.strip('/')
next_data = self._search_nextjs_data(webpage, playlist_id, fatal=False, default='{}') next_data = self._search_nextjs_data(webpage, playlist_id, fatal=False, default='{}')
path_prefix = compat_urlparse.urlparse(next_data.get('assetPrefix') or '').path.strip('/')
link = remove_start(link, path_prefix).strip('/')
content = traverse_obj( content = traverse_obj(
next_data, ('props', 'pageProps', Ellipsis), next_data, ('props', 'pageProps', Ellipsis),
expected_type=lambda x: x if x['link'] == link else None, expected_type=lambda x: x if x['link'] == link else None,
get_all=False, default={}) get_all=False, default={})
content = traverse_obj( content = traverse_obj(
content, ('body', 'content', Ellipsis, 'data'), content, ('body', 'content', Ellipsis, 'data'),
expected_type=lambda x: x if x['name'] == 'Brightcove' else None) expected_type=lambda x: x if x.get('name') == 'Brightcove' or x.get('type') == 'Brightcove' else None)
contraband = { contraband = {
# ITV does not like some GB IP ranges, so here are some # ITV does not like some GB IP ranges, so here are some
@ -357,14 +385,15 @@ class ITVBTCCIE(ITVBaseIE):
if not video_id: if not video_id:
continue continue
account = data.get('accountId') or self.BRIGHTCOVE_ACCOUNT account = data.get('accountId') or self.BRIGHTCOVE_ACCOUNT
player = data.get('playerId') or self.BRIGHTCOVE_PLAYER
yield self.url_result( yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account, video_id), contraband), smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account, player, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id) ie=BrightcoveNewIE.ie_key(), video_id=video_id)
# obsolete ? # obsolete ?
for video_id in re.findall(r'''data-video-id=["'](\d+)''', webpage): for video_id in re.findall(r'''data-video-id=["'](\d+)''', webpage):
yield self.url_result( yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (self.BRIGHTCOVE_ACCOUNT, video_id), contraband), smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (self.BRIGHTCOVE_ACCOUNT, self.BRIGHTCOVE_PLAYER, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id) ie=BrightcoveNewIE.ie_key(), video_id=video_id)
title = self._og_search_title(webpage, fatal=False) title = self._og_search_title(webpage, fatal=False)