Merge branch 'ytdl-org:master' into callin

This commit is contained in:
Ruowang Sun 2022-12-24 08:03:01 -05:00 committed by GitHub
commit 8441987828
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 421 additions and 2 deletions

View file

@ -1266,6 +1266,11 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
ThisVidIE,
ThisVidMemberIE,
ThisVidPlaylistIE,
)
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,

View file

@ -28,6 +28,7 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_duration,
parse_resolution,
sanitized_Request,
smuggle_url,
unescapeHTML,
@ -35,6 +36,7 @@ from ..utils import (
unsmuggle_url,
UnsupportedError,
url_or_none,
urljoin,
xpath_attr,
xpath_text,
xpath_with_ns,
@ -2227,6 +2229,97 @@ class GenericIE(InfoExtractor):
# Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
'only_matching': True,
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/',
'info_dict': {
'id': '105',
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/embed/105/',
'info_dict': {
'id': '105',
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
# KVS Player (tested also in thisvid.py)
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
},
}, {
# KVS Player
'url': 'https://youix.com/embed/18485',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Ленинград - ЗОЖ',
'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
},
}, {
# KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
'display_id': '40-nochey-2016',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
},
}, {
# KVS Player (for sites that serve kt_player.js via non-https urls)
'url': 'http://www.camhub.world/embed/389508',
'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32',
'info_dict': {
'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4',
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
},
}, {
'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
'info_dict': {
'id': '5',
'display_id': 'selena-gomez-pov-deep-fakes',
'ext': 'mp4',
'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes',
'description': 'md5:17d1f84b578c9c26875ac5ef9a932354',
'height': 720,
'age_limit': 18,
},
}, {
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
'id': '284002',
'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime',
'height': 720,
'age_limit': 18,
},
},
]
@ -2332,6 +2425,88 @@ class GenericIE(InfoExtractor):
'title': title,
}
def _extract_kvs(self, url, webpage, video_id):
def getlicensetoken(license):
modlicense = license.replace('$', '').replace('0', '1')
center = int(len(modlicense) / 2)
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
modlicense = compat_str(4 * abs(fronthalf - backhalf))
def parts():
for o in range(0, center + 1):
for i in range(1, 5):
yield compat_str((int(license[o + i]) + int(modlicense[o])) % 10)
return ''.join(parts())
def getrealurl(video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
url_path, _, url_query = video_url.partition('?')
urlparts = url_path.split('/')[2:]
license = getlicensetoken(license_code)
newmagic = urlparts[5][:32]
def spells(x, o):
l = (o + sum(int(n) for n in license[o:])) % 32
for i in range(0, len(x)):
yield {l: x[o], o: x[l]}.get(i, x[i])
for o in range(len(newmagic) - 1, -1, -1):
newmagic = ''.join(spells(newmagic, o))
urlparts[5] = newmagic + urlparts[5][32:]
return '/'.join(urlparts) + '?' + url_query
flashvars = self._search_regex(
r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>',
webpage, 'flashvars')
flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json)
# extract the part after the last / as the display_id from the
# canonical URL.
display_id = self._search_regex(
r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
webpage, 'display_id', fatal=False
)
title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
thumbnail = flashvars['preview_url']
if thumbnail.startswith('//'):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
formats = []
for key in url_keys:
if '/get_file/' not in flashvars[key]:
continue
format_id = flashvars.get(key + '_text', key)
formats.append(merge_dicts(
parse_resolution(format_id) or parse_resolution(flashvars[key]), {
'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])),
'format_id': format_id,
'ext': 'mp4',
'http_headers': {'Referer': url},
}))
if not formats[-1].get('height'):
formats[-1]['quality'] = 1
self._sort_formats(formats)
return {
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}
def _real_extract(self, url):
if url.startswith('//'):
return self.url_result(self.http_scheme() + url)
@ -2540,9 +2715,16 @@ class GenericIE(InfoExtractor):
# but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
r'>[^<]*you acknowledge you are at least (\d+) years old',
r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
age_limit = 18
for marker in AGE_LIMIT_MARKERS:
m = re.search(marker, webpage)
if not m:
continue
age_limit = max(
age_limit or 0,
int_or_none(m.groups() and m.group(1), default=18))
# video uploader is domain name
video_uploader = self._search_regex(
@ -3389,6 +3571,20 @@ class GenericIE(InfoExtractor):
info_dict['formats'] = formats
return info_dict
# Look for generic KVS player (before ld+json for tests)
found = self._search_regex(
(r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
# kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ...
r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
), webpage, 'KVS player', group='ver', default=False)
if found:
self.report_extraction('%s: KVS Player' % (video_id, ))
if found.split('.')[0] not in ('4', '5', '6'):
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
return merge_dicts(
self._extract_kvs(url, webpage, video_id),
info_dict)
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')

View file

@ -0,0 +1,218 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
from ..utils import (
clean_html,
get_element_by_class,
int_or_none,
merge_dicts,
url_or_none,
urljoin,
)
class ThisVidIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'description': 'md5:372353bb995883d1b65fddf507489acd',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}, {
'url': 'https://thisvid.com/embed/3533241/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}]
def _real_extract(self, url):
main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
webpage = self._download_webpage(url, main_id)
title = self._html_search_regex(
r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
webpage, 'title')
if type_ == 'embed':
# look for more metadata
video_alt_url = url_or_none(self._search_regex(
r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ),
webpage, 'video_alt_url', default=None))
if video_alt_url and video_alt_url != url:
webpage = self._download_webpage(
video_alt_url, main_id,
note='Redirecting embed to main page', fatal=False) or webpage
video_holder = get_element_by_class('video-holder', webpage) or ''
if '>This video is a private video' in video_holder:
self.raise_login_required(
(clean_html(video_holder) or 'Private video').split('\n', 1)[0])
uploader = self._html_search_regex(
r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
webpage, 'uploader', default='')
uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
if len(uploader) == 2:
# id must be non-empty, uploader could be ''
uploader_id, uploader = uploader
uploader = uploader or None
else:
uploader_id = uploader = None
return merge_dicts({
'_type': 'url_transparent',
'title': title,
'age_limit': 18,
'uploader': uploader,
'uploader_id': uploader_id,
}, self.url_result(url, ie='Generic'))
class ThisVidMemberIE(InfoExtractor):
_VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
_TESTS = [{
'url': 'https://thisvid.com/members/2140501/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Profile',
},
'playlist_mincount': 16,
}, {
'url': 'https://thisvid.com/members/2140501/favourite_videos/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Favourite Videos',
},
'playlist_mincount': 15,
}, {
'url': 'https://thisvid.com/members/636468/public_videos/',
'info_dict': {
'id': '636468',
'title': 'Happymouth\'s Public Videos',
},
'playlist_mincount': 196,
},
]
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html):
yield m.group('url')
def _real_extract(self, url):
pl_id = self._match_id(url)
webpage = self._download_webpage(url, pl_id)
title = re.split(
r'(?i)\s*\|\s*ThisVid\.com\s*$',
self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None
def entries(page_url, html=None):
for page in itertools.count(1):
if not html:
html = self._download_webpage(
page_url, pl_id, note='Downloading page %d' % (page, ),
fatal=False) or ''
for u in self._urls(html):
yield u
next_page = get_element_by_class('pagination-next', html) or ''
if next_page:
# member list page
next_page = urljoin(url, self._search_regex(
r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
next_page, 'next page link', group='url', default=None))
# in case a member page should have pagination-next with empty link, not just `else:`
if next_page is None:
# playlist page
parsed_url = compat_urlparse.urlparse(page_url)
base_path, num = parsed_url.path.rsplit('/', 1)
num = int_or_none(num)
if num is None:
base_path, num = parsed_url.path.rstrip('/'), 1
parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, )))
next_page = compat_urlparse.urlunparse(parsed_url)
if page_url == next_page:
next_page = None
if not next_page:
break
page_url, html = next_page, None
return self.playlist_from_matches(
entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid')
class ThisVidPlaylistIE(ThisVidMemberIE):
_VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '6615',
'title': 'Underwear Stuff',
},
'playlist_mincount': 200,
}, {
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '1072387',
'ext': 'mp4',
'title': 'Big Italian Booty 28',
'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
'uploader_id': '367912',
'uploader': 'Jcmusclefun',
'age_limit': 18,
},
'params': {
'noplaylist': True,
},
}]
def _get_video_url(self, pl_url):
video_id = re.match(self._VALID_URL, pl_url).group('video_id')
return urljoin(pl_url, '/videos/%s/' % (video_id, ))
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (self._VALID_URL, ), html):
yield self._get_video_url(m.group('url'))
def _real_extract(self, url):
pl_id = self._match_id(url)
if self._downloader.params.get('noplaylist'):
self.to_screen('Downloading just the featured video because of --no-playlist')
return self.url_result(self._get_video_url(url), 'ThisVid')
self.to_screen(
'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, ))
result = super(ThisVidPlaylistIE, self)._real_extract(url)
# rework title returned as `the title - the title`
title = result['title']
t_len = len(title)
if t_len > 5 and t_len % 2 != 0:
t_len = t_len // 2
if title[t_len] == '-':
title = [t.strip() for t in (title[:t_len], title[t_len + 1:])]
if title[0] and title[0] == title[1]:
result['title'] = title[0]
return result