From 7c86551560e3614b3b259510dcf4d1bcdd34e6ac Mon Sep 17 00:00:00 2001 From: camel2314 Date: Sat, 29 Jan 2022 18:02:54 -0500 Subject: [PATCH] Add support for cambro.tv and others * Added a new extractors * There is some video url reencoding algorithm used to convert the video_url received to an actual one See comments in CambroIE._convert_video_hash on how to find the original JS code --- docs/supportedsites.md | 4 + test/test_ktplayer.py | 55 +++++ test/test_utils.py | 41 ++++ youtube_dl/extractor/extractors.py | 6 + youtube_dl/extractor/ktplayer.py | 311 +++++++++++++++++++++++++++++ youtube_dl/utils.py | 39 ++++ 6 files changed, 456 insertions(+) create mode 100644 test/test_ktplayer.py create mode 100644 youtube_dl/extractor/ktplayer.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ae2a6b8b0..0a65806f0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -136,10 +136,13 @@ - **BusinessInsider** - **BuzzFeed** - **BYUtv** + - **Cambro** - **Camdemy** - **CamdemyFolder** + - **Camhub** - **CamModels** - **CamTube** + - **CamWhores** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr @@ -644,6 +647,7 @@ - **NRKTVSeries** - **NRLTV** - **ntv.ru** + - **Nudespree** - **Nuvid** - **NYTimes** - **NYTimesArticle** diff --git a/test/test_ktplayer.py b/test/test_ktplayer.py new file mode 100644 index 000000000..5ccd392b3 --- /dev/null +++ b/test/test_ktplayer.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor.ktplayer import KtPlayerHelper + + +class TestKtPlayerHelper(unittest.TestCase): + def test_kt_player_helper_lc(self): + self.assertEqual( + KtPlayerHelper._hash_kt_player_lic_code('$385023312702592'), + '49618502835613441220119020166725') + self.assertEqual( + 
KtPlayerHelper._hash_kt_player_lic_code('$518170117095338'), + '62924140695851455899788411700698') + + def test_kt_player_helper_hash_convert(self): + self.assertEqual( + KtPlayerHelper.convert_video_hash('$385023312702592', 'bed397181d043299c43f63582406a20b'), + '8b0bdf194430202ed49325c186633a79') + self.assertEqual( + KtPlayerHelper.convert_video_hash('$518170117095338', '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586'), + 'f34c6dff1f890e75b6b59422dde3b1acb7d3146586') + + def test_get_url(self): + page1 = """ + var flashvars = { + license_code: '$385023312702592', + video_url: 'http://example.com/get_file/2/bed397181d043299c43f63582406a20b/223000/223101/223101.mp4/', + } + """ + self.assertEqual( + KtPlayerHelper.get_url(page1), + 'http://example.com/get_file/2/8b0bdf194430202ed49325c186633a79/223000/223101/223101.mp4/') + + page2 = """ + var flashvars = { + license_code: '$518170117095338', + video_url: 'http://example.com/get_file/2/8b25b576ffbf46fa3dc91e34eddc2190b7d3146586/223000/223101/223101.mp4/', + } + """ + self.assertEqual( + KtPlayerHelper.get_url(page2), + 'http://example.com/get_file/2/f34c6dff1f890e75b6b59422dde3b1acb7d3146586/223000/223101/223101.mp4/') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e..d9ff59972 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests +import datetime import io import json import xml.etree.ElementTree @@ -18,6 +19,7 @@ import xml.etree.ElementTree from youtube_dl.utils import ( age_restricted, args_to_str, + date_from_ago, encode_base_n, caesar, clean_html, @@ -1475,6 +1477,45 @@ Line 1 self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') 
def test_date_from_ago(self):
    """Check date_from_ago() against unparsable and relative-date inputs."""
    # Inputs that carry no recognisable "<number> <unit> ago" phrase
    unparsable = (None, '', 'invalid', '1 microsecond ago', 'five days ago')
    for ago_str in unparsable:
        self.assertIsNone(date_from_ago(ago_str))

    def days_back(**delta):
        # Expected YYYYMMDD for "now minus delta", evaluated at call time
        moment = datetime.datetime.utcnow() - datetime.timedelta(**delta)
        return moment.strftime('%Y%m%d')

    # (input, timedelta keyword arguments the input should correspond to)
    cases = (
        ('1 minute ago', {'minutes': 1}),
        ('1 Minute Ago', {'minutes': 1}),
        ('2 minutes ago', {'minutes': 2}),
        ('1 hour ago', {'hours': 1}),
        ('2 hours ago', {'hours': 2}),
        ('5 days ago', {'days': 5}),
        ('2 months ago', {'days': 60}),
        ('10 years ago', {'days': 3650}),
    )
    for ago_str, delta in cases:
        self.assertEqual(date_from_ago(ago_str), days_back(**delta))
class KtPlayerHelper(object):
    """Utility functions for the video URL re-encoding performed by kt_player.

    kt_player is the player used by cambro, camhub, etc. The page exposes a
    ``license_code`` and a scrambled ``video_url``; the hash component of
    that URL has to be unscrambled with a key derived from the license code.
    """

    @staticmethod
    def _hash_kt_player_lic_code(code):
        """Derive the digit key from a license code (extracted from obfuscated JS).

        in: '$385023312702592'
        out: '49618502835613441220119020166725'

        Falsy input is returned unchanged. The first character (the '$') is
        skipped; every other character must be a decimal digit, otherwise
        int() raises ValueError.
        """
        if not code:
            return code

        # Digits after the leading character, with every 0 replaced by 1
        digits = ''.join(str(int(ch) or 1) for ch in code[1:])
        mid = len(digits) // 2
        left = int(digits[:mid + 1])
        right = int(digits[mid:])
        # Equals 4 * |left - right|; kept in the shape of the original JS
        val = str((abs(right - left) + abs(left - right)) * 2)

        pieces = []
        for i in range(mid + 1):
            shift = int(val[i])
            for j in range(1, 5):
                # Single-digit addition modulo 10
                pieces.append(str((int(code[i + j]) + shift) % 10))
        return ''.join(pieces)

    @staticmethod
    def convert_video_hash(lic_code, orig_hash, limit=32):
        """Video url hash converter extracted from obfuscated JS

        input '$385023312702592', 'bed397181d043299c43f63582406a20b'
        output '8b0bdf194430202ed49325c186633a79'
        input '$518170117095338', '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586'
        output 'f34c6dff1f890e75b6b59422dde3b1acb7d3146586'

        Only the first `limit` characters of `orig_hash` are permuted; the
        remainder is appended unchanged.

        In order to find a corresponding code in cambro.tv/camhub.com scripts
        do the following:
        1. Set a breakpoint at kt_start
        2. Execute in CDT console when triggered:
            flashvars._video_url = flashvars.video_url;
            Object.defineProperty(flashvars, 'video_url', {
              get: function () {
                return flashvars._video_url;
              },
              set: function (value) {
                debugger;
                flashvars._video_url = value;
              }
            });
        3. The second break is where the url re-encoding happens
        """
        key = KtPlayerHelper._hash_kt_player_lic_code(lic_code)
        head = list(orig_hash[:limit])
        size = len(head)
        # Walk from the last position to the first, swapping each position
        # with a target derived from the remaining key digits
        for pos in range(size - 1, -1, -1):
            target = (pos + sum(int(digit) for digit in key[pos:])) % size
            head[pos], head[target] = head[target], head[pos]
        return ''.join(head) + orig_hash[limit:]

    @staticmethod
    def get_url(webpage):
        """Return the real video URL found in the flashvars of `webpage`.

        Returns None when either `license_code` or `video_url` is missing.
        Raises ExtractorError when the video URL has fewer path components
        than needed to locate the hash.
        """
        def flashvar(pattern):
            mobj = re.search(pattern, webpage)
            return mobj.group(1) if mobj else None

        # extract video url
        license_code = flashvar(r'license_code:\s+\'(.+?)\'')
        video_raw_url = flashvar(r'video_url:\s+\'(.+?)\'')
        if not license_code or not video_raw_url:
            return None

        # cut some junk at the beginning: keep everything from the FIRST
        # component that starts the actual URL, so that a later path
        # component which merely begins with 'http' cannot truncate it
        parts = video_raw_url.split('/')
        start = next(
            (idx for idx, part in enumerate(parts) if part.startswith('http')),
            None)
        video_pre_parts = parts[start:] if start is not None else []
        if len(video_pre_parts) < 6:
            # it is expected to be
            # http://example.com/get_file/2/1039a5cd2f433e4d41adf41e0afc1773/223000/223101/223101.mp4/
            # with a hash value as 5th component
            raise ExtractorError('url too short: %s' % (video_pre_parts, ))

        # convert video hash to a real one
        video_pre_parts[5] = KtPlayerHelper.convert_video_hash(
            license_code, video_pre_parts[5])
        return '/'.join(video_pre_parts)
+ Supports both inlined and embedded usage variants. + + _DURATION_RE and _UPLOADED_RE class vars + must be set in subclasses as needed. + """ + + _DURATION_RE = None + _UPLOADED_RE = None + + def _kt_extract(self, url, embedded=False): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + title = mobj.group('title') + website = mobj.group('site') + + webpage = self._download_webpage(url, video_id) + + if 'This video is a private video' in webpage: + raise ExtractorError( + 'Video %s is private' % video_id, expected=True) + + flashdata = webpage + if embedded: + # find the iframe with a player + iframe_src = self._html_search_regex( + r'
class CambroIE(KtPlayerExtractor):
    """Extractor for cambro.tv, where the player is inlined in the video page."""

    # The named groups `site`, `id` and `title` are the ones read by
    # KtPlayerExtractor._kt_extract
    _VALID_URL = r'(?P<site>https?://(?:www\.)?cambro\.tv)/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'
    _TEST = {
        'url': 'https://www.cambro.tv/223101/artoftease-chaturbate-nude-cam-porn-video/',
        'md5': '4019439bae333f5cdb171807bf406abf',
        'info_dict': {
            'id': '223101',
            'ext': 'mp4',
            'title': 'artoftease-chaturbate-nude-cam-porn-video',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'categories': ['Chaturbate'],
            'duration': 1802.0,
            'upload_date': r're:\d{8}',
        }
    }
    # Duration is the first <span><em>…</em></span> after the headline's <h1>
    _DURATION_RE = (
        r'<div class="headline">.+?<h1>.+?</h1>.+?'
        r'<span><em>((?:\d+:)?(?:\d+:)?\d+)</em></span>.+?</div>')
    _UPLOADED_RE = r'<span><em>(.+?\s+ago)</em></span>'

    def _real_extract(self, url):
        return self._kt_extract(url)
def date_from_ago(ago_str):
    """Convert strings like '2 months ago' into a YYYYMMDD string.

    The date is computed relative to the current UTC time. Supported units
    are seconds, minutes, hours, days, weeks, months (counted as 30 days)
    and years (counted as 365 days), in singular or plural form and in any
    letter case. The amount must be written with digits.

    Returns None if `ago_str` is empty, contains no such phrase, or denotes
    a moment that cannot be represented as a date.
    """
    if not ago_str:
        return None

    mobj = re.search(
        r'(?P<val>\d+)\s+(?P<unit>second|minute|hour|day|week|month|year)s?\s+ago',
        ago_str, flags=re.IGNORECASE)
    if not mobj:
        return None

    # Length of a single unit; months and years are approximations
    unit_deltas = {
        'second': datetime.timedelta(seconds=1),
        'minute': datetime.timedelta(minutes=1),
        'hour': datetime.timedelta(hours=1),
        'day': datetime.timedelta(days=1),
        'week': datetime.timedelta(days=7),
        'month': datetime.timedelta(days=30),
        'year': datetime.timedelta(days=365),
    }
    unit_delta = unit_deltas[mobj.group('unit').lower()]

    try:
        # A value of 0 is valid and simply yields today's date
        moment = datetime.datetime.utcnow() - unit_delta * int(mobj.group('val'))
        return moment.strftime('%Y%m%d')
    except (OverflowError, ValueError):
        # Amount too large for timedelta/datetime, or a year that
        # strftime() cannot format on this platform
        return None