Add support for cambro.tv and others

* Added new extractors
* The sites use a video URL re-encoding algorithm that converts
  the received video_url into the actual one.
  See the comments in KtPlayerHelper.convert_video_hash on how to find
  the original JS code.
This commit is contained in:
camel2314 2022-01-29 18:02:54 -05:00
commit 7c86551560
6 changed files with 456 additions and 0 deletions

View file

@ -136,10 +136,13 @@
- **BusinessInsider** - **BusinessInsider**
- **BuzzFeed** - **BuzzFeed**
- **BYUtv** - **BYUtv**
- **Cambro**
- **Camdemy** - **Camdemy**
- **CamdemyFolder** - **CamdemyFolder**
- **Camhub**
- **CamModels** - **CamModels**
- **CamTube** - **CamTube**
- **CamWhores**
- **CamWithHer** - **CamWithHer**
- **canalc2.tv** - **canalc2.tv**
- **Canalplus**: mycanal.fr and piwiplus.fr - **Canalplus**: mycanal.fr and piwiplus.fr
@ -644,6 +647,7 @@
- **NRKTVSeries** - **NRKTVSeries**
- **NRLTV** - **NRLTV**
- **ntv.ru** - **ntv.ru**
- **Nudespree**
- **Nuvid** - **Nuvid**
- **NYTimes** - **NYTimes**
- **NYTimesArticle** - **NYTimesArticle**

55
test/test_ktplayer.py Normal file
View file

@ -0,0 +1,55 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor.ktplayer import KtPlayerHelper
class TestKtPlayerHelper(unittest.TestCase):
    """Checks the kt_player licence-code hashing and video URL re-encoding."""

    def _check_all(self, func, cases):
        # Every case is (positional arguments, expected return value).
        for args, expected in cases:
            self.assertEqual(func(*args), expected)

    def test_kt_player_helper_lc(self):
        self._check_all(KtPlayerHelper._hash_kt_player_lic_code, (
            (('$385023312702592', ), '49618502835613441220119020166725'),
            (('$518170117095338', ), '62924140695851455899788411700698'),
        ))

    def test_kt_player_helper_hash_convert(self):
        self._check_all(KtPlayerHelper.convert_video_hash, (
            (('$385023312702592', 'bed397181d043299c43f63582406a20b'),
             '8b0bdf194430202ed49325c186633a79'),
            # Everything past the first 32 characters is carried over as is.
            (('$518170117095338', '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586'),
             'f34c6dff1f890e75b6b59422dde3b1acb7d3146586'),
        ))

    def test_get_url(self):
        page1 = """
        var flashvars = {
        license_code: '$385023312702592',
        video_url: 'http://example.com/get_file/2/bed397181d043299c43f63582406a20b/223000/223101/223101.mp4/',
        }
        """
        page2 = """
        var flashvars = {
        license_code: '$518170117095338',
        video_url: 'http://example.com/get_file/2/8b25b576ffbf46fa3dc91e34eddc2190b7d3146586/223000/223101/223101.mp4/',
        }
        """
        self._check_all(KtPlayerHelper.get_url, (
            ((page1, ),
             'http://example.com/get_file/2/8b0bdf194430202ed49325c186633a79/223000/223101/223101.mp4/'),
            ((page2, ),
             'http://example.com/get_file/2/f34c6dff1f890e75b6b59422dde3b1acb7d3146586/223000/223101/223101.mp4/'),
        ))


if __name__ == '__main__':
    unittest.main()

View file

@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests # Various small unit tests
import datetime
import io import io
import json import json
import xml.etree.ElementTree import xml.etree.ElementTree
@ -18,6 +19,7 @@ import xml.etree.ElementTree
from youtube_dl.utils import ( from youtube_dl.utils import (
age_restricted, age_restricted,
args_to_str, args_to_str,
date_from_ago,
encode_base_n, encode_base_n,
caesar, caesar,
clean_html, clean_html,
@ -1475,6 +1477,45 @@ Line 1
self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
def test_date_from_ago(self):
    """date_from_ago rejects unparsable input and dates valid offsets."""
    # Inputs without a recognisable "<number> <unit> ago" part.
    for rejected in (None, '', 'invalid', '1 microsecond ago', 'five days ago'):
        self.assertIsNone(date_from_ago(rejected))

    # Each phrase is paired with the timedelta arguments it should equal;
    # months and years are expected to count as 30 and 365 days.
    accepted = (
        ('1 minute ago', {'minutes': 1}),
        ('1 Minute Ago', {'minutes': 1}),
        ('2 minutes ago', {'minutes': 2}),
        ('1 hour ago', {'hours': 1}),
        ('2 hours ago', {'hours': 2}),
        ('5 days ago', {'days': 5}),
        ('2 months ago', {'days': 60}),
        ('10 years ago', {'days': 3650}),
    )
    for phrase, offset in accepted:
        self.assertEqual(
            date_from_ago(phrase),
            (datetime.datetime.utcnow() - datetime.timedelta(**offset)).strftime('%Y%m%d'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -547,6 +547,12 @@ from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE
from .konserthusetplay import KonserthusetPlayIE from .konserthusetplay import KonserthusetPlayIE
from .krasview import KrasViewIE from .krasview import KrasViewIE
from .ktplayer import (
CambroIE,
CamWhoresIE,
CamhubIE,
NudespreeIE,
)
from .ku6 import Ku6IE from .ku6 import Ku6IE
from .kusi import KUSIIE from .kusi import KUSIIE
from .kuwo import ( from .kuwo import (

View file

@ -0,0 +1,311 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
date_from_ago,
parse_duration,
url_or_none,
)
class KtPlayerHelper(object):
    """Utility functions for the video URL re-encoding performed by kt_player,
    which is used by cambro, camhub, etc.

    The player page carries a ``license_code`` and a scrambled ``video_url``;
    the helpers below derive a digit key from the licence code and use it to
    unscramble the hash component of the URL.
    """

    @staticmethod
    def _hash_kt_player_lic_code(code):
        """Derive the digit key from a licence code (extracted from obfuscated JS)

        in: '$385023312702592'
        out: '49618502835613441220119020166725'

        The first character of ``code`` is skipped and every following
        character must be a decimal digit (``int()`` raises ValueError
        otherwise). A falsy ``code`` is returned unchanged.
        """
        if not code:
            return code

        # Digits after the leading marker, with every zero replaced by one
        code_no_zeros = ''.join(str(int(ch) or 1) for ch in code[1:])

        mid = len(code_no_zeros) // 2
        left = int(code_no_zeros[:mid + 1])
        right = int(code_no_zeros[mid:])
        # abs(right - left) + abs(left - right), doubled
        shifts = str(4 * abs(right - left))

        # Every position i contributes four digits: the four code characters
        # that follow it, each shifted by shifts[i] modulo 10
        digits = []
        for i in range(mid + 1):
            for j in range(1, 5):
                digits.append(str((int(code[i + j]) + int(shifts[i])) % 10))
        return ''.join(digits)

    @staticmethod
    def convert_video_hash(lic_code, orig_hash, limit=32):
        """Video url hash converter extracted from obfuscated JS

        input '$385023312702592', 'bed397181d043299c43f63582406a20b'
        output '8b0bdf194430202ed49325c186633a79'
        input '$518170117095338', '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586'
        output 'f34c6dff1f890e75b6b59422dde3b1acb7d3146586'

        Only the first ``limit`` characters of ``orig_hash`` are permuted;
        the remainder is appended unchanged.

        In order to find a corresponding code in cambro.tv/camhub.com scripts
        do the following:
        1. Set a breakpoint at kt_start
        2. Execute in CDT console when triggered:
            flashvars._video_url = flashvars.video_url;
            Object.defineProperty(flashvars, 'video_url', {
                get: function () {
                    return flashvars._video_url;
                },
                set: function (value) {
                    debugger;
                    flashvars._video_url = value;
                }
            });
        3. The second break is where the url re-encoding happens
        """
        key = KtPlayerHelper._hash_kt_player_lic_code(lic_code)
        chars = list(orig_hash[:limit])
        size = len(chars)
        # Walk from the last position to the first, swapping each character
        # with the one found at (position + sum of key digits from that
        # position onwards), wrapped around the hash length
        for pos in range(size - 1, -1, -1):
            target = (pos + sum(int(d) for d in key[pos:])) % size
            chars[pos], chars[target] = chars[target], chars[pos]
        return ''.join(chars) + orig_hash[limit:]

    @staticmethod
    def get_url(webpage):
        """Return the real video URL found in a kt_player page.

        Returns None when the page is empty or lacks ``license_code`` or
        ``video_url``. Raises ExtractorError when the URL has fewer than six
        '/'-separated components starting from the scheme.
        """
        if not webpage:
            return None

        def search(pattern):
            mobj = re.search(pattern, webpage)
            return mobj.group(1) if mobj else None

        # extract video url
        license_code = search(r'license_code:\s+\'(.+?)\'')
        video_raw_url = search(r'video_url:\s+\'(.+?)\'')
        if not license_code or not video_raw_url:
            return None

        # decode a real video url
        parts = video_raw_url.split('/')
        video_pre_parts = []
        # cut some junk at the beginning: keep everything from the FIRST
        # component that starts with 'http', so that a later path component
        # (e.g. a file name beginning with 'http') cannot truncate the URL
        for idx, part in enumerate(parts):
            if part.startswith('http'):
                video_pre_parts = parts[idx:]
                break

        if len(video_pre_parts) < 6:
            # it is expected to be
            # http://example.com/get_file/2/1039a5cd2f433e4d41adf41e0afc1773/223000/223101/223101.mp4/
            # with a hash value as the component at index 5
            raise ExtractorError('url too short: %s' % (video_pre_parts, ))

        # convert video hash to a real one
        video_pre_parts[5] = KtPlayerHelper.convert_video_hash(
            license_code, video_pre_parts[5])
        return '/'.join(video_pre_parts)
class KtPlayerExtractor(InfoExtractor):
    """Base class for kt-player based websites.

    Supports both inlined and embedded usage variants.
    _DURATION_RE and _UPLOADED_RE class vars
    must be set in subclasses as needed; a field whose pattern is left
    as None is simply not extracted.

    _VALID_URL of a subclass must define the named groups
    'site', 'id' and 'title'.
    """
    _DURATION_RE = None
    _UPLOADED_RE = None

    @staticmethod
    def _split_csv(value):
        """Split a comma-separated flashvars value, dropping empty items."""
        return [item.strip() for item in (value or '').split(',') if item.strip()]

    def _kt_extract(self, url, embedded=False):
        """Build the info dict for a kt-player video page.

        url -- page URL matching the subclass' _VALID_URL
        embedded -- when true, the player variables are read from the
                    iframe referenced by the page instead of the page itself

        Raises ExtractorError for private videos and when no video URL
        can be derived from the player variables.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        title = mobj.group('title')
        website = mobj.group('site')

        webpage = self._download_webpage(url, video_id)
        if 'This video is a private video' in webpage:
            raise ExtractorError(
                'Video %s is private' % video_id, expected=True)

        flashdata = webpage
        if embedded:
            # find the iframe with a player
            iframe_src = self._html_search_regex(
                r'<div class="embed-wrap".+?<iframe.+?src="(.+?)"\s+.+?</iframe>',
                webpage, 'iframe')
            flashdata = self._download_webpage(
                iframe_src, video_id, headers={'Referer': website})

        video_url = KtPlayerHelper.get_url(flashdata)
        if not video_url:
            raise ExtractorError(
                'Failed to extract video url for %s' % video_id, expected=True)

        preview_url = url_or_none(self._html_search_regex(
            r'preview_url:\s+\'(.+?)\'', flashdata, 'preview_url', default=None))

        ext = self._html_search_regex(
            r"""postfix:\s+'(.+?)'""", flashdata, 'ext', fatal=False)
        if ext:
            # drop the first character; presumably postfix looks like
            # '.mp4' -- TODO confirm against the sites
            ext = ext[1:]

        # NOTE(review): the description is taken from the og:title tag,
        # while the title comes from the URL slug -- confirm this is intended
        description = self._og_search_title(webpage, fatal=False)

        # The third argument is the field name used in diagnostics, so it
        # must be the literal 'duration' and not the description text
        duration = None
        if self._DURATION_RE:
            duration = self._html_search_regex(
                self._DURATION_RE,
                webpage, 'duration', fatal=False, flags=re.DOTALL)

        # Missing values give empty lists rather than ['']
        categories = self._split_csv(self._html_search_regex(
            r'video_categories:\s+\'(.+?)\'',
            flashdata, 'categories', fatal=False, default=''))
        tags = self._split_csv(self._html_search_regex(
            r'video_tags:\s+\'(.+?)\'',
            flashdata, 'tags', fatal=False, default=''))

        upload_date = None
        if self._UPLOADED_RE:
            upload_date = date_from_ago(self._html_search_regex(
                self._UPLOADED_RE, webpage, 'upload_date',
                fatal=False, default=None))

        return {
            'id': video_id,
            'ext': ext,
            'title': title,
            'url': video_url,
            'description': description,
            'thumbnail': preview_url,
            'duration': parse_duration(duration),
            'categories': categories,
            'tags': tags,
            'upload_date': upload_date,
        }
class CambroIE(KtPlayerExtractor):
    """cambro.tv video pages; player variables are read from the page itself."""

    _VALID_URL = r'(?P<site>https?://(?:www\.)?cambro\.tv)/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    # Duration is the <em> that follows the <h1> inside the headline block
    _DURATION_RE = (
        r'<div class="headline">.+?<h1>.+?</h1>.+?'
        r'<span><em>((?:\d+:)?(?:\d+:)?\d+)</em></span>.+?</div>'
    )
    _UPLOADED_RE = r'<span><em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'https://www.cambro.tv/223101/artoftease-chaturbate-nude-cam-porn-video/',
        'md5': '4019439bae333f5cdb171807bf406abf',
        'info_dict': {
            'id': '223101',
            'ext': 'mp4',
            'title': 'artoftease-chaturbate-nude-cam-porn-video',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'categories': ['Chaturbate'],
            'duration': 1802.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        return self._kt_extract(url)
class CamWhoresIE(KtPlayerExtractor):
    """camwhores.tv and webpussi.com video pages; player variables are read
    from the page itself.
    """

    _VALID_URL = r'''(?x)
        (?P<site>https?://(?:www\.)?
            (?:
                (?:camwhores\.tv)|
                (?:webpussi\.com)
            )
        )/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'''

    _DURATION_RE = r'<span>Duration: <em>((?:\d+:)?(?:\d+:)?\d+)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TESTS = [
        {
            'url': 'https://www.camwhores.tv/videos/7195634/lizistrata-adammeva-vl-2b/',
            'md5': '6dd5ac7952cf1ac32d95bb44318c91d0',
            'info_dict': {
                'id': '7195634',
                'ext': 'mp4',
                'title': 'lizistrata-adammeva-vl-2b',
                'thumbnail': r're:^https?://.*\.jpg$',
                'description': r're:.*',
                'categories': ['CB'],
                'duration': 1387.0,
                'upload_date': r're:\d{8}',
            }
        },
        {
            'url': 'http://www.webpussi.com/videos/60725/aliska-dark-new-free-show-petite-teen-part-3/',
            'md5': '60b3ac7dd16be6bc1cf45d0285217718',
            'info_dict': {
                'id': '60725',
                'ext': 'mp4',
                'title': 'aliska-dark-new-free-show-petite-teen-part-3',
                'thumbnail': r're:^https?://.*\.jpg$',
                'description': r're:.*',
                'duration': 729.0,
                'upload_date': r're:\d{8}',
            }
        },
    ]

    def _real_extract(self, url):
        return self._kt_extract(url)
class CamhubIE(KtPlayerExtractor):
    """camhub.cc video pages; player variables are read from the embedded
    iframe.
    """

    _VALID_URL = r'(?P<site>https?://(?:www\.)?camhub\.cc)/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    # Duration is spelled out on this site, e.g. "<N>min <N>sec"
    _DURATION_RE = r'<span>Duration: <em>((?:\d+hour\s)?(?:\d+min\s)?\d+sec)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'http://www.camhub.cc/videos/48002/ehotlovea-skinny-hooker-private-show-ee59e3907cf1c935/',
        'md5': '6da44cc3148cad08243c78575b94b49f',
        'info_dict': {
            'id': '48002',
            'ext': 'mp4',
            'title': 'ehotlovea-skinny-hooker-private-show-ee59e3907cf1c935',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'duration': 853.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        return self._kt_extract(url, embedded=True)
class NudespreeIE(KtPlayerExtractor):
    """nudespree.com video pages; player variables are read from the embedded
    iframe.
    """

    _VALID_URL = r'(?P<site>https?://(?:www\.)?nudespree\.com)/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    _DURATION_RE = r'<span>Duration: <em>((?:\d+:)?(?:\d+:)?\d+)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'http://nudespree.com/videos/1048640/loloxxgocoffe-foryou-hot-brunette/',
        'md5': '67a759471cac087d0ad312d4d6d0bdd3',
        'info_dict': {
            'id': '1048640',
            'ext': 'mp4',
            'title': 'loloxxgocoffe-foryou-hot-brunette',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'duration': 528.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        return self._kt_extract(url, embedded=True)

View file

@ -5772,3 +5772,42 @@ def clean_podcast_url(url):
st\.fm # https://podsights.com/docs/ st\.fm # https://podsights.com/docs/
)/e )/e
)/''', '', url) )/''', '', url)
def date_from_ago(ago_str):
    """Converts strings like '2 months ago' into YYYYMMDD

    The string is searched (case-insensitively) for
    '<number> <unit> ago', where unit is minute(s), hour(s), day(s),
    week(s), month(s) or year(s). The offset is subtracted from the
    current UTC time; a month counts as 30 days and a year as 365 days.

    Returns None if fails: empty or unparsable input, a zero amount,
    or an offset that leads outside the supported date range.
    """
    if not ago_str:
        return None

    mobj = re.search(
        r'(?P<val>\d+)\s+(?P<unit>(?:years?)|(?:months?)|(?:weeks?)|(?:days?)|(?:hours?)|(?:minutes?))\s+ago',
        ago_str, flags=re.IGNORECASE)
    if not mobj:
        return None

    value = int(mobj.group('val'))
    unit = mobj.group('unit')
    if not unit or not value:
        return None

    # (unit prefix, timedelta keyword, multiplier); timedelta has no
    # month/year keywords, so those are approximated in days
    ago_units = (
        ('minute', 'minutes', 1),
        ('hour', 'hours', 1),
        ('day', 'days', 1),
        ('week', 'days', 7),
        ('month', 'days', 30),
        ('year', 'days', 365),
    )
    unit = unit.lower()
    for prefix, keyword, factor in ago_units:
        if not unit.startswith(prefix):
            continue
        try:
            delta = datetime.timedelta(**{keyword: factor * value})
            return (datetime.datetime.utcnow() - delta).strftime('%Y%m%d')
        except (OverflowError, ValueError):
            # The offset does not fit into timedelta/datetime, or the
            # resulting year cannot be formatted on this platform
            return None
    return None