Add support for cambro.tv and others

* Added new extractors
* The video_url received from the page is obfuscated; a re-encoding
  algorithm is used to convert it to the actual one.
  See comments in KtPlayerHelper.convert_video_hash on how to find
  the original JS code
This commit is contained in:
camel2314 2022-01-29 18:02:54 -05:00
commit 7c86551560
6 changed files with 456 additions and 0 deletions

View file

@ -136,10 +136,13 @@
- **BusinessInsider**
- **BuzzFeed**
- **BYUtv**
- **Cambro**
- **Camdemy**
- **CamdemyFolder**
- **Camhub**
- **CamModels**
- **CamTube**
- **CamWhores**
- **CamWithHer**
- **canalc2.tv**
- **Canalplus**: mycanal.fr and piwiplus.fr
@ -644,6 +647,7 @@
- **NRKTVSeries**
- **NRLTV**
- **ntv.ru**
- **Nudespree**
- **Nuvid**
- **NYTimes**
- **NYTimesArticle**

55
test/test_ktplayer.py Normal file
View file

@ -0,0 +1,55 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor.ktplayer import KtPlayerHelper
class TestKtPlayerHelper(unittest.TestCase):
    """Known-answer tests for the kt_player URL de-obfuscation helpers."""

    def test_kt_player_helper_lc(self):
        # (license code, digit string derived from it)
        known_answers = (
            ('$385023312702592', '49618502835613441220119020166725'),
            ('$518170117095338', '62924140695851455899788411700698'),
        )
        for lic_code, expected in known_answers:
            self.assertEqual(
                KtPlayerHelper._hash_kt_player_lic_code(lic_code), expected)

    def test_kt_player_helper_hash_convert(self):
        # (license code, hash as served by the page, hash after conversion);
        # the second case has a hash longer than the 32-character limit.
        known_answers = (
            ('$385023312702592',
             'bed397181d043299c43f63582406a20b',
             '8b0bdf194430202ed49325c186633a79'),
            ('$518170117095338',
             '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586',
             'f34c6dff1f890e75b6b59422dde3b1acb7d3146586'),
        )
        for lic_code, served_hash, expected in known_answers:
            self.assertEqual(
                KtPlayerHelper.convert_video_hash(lic_code, served_hash),
                expected)

    def test_get_url(self):
        # Minimal page fragments carrying only the two flashvars that
        # get_url() looks for.
        page1 = (
            "\n"
            "var flashvars = {\n"
            "license_code: '$385023312702592',\n"
            "video_url: 'http://example.com/get_file/2/bed397181d043299c43f63582406a20b/223000/223101/223101.mp4/',\n"
            "}\n"
        )
        self.assertEqual(
            KtPlayerHelper.get_url(page1),
            'http://example.com/get_file/2/8b0bdf194430202ed49325c186633a79/223000/223101/223101.mp4/')

        page2 = (
            "\n"
            "var flashvars = {\n"
            "license_code: '$518170117095338',\n"
            "video_url: 'http://example.com/get_file/2/8b25b576ffbf46fa3dc91e34eddc2190b7d3146586/223000/223101/223101.mp4/',\n"
            "}\n"
        )
        self.assertEqual(
            KtPlayerHelper.get_url(page2),
            'http://example.com/get_file/2/f34c6dff1f890e75b6b59422dde3b1acb7d3146586/223000/223101/223101.mp4/')
if __name__ == '__main__':
unittest.main()

View file

@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
import datetime
import io
import json
import xml.etree.ElementTree
@ -18,6 +19,7 @@ import xml.etree.ElementTree
from youtube_dl.utils import (
age_restricted,
args_to_str,
date_from_ago,
encode_base_n,
caesar,
clean_html,
@ -1475,6 +1477,45 @@ Line 1
self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
def test_date_from_ago(self):
    # Inputs that carry no recognisable "<number> <unit> ago" phrase.
    for unparsable in (None, '', 'invalid', '1 microsecond ago', 'five days ago'):
        self.assertIsNone(date_from_ago(unparsable))

    # (input string, timedelta keyword arguments the input should map to);
    # months and years are expected to count as 30 and 365 days.
    expectations = (
        ('1 minute ago', {'minutes': 1}),
        ('1 Minute Ago', {'minutes': 1}),
        ('2 minutes ago', {'minutes': 2}),
        ('1 hour ago', {'hours': 1}),
        ('2 hours ago', {'hours': 2}),
        ('5 days ago', {'days': 5}),
        ('2 months ago', {'days': 60}),
        ('10 years ago', {'days': 3650}),
    )
    for ago_str, delta_kwargs in expectations:
        self.assertEqual(
            date_from_ago(ago_str),
            (datetime.datetime.utcnow() - datetime.timedelta(**delta_kwargs)).strftime('%Y%m%d'))
if __name__ == '__main__':
unittest.main()

View file

@ -547,6 +547,12 @@ from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
from .konserthusetplay import KonserthusetPlayIE
from .krasview import KrasViewIE
from .ktplayer import (
CambroIE,
CamWhoresIE,
CamhubIE,
NudespreeIE,
)
from .ku6 import Ku6IE
from .kusi import KUSIIE
from .kuwo import (

View file

@ -0,0 +1,311 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
date_from_ago,
parse_duration,
url_or_none,
)
class KtPlayerHelper(object):
    """Utility functions for the video URL re-encoding performed by
    kt_player, which is used by cambro, camhub, etc.

    All members are static; the class only serves as a namespace.
    """

    @staticmethod
    def _hash_kt_player_lic_code(code):
        """Derive the digit string that drives the hash permutation.

        Algorithm extracted from obfuscated JS.
        in: '$385023312702592'
        out: '49618502835613441220119020166725'

        The first character of ``code`` is skipped; every following
        character must be a decimal digit (``int()`` raises ValueError
        otherwise). An empty/None ``code`` is returned unchanged.
        """
        if not code:
            return code

        # Digits of the code with every 0 replaced by 1.
        code_no_zeros = ''.join(
            str(digit) if digit else '1'
            for digit in (int(ch) for ch in code[1:]))

        # The two halves overlap on the middle digit.
        mid = len(code_no_zeros) // 2
        left = int(code_no_zeros[:mid + 1])
        right = int(code_no_zeros[mid:])
        # abs(right - left) + abs(left - right), doubled
        val = str(4 * abs(left - right))

        base = 10
        result = []
        for i in range(mid + 1):
            # NOTE(review): val[i] raises IndexError when val has fewer than
            # mid + 1 digits (halves that are numerically close); behaviour
            # of the original JS for that case is not known -- confirm.
            shift = int(val[i])
            for j in range(1, 5):
                result.append(str((int(code[i + j]) + shift) % base))
        return ''.join(result)

    @staticmethod
    def convert_video_hash(lic_code, orig_hash, limit=32):
        """Video url hash converter extracted from obfuscated JS

        input '$385023312702592', 'bed397181d043299c43f63582406a20b'
        output '8b0bdf194430202ed49325c186633a79'
        input '$518170117095338', '8b25b576ffbf46fa3dc91e34eddc2190b7d3146586'
        output 'f34c6dff1f890e75b6b59422dde3b1acb7d3146586'

        Only the first ``limit`` characters of ``orig_hash`` are permuted;
        anything after them is appended unchanged. When no digit string can
        be derived from ``lic_code`` (empty/None), ``orig_hash`` is returned
        as is.

        In order to find a corresponding code in cambro.tv/camhub.com scripts
        do the following:
        1. Set a breakpoint at kt_start
        2. Execute in CDT console when triggered:
            flashvars._video_url = flashvars.video_url;
            Object.defineProperty(flashvars, 'video_url', {
                get: function () {
                    return flashvars._video_url;
                },
                set: function (value) {
                    debugger;
                    flashvars._video_url = value;
                }
            });
        3. The second break is where the url re-encoding happens
        """
        token = KtPlayerHelper._hash_kt_player_lic_code(lic_code)
        if not token:
            return orig_hash

        shifts = [int(ch) for ch in token]
        chars = list(orig_hash[:limit])
        size = len(chars)
        # Walk from the last position to the first, swapping each character
        # with the one found by advancing by the sum of the token digits from
        # this position onwards (wrapping around the permuted part).
        for src in range(size - 1, -1, -1):
            dst = (src + sum(shifts[src:])) % size
            chars[src], chars[dst] = chars[dst], chars[src]
        return ''.join(chars) + orig_hash[limit:]

    @staticmethod
    def get_url(webpage):
        """Return the playable video URL found in ``webpage``.

        Reads the ``license_code`` and ``video_url`` flashvars from the page
        text and replaces the hash component of the URL with its converted
        value. Returns None when either flashvar is missing.

        Raises ExtractorError if the URL has fewer than six '/'-separated
        components counted from its scheme.
        """
        def search(pattern):
            mobj = re.search(pattern, webpage)
            return mobj.group(1) if mobj else None

        # extract video url
        license_code = search(r'license_code:\s+\'(.+?)\'')
        video_raw_url = search(r'video_url:\s+\'(.+?)\'')
        if not license_code or not video_raw_url:
            return None

        # decode a real video url
        parts = video_raw_url.split('/')
        video_pre_parts = []
        # cut some junk at the beginning: keep everything from the FIRST
        # component that starts with 'http'. Later components (e.g. a file
        # name beginning with 'http') must not be mistaken for the scheme.
        for idx, part in enumerate(parts):
            if part.startswith('http'):
                video_pre_parts = parts[idx:]
                break

        if len(video_pre_parts) < 6:
            # it is expected to be
            # http://example.com/get_file/2/1039a5cd2f433e4d41adf41e0afc1773/223000/223101/223101.mp4/
            # with a hash value as the component at index 5
            raise ExtractorError('url too short: %s' % (video_pre_parts, ))

        # convert video hash to a real one
        video_pre_parts[5] = KtPlayerHelper.convert_video_hash(
            license_code, video_pre_parts[5])
        return '/'.join(video_pre_parts)
class KtPlayerExtractor(InfoExtractor):
    """Base class for kt-player based websites.

    Supports both inlined and embedded usage variants.
    _DURATION_RE and _UPLOADED_RE class vars
    must be set in subclasses as needed; while left as None the
    corresponding field is simply not extracted.

    Subclasses must define _VALID_URL with the named groups
    'site', 'id' and 'title'.
    """
    _DURATION_RE = None
    _UPLOADED_RE = None

    def _kt_extract(self, url, embedded=False):
        """Build the info dict for a kt-player video page.

        url      -- page URL; must match self._VALID_URL
        embedded -- when True the player variables are read from the iframe
                    found inside the 'embed-wrap' div instead of the page
                    itself

        Raises ExtractorError (expected) for private videos and when no
        video URL can be derived from the player variables.
        """
        def split_list(value):
            # 'a,b' -> ['a', 'b']; a missing value yields [] rather than ['']
            return [item.strip() for item in (value or '').split(',') if item.strip()]

        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        title = mobj.group('title')
        website = mobj.group('site')

        webpage = self._download_webpage(url, video_id)
        if 'This video is a private video' in webpage:
            raise ExtractorError(
                'Video %s is private' % video_id, expected=True)

        flashdata = webpage
        if embedded:
            # find the iframe with a player
            iframe_src = self._html_search_regex(
                r'<div class="embed-wrap".+?<iframe.+?src="(.+?)"\s+.+?</iframe>',
                webpage, 'iframe')
            flashdata = self._download_webpage(
                iframe_src, video_id, headers={'Referer': website})

        video_url = KtPlayerHelper.get_url(flashdata)
        if not video_url:
            raise ExtractorError(
                'Failed to extract video url for %s' % video_id, expected=True)

        preview_url = url_or_none(self._html_search_regex(
            r'preview_url:\s+\'(.+?)\'', flashdata, 'preview_url', default=None))

        ext = self._html_search_regex(
            r"""postfix:\s+'(.+?)'""", flashdata, 'ext', fatal=False)
        if ext:
            # drop the first character of the postfix
            # (presumably the leading dot of e.g. '.mp4' -- TODO confirm)
            ext = ext[1:]

        description = self._og_search_title(webpage, fatal=False)

        duration = None
        if self._DURATION_RE:
            duration = self._html_search_regex(
                self._DURATION_RE,
                webpage, 'duration', fatal=False, flags=re.DOTALL)

        categories = split_list(self._html_search_regex(
            r'video_categories:\s+\'(.+?)\'',
            flashdata, 'categories', default=''))

        tags = split_list(self._html_search_regex(
            r'video_tags:\s+\'(.+?)\'',
            flashdata, 'tags', default=''))

        upload_date = None
        if self._UPLOADED_RE:
            upload_date = date_from_ago(self._html_search_regex(
                self._UPLOADED_RE, webpage, 'upload_date', default=None))

        return {
            'id': video_id,
            'ext': ext,
            'title': title,
            'url': video_url,
            'description': description,
            'thumbnail': preview_url,
            'duration': parse_duration(duration),
            'categories': categories,
            'tags': tags,
            'upload_date': upload_date,
        }
class CambroIE(KtPlayerExtractor):
    # cambro.tv: player variables are read from the video page itself.
    _VALID_URL = r'(?P<site>https?://(?:www\.)?cambro\.tv)/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    # Duration is the <em> value following the headline's <h1>,
    # formatted as [[H:]M:]S.
    _DURATION_RE = (
        r'<div class="headline">.+?<h1>.+?</h1>.+?'
        r'<span><em>((?:\d+:)?(?:\d+:)?\d+)</em></span>.+?</div>'
    )
    # Upload time is shown relatively ('... ago').
    _UPLOADED_RE = r'<span><em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'https://www.cambro.tv/223101/artoftease-chaturbate-nude-cam-porn-video/',
        'md5': '4019439bae333f5cdb171807bf406abf',
        'info_dict': {
            'id': '223101',
            'ext': 'mp4',
            'title': 'artoftease-chaturbate-nude-cam-porn-video',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'categories': ['Chaturbate'],
            'duration': 1802.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        # inlined player: no iframe lookup needed
        return self._kt_extract(url)
class CamWhoresIE(KtPlayerExtractor):
    # camwhores.tv and webpussi.com share the same page layout; player
    # variables are read from the video page itself.
    _VALID_URL = r'''(?x)
        (?P<site>https?://(?:www\.)?
            (?:
                (?:camwhores\.tv)|
                (?:webpussi\.com)
            )
        )/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'''

    # 'Duration: [[H:]M:]S' and 'Submitted: ... ago' info spans.
    _DURATION_RE = r'<span>Duration: <em>((?:\d+:)?(?:\d+:)?\d+)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TESTS = [
        {
            'url': 'https://www.camwhores.tv/videos/7195634/lizistrata-adammeva-vl-2b/',
            'md5': '6dd5ac7952cf1ac32d95bb44318c91d0',
            'info_dict': {
                'id': '7195634',
                'ext': 'mp4',
                'title': 'lizistrata-adammeva-vl-2b',
                'thumbnail': r're:^https?://.*\.jpg$',
                'description': r're:.*',
                'categories': ['CB'],
                'duration': 1387.0,
                'upload_date': r're:\d{8}',
            }
        },
        {
            'url': 'http://www.webpussi.com/videos/60725/aliska-dark-new-free-show-petite-teen-part-3/',
            'md5': '60b3ac7dd16be6bc1cf45d0285217718',
            'info_dict': {
                'id': '60725',
                'ext': 'mp4',
                'title': 'aliska-dark-new-free-show-petite-teen-part-3',
                'thumbnail': r're:^https?://.*\.jpg$',
                'description': r're:.*',
                'duration': 729.0,
                'upload_date': r're:\d{8}',
            }
        },
    ]

    def _real_extract(self, url):
        # inlined player: no iframe lookup needed
        return self._kt_extract(url)
class CamhubIE(KtPlayerExtractor):
    # camhub.cc: the player lives in an iframe, so extraction runs in
    # embedded mode.
    _VALID_URL = r'(?P<site>https?://(?:www\.)?camhub\.cc)/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    # Duration is spelled out as '<n>hour <n>min <n>sec' (leading parts
    # optional); upload time is shown relatively ('... ago').
    _DURATION_RE = r'<span>Duration: <em>((?:\d+hour\s)?(?:\d+min\s)?\d+sec)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'http://www.camhub.cc/videos/48002/ehotlovea-skinny-hooker-private-show-ee59e3907cf1c935/',
        'md5': '6da44cc3148cad08243c78575b94b49f',
        'info_dict': {
            'id': '48002',
            'ext': 'mp4',
            'title': 'ehotlovea-skinny-hooker-private-show-ee59e3907cf1c935',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'duration': 853.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        # embedded player: flashvars are fetched from the iframe document
        return self._kt_extract(url, embedded=True)
class NudespreeIE(KtPlayerExtractor):
    # nudespree.com: the player lives in an iframe, so extraction runs in
    # embedded mode.
    _VALID_URL = r'(?P<site>https?://(?:www\.)?nudespree\.com)/videos/(?P<id>[0-9]+)/(?P<title>[^/?#&]+)/'

    # 'Duration: [[H:]M:]S' and 'Submitted: ... ago' info spans.
    _DURATION_RE = r'<span>Duration: <em>((?:\d+:)?(?:\d+:)?\d+)</em></span>'
    _UPLOADED_RE = r'<span>Submitted:\s+<em>(.+?\s+ago)</em></span>'

    _TEST = {
        'url': 'http://nudespree.com/videos/1048640/loloxxgocoffe-foryou-hot-brunette/',
        'md5': '67a759471cac087d0ad312d4d6d0bdd3',
        'info_dict': {
            'id': '1048640',
            'ext': 'mp4',
            'title': 'loloxxgocoffe-foryou-hot-brunette',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': r're:.*',
            'duration': 528.0,
            'upload_date': r're:\d{8}',
        }
    }

    def _real_extract(self, url):
        # embedded player: flashvars are fetched from the iframe document
        return self._kt_extract(url, embedded=True)

View file

@ -5772,3 +5772,42 @@ def clean_podcast_url(url):
st\.fm # https://podsights.com/docs/
)/e
)/''', '', url)
def date_from_ago(ago_str):
    """Converts strings like '2 months ago' into YYYYMMDD

    The first '<number> <unit> ago' phrase found in ago_str is used, where
    unit is one of minute(s), hour(s), day(s), week(s), month(s), year(s)
    (case-insensitive). A month counts as 30 days and a year as 365 days;
    the result is relative to the current UTC time.

    Returns None if fails: empty input, no such phrase, a zero amount, or
    an amount so large that the resulting date cannot be represented.
    """
    if not ago_str:
        return None

    mobj = re.search(
        r'(?P<val>\d+)\s+(?P<unit>year|month|week|day|hour|minute)s?\s+ago',
        ago_str, flags=re.IGNORECASE)
    if not mobj:
        return None

    value = int(mobj.group('val'))
    if not value:
        return None

    # unit -> (timedelta keyword, multiplier)
    ago_units = {
        'minute': ('minutes', 1),
        'hour': ('hours', 1),
        'day': ('days', 1),
        'week': ('days', 7),
        'month': ('days', 30),
        'year': ('days', 365),
    }
    spec = ago_units.get(mobj.group('unit').lower())
    if not spec:
        return None
    keyword, factor = spec

    try:
        delta = datetime.timedelta(**{keyword: factor * value})
        return (datetime.datetime.utcnow() - delta).strftime('%Y%m%d')
    except OverflowError:
        # amount exceeds what timedelta/datetime can represent
        return None