From 4f3637930b5eea85d7ada3cb4c07fb651e7b6b9b Mon Sep 17 00:00:00 2001 From: zwolf13 Date: Wed, 2 Nov 2022 23:42:16 -0400 Subject: [PATCH] Update tiktok.py --- youtube_dl/extractor/tiktok.py | 168 +++++++++++++++++++++++++++------ 1 file changed, 138 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 4faa6de54..a1b91520b 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,11 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .generic import GenericIE +from ..compat import ( + compat_kwargs, +) from ..utils import ( - compat_str, + dict_get, ExtractorError, float_or_none, + get_element_by_id, int_or_none, str_or_none, try_get, @@ -13,12 +20,44 @@ from ..utils import ( ) +# decorator enforces UA that TT doesn't block +def vanilla_UA_request(func): + + vanilla_UA = 'Mozilla/5.0' + + def wrapped(*args, **kwargs): + headers = kwargs.get('headers', {}) + if 'User-Agent' not in headers: + headers['User-Agent'] = vanilla_UA + kwargs.update({'headers': headers, }) + kwargs = compat_kwargs(kwargs) + return func(*args, **kwargs) + + return wrapped + + class TikTokBaseIE(InfoExtractor): + IE_DESC = 'Abstract base for TikTok extractors' + IE_NAME = 'tiktok:base' + + @vanilla_UA_request + def _request_webpage(self, *args, **kwargs): + return super(TikTokBaseIE, self)._request_webpage(*args, **kwargs) + + def _get_SIGI_STATE(self, video_id, html): + state = self._parse_json( + get_element_by_id('SIGI_STATE', html) + or self._search_regex( + r'''(?s)]*?\bid\s*=\s*(?P"|'|\b)sigi-persisted-data(?P=q)[^>]*>[^=]*=\s*(?P{.+?})\s*(?:;[^<]+)?\d+)' + IE_DESC = 'TikTok video extractor' + IE_NAME = 'tiktok' + _VALID_URL = r'(?:https?://(?:(?:www|m)\.)?tiktok\.com/@[^/]+/video/|tiktok:(?P[^/?#&]+):)(?P\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@zureeal/video/6606727368545406213', 'md5': '163ceff303bb52de60e6887fe399e6cd', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', - 'title': 'Zureeal', + 'title': 'md5:363e08ccb6c691314710429f379bffe5', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'thumbnail': r're:^https?://.*', 'duration': 15, - 'uploader': 'Zureeal', + 'uploader': 'md5:363e08ccb6c691314710429f379bffe5', 'uploader_id': '188294915489964032', 'timestamp': 1538248586, 'upload_date': '20180929', @@ -105,34 +148,75 @@ class TikTokIE(TikTokBaseIE): 'https://www.tiktok.com/', None, note='Setting up session') def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - page_props = self._parse_json(self._search_regex( - r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*[^/?#&]+)' +class TikTokUserIE(TikTokIE): + IE_DESC = 'TikTok user profile extractor' + IE_NAME = 'tiktok:user' + _VALID_URL = r'https?://(?:(?:www|m)\.)?tiktok\.com/@(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@zureeal', 'info_dict': { 'id': '188294915489964032', }, - 'playlist_mincount': 24, + 'playlist_mincount': 30, + 'expected_warnings': [ + 'More videos are available', + ], }] - _WORKING = False @classmethod def suitable(cls, url): - return False if TikTokIE.suitable(url) else super(TikTokUserIE, cls).suitable(url) + return False if TikTokIE.suitable(url) else super(TikTokBaseIE, cls).suitable(url) def _real_extract(self, url): user_id = self._match_id(url) + + webpage = self._download_webpage(url.replace('://m.', '://www.'), user_id) + + page_props = self._get_SIGI_STATE(user_id, webpage) + user_data = try_get(page_props, lambda x: x['UserModule']['users'], dict) + if user_data: + num_id = try_get( + user_data.values(), + lambda x: [user['id'] for user in x if user['uniqueId'] == user_id and user['id'].isdigit()][0]) + item_data = try_get(page_props, lambda x: x['ItemList']['user-post'], dict) or {} + item_list = dict_get(item_data, ('list', 'browserList')) + if not item_list: + item_list = try_get(item_data, lambda x: [y['id'] for y in x['preloadList']], list) + entries = ['tiktok:%s:%s' % (user_id, x, ) for x in item_list or []] + if item_data.get('hasMore', False): + self._downloader.report_warning('More videos are available but the current extractor doesn\'t know how to find them') + + result = entries and self.playlist_from_matches(entries, num_id, ie='TikTok') + if result: + result['display_id'] = user_id + return result + + """ + # this no longer works 2022-01 data = self._download_json( 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, query={'_signature': '_'}) @@ -145,3 +229,27 @@ class TikTokUserIE(TikTokBaseIE): entry['extractor_key'] = TikTokIE.ie_key() entries.append(entry) return self.playlist_result(entries, user_id) + """ + + +class TikTokVMIE(GenericIE): + IE_DESC = 'Resolver for TikTok shortcuts' + IE_NAME = 'tiktok:shortcut' + _VALID_URL = r'https?://(?:vm\.tiktok\.com|m\.tiktok\.com/v)/(?P[^/?#&.]+)' + _TESTS = [{ + 'url': 'https://vm.tiktok.com/ZMLesneqK/', + 'info_dict': { + 'id': '7054218882072055046', + 'ext': 'mp4', + 'title': 'EddY', + 'upload_date': '20220117', + 'description': 'Hilft bestimmt gegen nervige Anrufer! 😂 #telefon #call #prank #fail #sprecher #stimme #voice #band #ansage #sound #comedy #unterhaltung #scammer #fy', + 'timestamp': 1642438324, + 'uploader': 'EddY', + 'uploader_id': '6850021004246467590', + }, + }] + + @vanilla_UA_request + def _request_webpage(self, *args, **kwargs): + return super(GenericIE, self)._request_webpage(*args, **kwargs) \ No newline at end of file