From fb6e3f4389b74d273fb34b737b2c5f75bf864d0e Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Tue, 8 Mar 2022 23:49:10 +0900 Subject: [PATCH] [mildom] Rework extractors (#2940) Authored by: Lesmiscore --- yt_dlp/extractor/extractors.py | 1 + yt_dlp/extractor/mildom.py | 291 +++++++++++++++------------------ 2 files changed, 135 insertions(+), 157 deletions(-) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 5448acf01..e023a9802 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -848,6 +848,7 @@ from .microsoftvirtualacademy import ( from .mildom import ( MildomIE, MildomVodIE, + MildomClipIE, MildomUserVodIE, ) from .minds import ( diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py index b5a2e17f2..ab718acb2 100644 --- a/yt_dlp/extractor/mildom.py +++ b/yt_dlp/extractor/mildom.py @@ -1,102 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 -from datetime import datetime -import itertools +import functools import json from .common import InfoExtractor from ..utils import ( - update_url_query, - random_uuidv4, - try_get, + determine_ext, + dict_get, + ExtractorError, float_or_none, - dict_get -) -from ..compat import ( - compat_str, + OnDemandPagedList, + random_uuidv4, + traverse_obj, + update_url_query, ) class MildomBaseIE(InfoExtractor): _GUEST_ID = None - _DISPATCHER_CONFIG = None - def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', init=False): - query = query or {} - if query: - query['__platform'] = 'web' - url = update_url_query(url, self._common_queries(query, init=init)) - content = self._download_json(url, video_id, note=note) - if content['code'] == 0: - return content['body'] - else: - self.raise_no_formats( - f'Video not found or premium content. 
{content["code"]} - {content["message"]}', + def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): + if not self._GUEST_ID: + self._GUEST_ID = f'pc-gp-{random_uuidv4()}' + + content = self._download_json( + url, video_id, note=note, data=json.dumps(body).encode() if body else None, + headers={'Content-Type': 'application/json'} if body else {}, + query={ + '__guest_id': self._GUEST_ID, + '__platform': 'web', + **(query or {}), + }) + + if content['code'] != 0: + raise ExtractorError( + f'Mildom says: {content["message"]} (code {content["code"]})', expected=True) - - def _common_queries(self, query={}, init=False): - dc = self._fetch_dispatcher_config() - r = { - 'timestamp': self.iso_timestamp(), - '__guest_id': '' if init else self.guest_id(), - '__location': dc['location'], - '__country': dc['country'], - '__cluster': dc['cluster'], - '__platform': 'web', - '__la': self.lang_code(), - '__pcv': 'v2.9.44', - 'sfr': 'pc', - 'accessToken': '', - } - r.update(query) - return r - - def _fetch_dispatcher_config(self): - if not self._DISPATCHER_CONFIG: - tmp = self._download_json( - 'https://disp.mildom.com/serverListV2', 'initialization', - note='Downloading dispatcher_config', data=json.dumps({ - 'protover': 0, - 'data': base64.b64encode(json.dumps({ - 'fr': 'web', - 'sfr': 'pc', - 'devi': 'Windows', - 'la': 'ja', - 'gid': None, - 'loc': '', - 'clu': '', - 'wh': '1919*810', - 'rtm': self.iso_timestamp(), - 'ua': self.get_param('http_headers')['User-Agent'], - }).encode('utf8')).decode('utf8').replace('\n', ''), - }).encode('utf8')) - self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization') - return self._DISPATCHER_CONFIG - - @staticmethod - def iso_timestamp(): - 'new Date().toISOString()' - return datetime.utcnow().isoformat()[0:-3] + 'Z' - - def guest_id(self): - 'getGuestId' - if self._GUEST_ID: - return self._GUEST_ID - self._GUEST_ID = try_get( - self, ( - lambda x: x._call_api( - 
'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization', - note='Downloading guest token', init=True)['guest_id'] or None, - lambda x: x._get_cookies('https://www.mildom.com').get('gid').value, - lambda x: x._get_cookies('https://m.mildom.com').get('gid').value, - ), compat_str) or '' - return self._GUEST_ID - - def lang_code(self): - 'getCurrentLangCode' - return 'ja' + return content['body'] class MildomIE(MildomBaseIE): @@ -106,31 +47,13 @@ class MildomIE(MildomBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://www.mildom.com/%s' % video_id - - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) enterstudio = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, note='Downloading live metadata', query={'user_id': video_id}) result_video_id = enterstudio.get('log_id', video_id) - title = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:description', webpage), - lambda x: x['anchor_intro'], - ), compat_str) - description = try_get( - enterstudio, ( - lambda x: x['intro'], - lambda x: x['live_intro'], - ), compat_str) - uploader = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:title', webpage), - lambda x: x['loginname'], - ), compat_str) - servers = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, note='Downloading live server list', query={ @@ -138,17 +61,20 @@ class MildomIE(MildomBaseIE): 'live_server_type': 'hls', }) - stream_query = self._common_queries({ - 'streamReqId': random_uuidv4(), - 'is_lhls': '0', - }) - m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query) - formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }, note='Downloading m3u8 
information') + playback_token = self._call_api( + 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, + note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) + playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) + if not playback_token: + raise ExtractorError('Failed to obtain live playback token') + + formats = self._extract_m3u8_formats( + f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', + result_video_id, 'mp4', headers={ + 'Referer': 'https://www.mildom.com/', + 'Origin': 'https://www.mildom.com', + }) - del stream_query['streamReqId'], stream_query['timestamp'] for fmt in formats: fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' @@ -156,10 +82,10 @@ class MildomIE(MildomBaseIE): return { 'id': result_video_id, - 'title': title, - 'description': description, + 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), + 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), - 'uploader': uploader, + 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), 'uploader_id': video_id, 'formats': formats, 'is_live': True, @@ -168,7 +94,7 @@ class MildomIE(MildomBaseIE): class MildomVodIE(MildomBaseIE): IE_NAME = 'mildom:vod' - IE_DESC = 'Download a VOD in Mildom' + IE_DESC = 'VOD in Mildom' _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' _TESTS = [{ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', @@ -215,11 +141,8 @@ class MildomVodIE(MildomBaseIE): }] def _real_extract(self, url): - m = self._match_valid_url(url) - user_id, video_id = m.group('user_id'), m.group('id') - url = 'https://www.mildom.com/playback/%s/%s' % (user_id, 
video_id) - - webpage = self._download_webpage(url, video_id) + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id) autoplay = self._call_api( 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, @@ -227,20 +150,6 @@ class MildomVodIE(MildomBaseIE): 'v_id': video_id, })['playback'] - title = try_get( - autoplay, ( - lambda x: self._html_search_meta('og:description', webpage), - lambda x: x['title'], - ), compat_str) - description = try_get( - autoplay, ( - lambda x: x['video_intro'], - ), compat_str) - uploader = try_get( - autoplay, ( - lambda x: x['author_info']['login_name'], - ), compat_str) - formats = [{ 'url': autoplay['audio_url'], 'format_id': 'audio', @@ -265,17 +174,81 @@ class MildomVodIE(MildomBaseIE): return { 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': float_or_none(autoplay['publish_time'], scale=1000), - 'duration': float_or_none(autoplay['video_length'], scale=1000), + 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), + 'description': traverse_obj(autoplay, 'video_intro'), + 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000), + 'duration': float_or_none(autoplay.get('video_length'), scale=1000), 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')), - 'uploader': uploader, + 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')), 'uploader_id': user_id, 'formats': formats, } +class MildomClipIE(MildomBaseIE): + IE_NAME = 'mildom:clip' + IE_DESC = 'Clip in Mildom' + _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9', + 'info_dict': { + 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9', + 'title': '全然違ったよ', + 'timestamp': 1619181890, 
'duration': 59, + 'thumbnail': r're:https?://.+', + 'uploader': 'ざきんぽ', + 'uploader_id': '10042245', + }, + }, { + 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864', + 'info_dict': { + 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864', + 'title': 'かっこいい', + 'timestamp': 1621094003, + 'duration': 59, + 'thumbnail': r're:https?://.+', + 'uploader': '(ルーキー', + 'uploader_id': '10111524', + }, + }, { + 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', + 'info_dict': { + 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', + 'title': 'あ', + 'timestamp': 1614769431, + 'duration': 31, + 'thumbnail': r're:https?://.+', + 'uploader': 'ドルゴルスレンギーン=ダグワドルジ', + 'uploader_id': '10660174', + }, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id) + + clip_detail = self._call_api( + 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id, + note='Downloading playback metadata', query={ + 'clip_id': video_id, + }) + + return { + 'id': video_id, + 'title': self._html_search_meta( + ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'), + 'timestamp': float_or_none(clip_detail.get('create_time')), + 'duration': float_or_none(clip_detail.get('length')), + 'thumbnail': clip_detail.get('cover'), + 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')), + 'uploader_id': user_id, + + 'url': clip_detail['url'], + 'ext': determine_ext(clip_detail.get('url'), 'mp4'), + } + + class MildomUserVodIE(MildomBaseIE): IE_NAME = 'mildom:user:vod' IE_DESC = 'Download all VODs from specific user in Mildom' @@ -286,29 +259,32 @@ class MildomUserVodIE(MildomBaseIE): 'id': '10093333', 'title': 'Uploads from ねこばたけ', }, - 'playlist_mincount': 351, + 'playlist_mincount': 732, }, { 'url': 'https://www.mildom.com/profile/10882672', 'info_dict': { 
'id': '10882672', 'title': 'Uploads from kson組長(けいそん)', }, - 'playlist_mincount': 191, + 'playlist_mincount': 201, }] - def _entries(self, user_id): - for page in itertools.count(1): - reply = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', - user_id, note='Downloading page %d' % page, query={ - 'user_id': user_id, - 'page': page, - 'limit': '30', - }) - if not reply: - break - for x in reply: - yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id'])) + def _fetch_page(self, user_id, page): + page += 1 + reply = self._call_api( + 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', + user_id, note=f'Downloading page {page}', query={ + 'user_id': user_id, + 'page': page, + 'limit': '30', + }) + if not reply: + return + for x in reply: + v_id = x.get('v_id') + if not v_id: + continue + yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}') def _real_extract(self, url): user_id = self._match_id(url) @@ -319,4 +295,5 @@ class MildomUserVodIE(MildomBaseIE): query={'user_id': user_id}, note='Downloading user profile')['user_info'] return self.playlist_result( - self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname']) + OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30), + user_id, f'Uploads from {profile["loginname"]}')