[VIKI] Rewrite extractors (#475)

Closes #462
Also added extractor-arg `video_types` to `vikichannel`

Co-authored-by: zackmark29, pukkandan
This commit is contained in:
zackmark29 2021-07-10 04:38:09 +08:00 committed by GitHub
parent 60bdb7bd9e
commit 73d829c144
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 114 additions and 205 deletions

View file

@ -1340,6 +1340,9 @@ # EXTRACTOR ARGUMENTS
* `language`: Languages to extract. Eg: `funimation:language=english,japanese`
* `version`: The video version to extract - `uncut` or `simulcast`
* **vikiChannel**
* `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers`
NOTE: These options may be changed/removed in the future without concern for backward compatibility

View file

@ -1,39 +1,28 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import base64
import hashlib import hashlib
import hmac import hmac
import itertools
import json import json
import re
import time import time
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
HEADRequest,
parse_age_limit, parse_age_limit,
parse_iso8601, parse_iso8601,
sanitized_Request,
std_headers,
try_get, try_get,
) )
class VikiBaseIE(InfoExtractor): class VikiBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
_API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'https://api.viki.io%s'
_API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s'
_DEVICE_ID = '86085977d' # used for android api
_APP = '100005a' _APP = '100005a'
_APP_VERSION = '6.0.0' _APP_VERSION = '6.11.3'
_APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472'
_GEO_BYPASS = False _GEO_BYPASS = False
_NETRC_MACHINE = 'viki' _NETRC_MACHINE = 'viki'
@ -46,53 +35,57 @@ class VikiBaseIE(InfoExtractor):
'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
} }
def _prepare_call(self, path, timestamp=None, post_data=None): def _stream_headers(self, timestamp, sig):
return {
'X-Viki-manufacturer': 'vivo',
'X-Viki-device-model': 'vivo 1606',
'X-Viki-device-os-ver': '6.0.1',
'X-Viki-connection-type': 'WIFI',
'X-Viki-carrier': '',
'X-Viki-as-id': '100005a-1625321982-3932',
'timestamp': str(timestamp),
'signature': str(sig),
'x-viki-app-ver': self._APP_VERSION
}
def _api_query(self, path, version=4, **kwargs):
path += '?' if '?' not in path else '&' path += '?' if '?' not in path else '&'
if not timestamp: query = f'/v{version}/{path}app={self._APP}'
timestamp = int(time.time())
query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
if self._token: if self._token:
query += '&token=%s' % self._token query += '&token=%s' % self._token
return query + ''.join(f'&{name}={val}' for name, val in kwargs.items())
def _sign_query(self, path):
timestamp = int(time.time())
query = self._api_query(path, version=5)
sig = hmac.new( sig = hmac.new(
self._APP_SECRET.encode('ascii'), self._APP_SECRET.encode('ascii'), f'{query}&t={timestamp}'.encode('ascii'), hashlib.sha1).hexdigest()
query.encode('ascii'), return timestamp, sig, self._API_URL_TEMPLATE % query
hashlib.sha1
).hexdigest()
url = self._API_URL_TEMPLATE % (query, sig)
return sanitized_Request(
url, json.dumps(post_data).encode('utf-8')) if post_data else url
def _call_api(self, path, video_id, note, timestamp=None, post_data=None): def _call_api(
self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True):
if query is None:
timestamp, sig, url = self._sign_query(path)
else:
url = self._API_URL_TEMPLATE % self._api_query(path, version=4)
resp = self._download_json( resp = self._download_json(
self._prepare_call(path, timestamp, post_data), url, video_id, note, fatal=fatal, query=query,
video_id, note, data=json.dumps(data).encode('utf-8') if data else None,
headers={ headers=({'x-viki-app-ver': self._APP_VERSION} if data
'x-client-user-agent': std_headers['User-Agent'], else self._stream_headers(timestamp, sig) if query is None
'x-viki-as-id': self._APP, else None)) or {}
'x-viki-app-ver': self._APP_VERSION,
})
error = resp.get('error')
if error:
if error == 'invalid timestamp':
resp = self._download_json(
self._prepare_call(path, int(resp['current_timestamp']), post_data),
video_id, '%s (retry)' % note,
headers={
'x-client-user-agent': std_headers['User-Agent'],
'x-viki-as-id': self._APP,
'x-viki-app-ver': self._APP_VERSION,
})
error = resp.get('error')
if error:
self._raise_error(resp['error'])
self._raise_error(resp.get('error'), fatal)
return resp return resp
def _raise_error(self, error): def _raise_error(self, error, fatal=True):
raise ExtractorError( if error is None:
'%s returned error: %s' % (self.IE_NAME, error), return
expected=True) msg = '%s said: %s' % (self.IE_NAME, error)
if fatal:
raise ExtractorError(msg, expected=True)
else:
self.report_warning(msg)
def _check_errors(self, data): def _check_errors(self, data):
for reason, status in (data.get('blocking') or {}).items(): for reason, status in (data.get('blocking') or {}).items():
@ -101,9 +94,10 @@ def _check_errors(self, data):
if reason == 'geo': if reason == 'geo':
self.raise_geo_restricted(msg=message) self.raise_geo_restricted(msg=message)
elif reason == 'paywall': elif reason == 'paywall':
if try_get(data, lambda x: x['paywallable']['tvod']):
self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)')
self.raise_login_required(message) self.raise_login_required(message)
raise ExtractorError('%s said: %s' % ( self._raise_error(message)
self.IE_NAME, message), expected=True)
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -113,29 +107,17 @@ def _login(self):
if username is None: if username is None:
return return
login_form = { self._token = self._call_api(
'login_id': username, 'sessions.json', None, 'Logging in', fatal=False,
'password': password, data={'username': username, 'password': password}).get('token')
}
login = self._call_api(
'sessions.json', None,
'Logging in', post_data=login_form)
self._token = login.get('token')
if not self._token: if not self._token:
self.report_warning('Unable to get session token, login has probably failed') self.report_warning('Login Failed: Unable to get session token')
@staticmethod @staticmethod
def dict_selection(dict_obj, preferred_key, allow_fallback=True): def dict_selection(dict_obj, preferred_key):
if preferred_key in dict_obj: if preferred_key in dict_obj:
return dict_obj.get(preferred_key) return dict_obj[preferred_key]
return (list(filter(None, dict_obj.values())) or [None])[0]
if not allow_fallback:
return
filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
return filtered_dict[0] if filtered_dict else None
class VikiIE(VikiBaseIE): class VikiIE(VikiBaseIE):
@ -266,18 +248,10 @@ class VikiIE(VikiBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video = self._call_api(f'videos/{video_id}.json', video_id, 'Downloading video JSON', query={})
resp = self._download_json(
'https://www.viki.com/api/videos/' + video_id,
video_id, 'Downloading video JSON', headers={
'x-client-user-agent': std_headers['User-Agent'],
'x-viki-app-ver': '3.0.0',
})
video = resp['video']
self._check_errors(video) self._check_errors(video)
title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) title = try_get(video, lambda x: x['titles']['en'], str)
episode_number = int_or_none(video.get('number')) episode_number = int_or_none(video.get('number'))
if not title: if not title:
title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
@ -285,116 +259,46 @@ def _real_extract(self, url):
container_title = self.dict_selection(container_titles, 'en') container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title) title = '%s - %s' % (container_title, title)
description = self.dict_selection(video.get('descriptions', {}), 'en') thumbnails = [{
'id': thumbnail_id,
'url': thumbnail['url'],
} for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')]
like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) resp = self._call_api(
'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID),
video_id, 'Downloading video streams JSON')['main'][0]
thumbnails = [] stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id'])
for thumbnail_id, thumbnail in (video.get('images') or {}).items(): subtitles = dict((lang, [{
thumbnails.append({ 'ext': ext,
'id': thumbnail_id, 'url': self._API_URL_TEMPLATE % self._api_query(
'url': thumbnail.get('url'), f'videos/{video_id}/auth_subtitles/{lang}.{ext}', stream_id=stream_id)
}) } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys())
subtitles = {} mpd_url = resp['url']
for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): # 1080p is hidden in another mpd which can be found in the current manifest content
subtitles[subtitle_lang] = [{ mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
'ext': subtitles_format, mpd_url = self._search_regex(
'url': self._prepare_call( r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), formats = self._extract_mpd_formats(mpd_url, video_id)
} for subtitles_format in ('srt', 'vtt')] self._sort_formats(formats)
result = { return {
'id': video_id, 'id': video_id,
'formats': formats,
'title': title, 'title': title,
'description': description, 'description': self.dict_selection(video.get('descriptions', {}), 'en'),
'duration': int_or_none(video.get('duration')), 'duration': int_or_none(video.get('duration')),
'timestamp': parse_iso8601(video.get('created_at')), 'timestamp': parse_iso8601(video.get('created_at')),
'uploader': video.get('author'), 'uploader': video.get('author'),
'uploader_url': video.get('author_url'), 'uploader_url': video.get('author_url'),
'like_count': like_count, 'like_count': int_or_none(try_get(video, lambda x: x['likes']['count'])),
'age_limit': parse_age_limit(video.get('rating')), 'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'subtitles': subtitles, 'subtitles': subtitles,
'episode_number': episode_number, 'episode_number': episode_number,
} }
formats = []
def add_format(format_id, format_dict, protocol='http'):
# rtmps URLs does not seem to work
if protocol == 'rtmps':
return
format_url = format_dict.get('url')
if not format_url:
return
qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
stream = qs.get('stream', [None])[0]
if stream:
format_url = base64.b64decode(stream).decode()
if format_id in ('m3u8', 'hls'):
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4',
entry_protocol='m3u8_native',
m3u8_id='m3u8-%s' % protocol, fatal=False)
# Despite CODECS metadata in m3u8 all video-only formats
# are actually video+audio
for f in m3u8_formats:
if not self.get_param('allow_unplayable_formats') and '_drm/index_' in f['url']:
continue
if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
f['acodec'] = None
formats.append(f)
elif format_id in ('mpd', 'dash'):
formats.extend(self._extract_mpd_formats(
format_url, video_id, 'mpd-%s' % protocol, fatal=False))
elif format_url.startswith('rtmp'):
mobj = re.search(
r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
format_url)
if not mobj:
return
formats.append({
'format_id': 'rtmp-%s' % format_id,
'ext': 'flv',
'url': mobj.group('url'),
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': url,
})
else:
urlh = self._request_webpage(
HEADRequest(format_url), video_id, 'Checking file size', fatal=False)
formats.append({
'url': format_url,
'format_id': '%s-%s' % (format_id, protocol),
'height': int_or_none(self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)),
'filesize': int_or_none(urlh.headers.get('Content-Length')),
})
for format_id, format_dict in (resp.get('streams') or {}).items():
add_format(format_id, format_dict)
if not formats:
streams = self._call_api(
'videos/%s/streams.json' % video_id, video_id,
'Downloading video streams JSON')
if 'external' in streams:
result.update({
'_type': 'url_transparent',
'url': streams['external']['url'],
})
return result
for format_id, stream_dict in streams.items():
for protocol, format_dict in stream_dict.items():
add_format(format_id, format_dict, protocol)
self._sort_formats(formats)
result['formats'] = formats
return result
class VikiChannelIE(VikiBaseIE): class VikiChannelIE(VikiBaseIE):
IE_NAME = 'viki:channel' IE_NAME = 'viki:channel'
@ -406,7 +310,7 @@ class VikiChannelIE(VikiBaseIE):
'title': 'Boys Over Flowers', 'title': 'Boys Over Flowers',
'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
}, },
'playlist_mincount': 71, 'playlist_mincount': 51,
}, { }, {
'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
'info_dict': { 'info_dict': {
@ -427,33 +331,35 @@ class VikiChannelIE(VikiBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
_PER_PAGE = 25 _video_types = ('episodes', 'movies', 'clips', 'trailers')
def _entries(self, channel_id):
params = {
'app': self._APP, 'token': self._token, 'only_ids': 'true',
'direction': 'asc', 'sort': 'number', 'per_page': 30
}
video_types = self._configuration_arg('video_types') or self._video_types
for video_type in video_types:
if video_type not in self._video_types:
self.report_warning(f'Unknown video_type: {video_type}')
page_num = 0
while True:
page_num += 1
params['page'] = page_num
res = self._call_api(
f'containers/{channel_id}/{video_type}.json', channel_id, query=params, fatal=False,
note='Downloading %s JSON page %d' % (video_type.title(), page_num))
for video_id in res.get('response') or []:
yield self.url_result(f'https://www.viki.com/videos/{video_id}', VikiIE.ie_key(), video_id)
if not res.get('more'):
break
def _real_extract(self, url): def _real_extract(self, url):
channel_id = self._match_id(url) channel_id = self._match_id(url)
channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON')
channel = self._call_api(
'containers/%s.json' % channel_id, channel_id,
'Downloading channel JSON')
self._check_errors(channel) self._check_errors(channel)
return self.playlist_result(
title = self.dict_selection(channel['titles'], 'en') self._entries(channel_id), channel_id,
self.dict_selection(channel['titles'], 'en'),
description = self.dict_selection(channel['descriptions'], 'en') self.dict_selection(channel['descriptions'], 'en'))
entries = []
for video_type in ('episodes', 'clips', 'movies'):
for page_num in itertools.count(1):
page = self._call_api(
'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
% (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
'Downloading %s JSON page #%d' % (video_type, page_num))
for video in page['response']:
video_id = video['id']
entries.append(self.url_result(
'https://www.viki.com/videos/%s' % video_id, 'Viki'))
if not page['pagination']['next']:
break
return self.playlist_result(entries, channel_id, title, description)