mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-21 20:46:36 -05:00
[extractor] Extract subtitles from manifests for more sites (#2686)
vimeo, globo, kaltura, svt
Authored by: fstirlitz
This commit is contained in:
parent
82b5176783
commit
3f047fc406
4 changed files with 34 additions and 23 deletions
|
@ -139,11 +139,11 @@ def _real_extract(self, url):
|
||||||
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
|
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
|
||||||
signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
|
signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
|
||||||
|
|
||||||
formats.extend(self._extract_m3u8_formats(
|
fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
|
||||||
signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
|
signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
|
|
||||||
subtitles = {}
|
|
||||||
for resource in video['resources']:
|
for resource in video['resources']:
|
||||||
if resource.get('type') == 'subtitle':
|
if resource.get('type') == 'subtitle':
|
||||||
subtitles.setdefault(resource.get('language') or 'por', []).append({
|
subtitles.setdefault(resource.get('language') or 'por', []).append({
|
||||||
|
|
|
@ -301,6 +301,7 @@ def sign_url(unsigned_url):
|
||||||
data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
|
data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
|
||||||
|
|
||||||
formats = []
|
formats = []
|
||||||
|
subtitles = {}
|
||||||
for f in flavor_assets:
|
for f in flavor_assets:
|
||||||
# Continue if asset is not ready
|
# Continue if asset is not ready
|
||||||
if f.get('status') != 2:
|
if f.get('status') != 2:
|
||||||
|
@ -344,13 +345,14 @@ def sign_url(unsigned_url):
|
||||||
if '/playManifest/' in data_url:
|
if '/playManifest/' in data_url:
|
||||||
m3u8_url = sign_url(data_url.replace(
|
m3u8_url = sign_url(data_url.replace(
|
||||||
'format/url', 'format/applehttp'))
|
'format/url', 'format/applehttp'))
|
||||||
formats.extend(self._extract_m3u8_formats(
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
m3u8_url, entry_id, 'mp4', 'm3u8_native',
|
m3u8_url, entry_id, 'mp4', 'm3u8_native',
|
||||||
m3u8_id='hls', fatal=False))
|
m3u8_id='hls', fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
|
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
|
|
||||||
subtitles = {}
|
|
||||||
if captions:
|
if captions:
|
||||||
for caption in captions.get('objects', []):
|
for caption in captions.get('objects', []):
|
||||||
# Continue if caption is not ready
|
# Continue if caption is not ready
|
||||||
|
|
|
@ -23,23 +23,27 @@ def _extract_video(self, video_info, video_id):
|
||||||
is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
|
is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
|
||||||
m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
|
m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
|
||||||
formats = []
|
formats = []
|
||||||
|
subtitles = {}
|
||||||
for vr in video_info['videoReferences']:
|
for vr in video_info['videoReferences']:
|
||||||
player_type = vr.get('playerType') or vr.get('format')
|
player_type = vr.get('playerType') or vr.get('format')
|
||||||
vurl = vr['url']
|
vurl = vr['url']
|
||||||
ext = determine_ext(vurl)
|
ext = determine_ext(vurl)
|
||||||
if ext == 'm3u8':
|
if ext == 'm3u8':
|
||||||
formats.extend(self._extract_m3u8_formats(
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
vurl, video_id,
|
vurl, video_id,
|
||||||
ext='mp4', entry_protocol=m3u8_protocol,
|
ext='mp4', entry_protocol=m3u8_protocol,
|
||||||
m3u8_id=player_type, fatal=False))
|
m3u8_id=player_type, fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
elif ext == 'f4m':
|
elif ext == 'f4m':
|
||||||
formats.extend(self._extract_f4m_formats(
|
formats.extend(self._extract_f4m_formats(
|
||||||
vurl + '?hdcore=3.3.0', video_id,
|
vurl + '?hdcore=3.3.0', video_id,
|
||||||
f4m_id=player_type, fatal=False))
|
f4m_id=player_type, fatal=False))
|
||||||
elif ext == 'mpd':
|
elif ext == 'mpd':
|
||||||
if player_type == 'dashhbbtv':
|
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||||
formats.extend(self._extract_mpd_formats(
|
vurl, video_id, mpd_id=player_type, fatal=False)
|
||||||
vurl, video_id, mpd_id=player_type, fatal=False))
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
else:
|
else:
|
||||||
formats.append({
|
formats.append({
|
||||||
'format_id': player_type,
|
'format_id': player_type,
|
||||||
|
@ -52,18 +56,19 @@ def _extract_video(self, video_info, video_id):
|
||||||
countries=self._GEO_COUNTRIES, metadata_available=True)
|
countries=self._GEO_COUNTRIES, metadata_available=True)
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
|
|
||||||
subtitles = {}
|
|
||||||
subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
|
subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
|
||||||
if isinstance(subtitle_references, list):
|
if isinstance(subtitle_references, list):
|
||||||
for sr in subtitle_references:
|
for sr in subtitle_references:
|
||||||
subtitle_url = sr.get('url')
|
subtitle_url = sr.get('url')
|
||||||
subtitle_lang = sr.get('language', 'sv')
|
subtitle_lang = sr.get('language', 'sv')
|
||||||
if subtitle_url:
|
if subtitle_url:
|
||||||
|
sub = {
|
||||||
|
'url': subtitle_url,
|
||||||
|
}
|
||||||
if determine_ext(subtitle_url) == 'm3u8':
|
if determine_ext(subtitle_url) == 'm3u8':
|
||||||
# TODO(yan12125): handle WebVTT in m3u8 manifests
|
# XXX: no way of testing, is it ever hit?
|
||||||
continue
|
sub['ext'] = 'vtt'
|
||||||
|
subtitles.setdefault(subtitle_lang, []).append(sub)
|
||||||
subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
|
|
||||||
|
|
||||||
title = video_info.get('title')
|
title = video_info.get('title')
|
||||||
|
|
||||||
|
|
|
@ -131,6 +131,8 @@ def _parse_config(self, config, video_id):
|
||||||
request = config.get('request') or {}
|
request = config.get('request') or {}
|
||||||
|
|
||||||
formats = []
|
formats = []
|
||||||
|
subtitles = {}
|
||||||
|
|
||||||
config_files = video_data.get('files') or request.get('files') or {}
|
config_files = video_data.get('files') or request.get('files') or {}
|
||||||
for f in (config_files.get('progressive') or []):
|
for f in (config_files.get('progressive') or []):
|
||||||
video_url = f.get('url')
|
video_url = f.get('url')
|
||||||
|
@ -163,21 +165,24 @@ def _parse_config(self, config, video_id):
|
||||||
sep_manifest_urls = [(format_id, manifest_url)]
|
sep_manifest_urls = [(format_id, manifest_url)]
|
||||||
for f_id, m_url in sep_manifest_urls:
|
for f_id, m_url in sep_manifest_urls:
|
||||||
if files_type == 'hls':
|
if files_type == 'hls':
|
||||||
formats.extend(self._extract_m3u8_formats(
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
m_url, video_id, 'mp4',
|
m_url, video_id, 'mp4',
|
||||||
'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
|
'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
|
||||||
note='Downloading %s m3u8 information' % cdn_name,
|
note='Downloading %s m3u8 information' % cdn_name,
|
||||||
fatal=False))
|
fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
elif files_type == 'dash':
|
elif files_type == 'dash':
|
||||||
if 'json=1' in m_url:
|
if 'json=1' in m_url:
|
||||||
real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
|
real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
|
||||||
if real_m_url:
|
if real_m_url:
|
||||||
m_url = real_m_url
|
m_url = real_m_url
|
||||||
mpd_formats = self._extract_mpd_formats(
|
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||||
m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
|
m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
|
||||||
'Downloading %s MPD information' % cdn_name,
|
'Downloading %s MPD information' % cdn_name,
|
||||||
fatal=False)
|
fatal=False)
|
||||||
formats.extend(mpd_formats)
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
|
|
||||||
live_archive = live_event.get('archive') or {}
|
live_archive = live_event.get('archive') or {}
|
||||||
live_archive_source_url = live_archive.get('source_url')
|
live_archive_source_url = live_archive.get('source_url')
|
||||||
|
@ -188,12 +193,11 @@ def _parse_config(self, config, video_id):
|
||||||
'quality': 10,
|
'quality': 10,
|
||||||
})
|
})
|
||||||
|
|
||||||
subtitles = {}
|
|
||||||
for tt in (request.get('text_tracks') or []):
|
for tt in (request.get('text_tracks') or []):
|
||||||
subtitles[tt['lang']] = [{
|
subtitles.setdefault(tt['lang'], []).append({
|
||||||
'ext': 'vtt',
|
'ext': 'vtt',
|
||||||
'url': urljoin('https://vimeo.com', tt['url']),
|
'url': urljoin('https://vimeo.com', tt['url']),
|
||||||
}]
|
})
|
||||||
|
|
||||||
thumbnails = []
|
thumbnails = []
|
||||||
if not is_live:
|
if not is_live:
|
||||||
|
|
Loading…
Reference in a new issue