From 3f047fc406dc2df4f2ca6a75b2ea07d9928b2a09 Mon Sep 17 00:00:00 2001 From: Felix S Date: Fri, 11 Feb 2022 19:03:33 +0000 Subject: [PATCH] [extractor] Extract subtitles from manifests for more sites (#2686) vimeo, globo, kaltura, svt Authored by: fstirlitz --- yt_dlp/extractor/globo.py | 6 +++--- yt_dlp/extractor/kaltura.py | 8 +++++--- yt_dlp/extractor/svt.py | 25 +++++++++++++++---------- yt_dlp/extractor/vimeo.py | 18 +++++++++++------- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py index 9addb7043..f6aaae1e9 100644 --- a/yt_dlp/extractor/globo.py +++ b/yt_dlp/extractor/globo.py @@ -139,11 +139,11 @@ def _real_extract(self, url): resource_url = source['scheme'] + '://' + source['domain'] + source['path'] signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') - formats.extend(self._extract_m3u8_formats( - signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(fmts) self._sort_formats(formats) - subtitles = {} for resource in video['resources']: if resource.get('type') == 'subtitle': subtitles.setdefault(resource.get('language') or 'por', []).append({ diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index c58216458..f6dfc9caa 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -301,6 +301,7 @@ def sign_url(unsigned_url): data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) formats = [] + subtitles = {} for f in flavor_assets: # Continue if asset is not ready if f.get('status') != 2: @@ -344,13 +345,14 @@ def sign_url(unsigned_url): if '/playManifest/' in data_url: m3u8_url = sign_url(data_url.replace( 'format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( m3u8_url, entry_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) self._sort_formats(formats) - subtitles = {} if captions: for caption in captions.get('objects', []): # Continue if caption is not ready diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 6ad01a912..8ca62e370 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -23,23 +23,27 @@ def _extract_video(self, video_info, video_id): is_live = dict_get(video_info, ('live', 'simulcast'), default=False) m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] + subtitles = {} for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( vurl, video_id, ext='mp4', entry_protocol=m3u8_protocol, - m3u8_id=player_type, fatal=False)) + m3u8_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, f4m_id=player_type, fatal=False)) elif ext == 'mpd': - if player_type == 'dashhbbtv': - formats.extend(self._extract_mpd_formats( - vurl, video_id, mpd_id=player_type, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + vurl, video_id, mpd_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': player_type, @@ -52,18 +56,19 @@ def _extract_video(self, video_info, video_id): countries=self._GEO_COUNTRIES, metadata_available=True) self._sort_formats(formats) - subtitles = {} subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') subtitle_lang = sr.get('language', 'sv') if subtitle_url: + sub = { + 'url': subtitle_url, + } if determine_ext(subtitle_url) == 'm3u8': - # TODO(yan12125): handle WebVTT in m3u8 manifests - continue - - subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) + # XXX: no way of testing, is it ever hit? + sub['ext'] = 'vtt' + subtitles.setdefault(subtitle_lang, []).append(sub) title = video_info.get('title') diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 57391d766..c2dec244f 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -131,6 +131,8 @@ def _parse_config(self, config, video_id): request = config.get('request') or {} formats = [] + subtitles = {} + config_files = video_data.get('files') or request.get('files') or {} for f in (config_files.get('progressive') or []): video_url = f.get('url') @@ -163,21 +165,24 @@ def _parse_config(self, config, video_id): sep_manifest_urls = [(format_id, manifest_url)] for f_id, m_url in sep_manifest_urls: if files_type == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( m_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, note='Downloading %s m3u8 information' % cdn_name, - fatal=False)) + fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': if 'json=1' in m_url: real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') if real_m_url: m_url = real_m_url - mpd_formats = self._extract_mpd_formats( + fmts, subs = self._extract_mpd_formats_and_subtitles( m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, fatal=False) - formats.extend(mpd_formats) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) live_archive = live_event.get('archive') or {} live_archive_source_url = live_archive.get('source_url') @@ -188,12 +193,11 @@ def _parse_config(self, config, video_id): 'quality': 10, }) - subtitles = {} for tt in (request.get('text_tracks') or []): - subtitles[tt['lang']] = [{ + subtitles.setdefault(tt['lang'], []).append({ 'ext': 'vtt', 'url': urljoin('https://vimeo.com', tt['url']), - }] + }) thumbnails = [] if not is_live: