[extractor] Extract subtitles from manifests for more sites (#2686)

vimeo, globo, kaltura, svt Authored by: fstirlitz
2024-11-21 20:46:36 -05:00 · 2022-02-11 19:03:33 +00:00 · 2022-02-11 19:03:33 +00:00 · 3f047fc406
commit 3f047fc406
parent 82b5176783
4 changed files with 34 additions and 23 deletions
--- a/yt_dlp/extractor/globo.py
+++ b/yt_dlp/extractor/globo.py
@@ -139,11 +139,11 @@ def _real_extract(self, url):
        resource_url = source['scheme'] + '://' + source['domain'] + source['path']
        signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
-        formats.extend(self._extract_m3u8_formats(
+        fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
-            signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+        formats.extend(fmts)
        self._sort_formats(formats)
-        subtitles = {}
        for resource in video['resources']:
            if resource.get('type') == 'subtitle':
                subtitles.setdefault(resource.get('language') or 'por', []).append({
--- a/yt_dlp/extractor/kaltura.py
+++ b/yt_dlp/extractor/kaltura.py
@@ -301,6 +301,7 @@ def sign_url(unsigned_url):
            data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
        formats = []
+        subtitles = {}
        for f in flavor_assets:
            # Continue if asset is not ready
            if f.get('status') != 2:
@@ -344,13 +345,14 @@ def sign_url(unsigned_url):
        if '/playManifest/' in data_url:
            m3u8_url = sign_url(data_url.replace(
                'format/url', 'format/applehttp'))
-            formats.extend(self._extract_m3u8_formats(
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                m3u8_url, entry_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False))
+                m3u8_id='hls', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
        self._sort_formats(formats)
-        subtitles = {}
        if captions:
            for caption in captions.get('objects', []):
                # Continue if caption is not ready
--- a/yt_dlp/extractor/svt.py
+++ b/yt_dlp/extractor/svt.py
@@ -23,23 +23,27 @@ def _extract_video(self, video_info, video_id):
        is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
        m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
        formats = []
+        subtitles = {}
        for vr in video_info['videoReferences']:
            player_type = vr.get('playerType') or vr.get('format')
            vurl = vr['url']
            ext = determine_ext(vurl)
            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    vurl, video_id,
                    ext='mp4', entry_protocol=m3u8_protocol,
-                    m3u8_id=player_type, fatal=False))
+                    m3u8_id=player_type, fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    vurl + '?hdcore=3.3.0', video_id,
                    f4m_id=player_type, fatal=False))
            elif ext == 'mpd':
-                if player_type == 'dashhbbtv':
+                fmts, subs = self._extract_mpd_formats_and_subtitles(
-                    formats.extend(self._extract_mpd_formats(
+                    vurl, video_id, mpd_id=player_type, fatal=False)
-                        vurl, video_id, mpd_id=player_type, fatal=False))
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'format_id': player_type,
@@ -52,18 +56,19 @@ def _extract_video(self, video_info, video_id):
                countries=self._GEO_COUNTRIES, metadata_available=True)
        self._sort_formats(formats)
-        subtitles = {}
        subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
        if isinstance(subtitle_references, list):
            for sr in subtitle_references:
                subtitle_url = sr.get('url')
                subtitle_lang = sr.get('language', 'sv')
                if subtitle_url:
+                    sub = {
+                        'url': subtitle_url,
+                    }
                    if determine_ext(subtitle_url) == 'm3u8':
-                        # TODO(yan12125): handle WebVTT in m3u8 manifests
+                        # XXX: no way of testing, is it ever hit?
-                        continue
+                        sub['ext'] = 'vtt'
-
+                    subtitles.setdefault(subtitle_lang, []).append(sub)
-                    subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
        title = video_info.get('title')
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -131,6 +131,8 @@ def _parse_config(self, config, video_id):
        request = config.get('request') or {}
        formats = []
+        subtitles = {}
        config_files = video_data.get('files') or request.get('files') or {}
        for f in (config_files.get('progressive') or []):
            video_url = f.get('url')
@@ -163,21 +165,24 @@ def _parse_config(self, config, video_id):
                    sep_manifest_urls = [(format_id, manifest_url)]
                for f_id, m_url in sep_manifest_urls:
                    if files_type == 'hls':
-                        formats.extend(self._extract_m3u8_formats(
+                        fmts, subs = self._extract_m3u8_formats_and_subtitles(
                            m_url, video_id, 'mp4',
                            'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
                            note='Downloading %s m3u8 information' % cdn_name,
-                            fatal=False))
+                            fatal=False)
+                        formats.extend(fmts)
+                        self._merge_subtitles(subs, target=subtitles)
                    elif files_type == 'dash':
                        if 'json=1' in m_url:
                            real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
                            if real_m_url:
                                m_url = real_m_url
-                        mpd_formats = self._extract_mpd_formats(
+                        fmts, subs = self._extract_mpd_formats_and_subtitles(
                            m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
                            'Downloading %s MPD information' % cdn_name,
                            fatal=False)
-                        formats.extend(mpd_formats)
+                        formats.extend(fmts)
+                        self._merge_subtitles(subs, target=subtitles)
        live_archive = live_event.get('archive') or {}
        live_archive_source_url = live_archive.get('source_url')
@@ -188,12 +193,11 @@ def _parse_config(self, config, video_id):
                'quality': 10,
            })
-        subtitles = {}
        for tt in (request.get('text_tracks') or []):
-            subtitles[tt['lang']] = [{
+            subtitles.setdefault(tt['lang'], []).append({
                'ext': 'vtt',
                'url': urljoin('https://vimeo.com', tt['url']),
-            }]
+            })
        thumbnails = []
        if not is_live: