[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
2024-12-24 18:51:00 +00:00 · 2021-10-02 18:43:42 +00:00 · 2021-10-02 18:43:42 +00:00 · 9359f3d4f0
commit 9359f3d4f0
parent 0eaec13ba6
3 changed files with 56 additions and 12 deletions
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -3029,9 +3029,7 @@ def record_download_archive(self, info_dict):
    @staticmethod
    def format_resolution(format, default='unknown'):
-        if format.get('vcodec') == 'none':
+        if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
-            if format.get('acodec') == 'none':
-                return 'images'
            return 'audio only'
        if format.get('resolution') is not None:
            return format['resolution']
@@ -3043,6 +3041,8 @@ def format_resolution(format, default='unknown'):
            res = '%dx?' % format['width']
        else:
            res = default
+        if format.get('vcodec') == 'none' and format.get('acodec') == 'none':
+            res += ' (images)'
        return res
    def _format_note(self, fdict):
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -2346,14 +2346,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
+        imgs_count = 0
-        srcs = []
+        srcs = set()
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
-            srcs.append(src)
+            srcs.add(src)
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
@@ -2427,6 +2428,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                    'height': height,
                })
+        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
+            src = medium.get('src')
+            if not src or src in srcs:
+                continue
+            srcs.add(src)
+            imgs_count += 1
+            formats.append({
+                'format_id': 'imagestream-%d' % (imgs_count),
+                'url': src,
+                'ext': mimetype2ext(medium.get('type')),
+                'acodec': 'none',
+                'vcodec': 'none',
+                'width': int_or_none(medium.get('width')),
+                'height': int_or_none(medium.get('height')),
+                'format_note': 'SMIL storyboards',
+            })
        return formats
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -4546,20 +4546,24 @@ def mimetype2ext(mt):
    if mt is None:
        return None
-    ext = {
+    mt, _, params = mt.partition(';')
+    mt = mt.strip()
+    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
-    }.get(mt)
+        'audio/wav': 'wav',
+        'audio/wave': 'wav',
+    }
+    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext
-    _, _, res = mt.rpartition('/')
+    SUBTYPE_MAP = {
-    res = res.split(';')[0].strip().lower()
-    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
@@ -4578,7 +4582,28 @@ def mimetype2ext(mt):
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
-    }.get(res, res)
+        'filmstrip+json': 'fs',
+        'svg+xml': 'svg',
+    }
+    _, _, subtype = mt.rpartition('/')
+    ext = SUBTYPE_MAP.get(subtype.lower())
+    if ext is not None:
+        return ext
+    SUFFIX_MAP = {
+        'json': 'json',
+        'xml': 'xml',
+        'zip': 'zip',
+        'gzip': 'gz',
+    }
+    _, _, suffix = subtype.partition('+')
+    ext = SUFFIX_MAP.get(suffix)
+    if ext is not None:
+        return ext
+    return subtype.replace('+', '.')
 def parse_codecs(codecs_str):