[tagesschau] Add support for audio

2024-11-30 14:20:46 -05:00 · 2015-07-23 01:01:04 +06:00 · 2015-07-23 01:01:04 +06:00 · a47b602b08
commit a47b602b08
parent a083b859e4
1 changed file with 24 additions and 13 deletions
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -30,6 +30,16 @@ class TagesschauIE(InfoExtractor):
            'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
            'thumbnail': 're:^http:.*\.jpg$',
        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
+        'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+        'info_dict': {
+            'id': '18407',
+            'ext': 'mp3',
+            'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'thumbnail': 're:^https?:.*\.jpg$',
+        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
@@ -51,9 +61,6 @@ class TagesschauIE(InfoExtractor):
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
-    }, {
-        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
-        'only_matching': True,
    }]

    _FORMATS = {
@@ -73,19 +80,26 @@ def _real_extract(self, url):
            playerpage = self._download_webpage(
                player_url, display_id, 'Downloading player page')

-            medias = re.findall(
-                r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
-                playerpage)
            formats = []
-            for url, ext, res in medias:
+            for media in re.finditer(
+                    r'''(?x)
+                        (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
+                        ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
+                        (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
+                    ''', playerpage):
+                url = media.group('url')
+                type_ = media.group('type')
+                ext = media.group('ext')
+                res = media.group('quality')
                f = {
-                    'format_id': res + '_' + ext,
+                    'format_id': '%s_%s' % (res, ext) if res else ext,
                    'url': url,
                    'ext': ext,
+                    'vcodec': 'none' if type_ == 'audio' else None,
                }
                f.update(self._FORMATS.get(res, {}))
                formats.append(f)
-            thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+            thumbnail = self._og_search_thumbnail(playerpage)
            title = self._og_search_title(webpage).strip()
            description = self._og_search_description(webpage).strip()
        else:
@@ -123,9 +137,7 @@ def _real_extract(self, url):
                        'filesize_approx': parse_filesize(m.group('filesize_approx')),
                    })
                formats.append(format)
-            thumbnail_fn = self._search_regex(
-                r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
-                webpage, 'thumbnail', fatal=False)
+            thumbnail = self._og_search_thumbnail(webpage)
            description = self._html_search_regex(
                r'(?s)<p class="teasertext">(.*?)</p>',
                webpage, 'description', default=None)
@ -133,7 +145,6 @@ def _real_extract(self, url):
                r'<span class="headline".*?>(.*?)</span>', webpage, 'title')

        self._sort_formats(formats)
-        thumbnail = 'http://www.tagesschau.de' + thumbnail_fn

        return {
            'id': display_id,