From a107193e4b7a3d5414dd7422263c34ac0e309ec4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:13:21 +0600
Subject: [PATCH 1/8] [extractor/common] Extract f4m and m3u8 formats,
 subtitles and info

---
 youtube_dl/extractor/common.py | 200 ++++++++++++++++++++++++---------
 1 file changed, 149 insertions(+), 51 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index dc5080504..f9578b838 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -18,6 +18,7 @@
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
+    compat_urllib_parse,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
@@ -37,6 +38,7 @@
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
+    url_basename,
 )
 
 
@@ -978,69 +980,165 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
         self._sort_formats(formats)
         return formats
 
-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
         if smil is False:
             assert not fatal
             return []
 
-        base = smil.find('./head/meta').get('base')
+        namespace = self._search_regex(
+            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._search_regex(
+            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
 
         formats = []
         rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
+            src = video.get('src')
+            if not src:
+                continue
+
+            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            ext = video.get('ext')
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8')
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                continue
+
+            if src_url.startswith('http'):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
 
         self._sort_formats(formats)
 
         return formats
 
-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
-        src = video.get('src')
-        if not src:
-            return [], rtmp_count
-        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-        width = int_or_none(video.get('width'))
-        height = int_or_none(video.get('height'))
-        proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
-        ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
-            streamer = video.get('streamer') or base
-            return ([{
-                'url': streamer,
-                'play_path': src,
-                'ext': 'flv',
-                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
+    def _parse_smil_subtitles(self, smil, namespace=None):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                if type_ == 'text/srt':
+                    ext = 'srt'
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
 
     def _live_title(self, name):
         """ Generate the title for a live video """

From e5e8d20a3a65832c74b002f247866fcbb92e9246 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:13:59 +0600
Subject: [PATCH 2/8] [extractor/generic] Improve generic SMIL detection

---
 youtube_dl/extractor/generic.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 8cef61c3c..6900ed96f 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1110,11 +1110,13 @@ def _real_extract(self, url):
 
         self.report_extraction(video_id)
 
-        # Is it an RSS feed?
+        # Is it an RSS feed or a SMIL file?
         try:
             doc = parse_xml(webpage)
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
+            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                return self._parse_smil(doc, url, video_id)
         except compat_xml_parse_error:
             pass
 

From 308cfe0ab3ec7122602ba2d6a4e3acd2caa7a757 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:14:41 +0600
Subject: [PATCH 3/8] [test_downloader] Respect --force-generic-extractor

---
 test/test_download.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_download.py b/test/test_download.py
index 1110357a7..284418834 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -136,7 +136,9 @@ def try_rm_tcs_files(tcs=None):
                     # We're not using .download here sine that is just a shim
                     # for outside error handling, and returns the exit code
                     # instead of the result dict.
-                    res_dict = ydl.extract_info(test_case['url'])
+                    res_dict = ydl.extract_info(
+                        test_case['url'],
+                        force_generic_extractor=params.get('force_generic_extractor', False))
                 except (DownloadError, ExtractorError) as err:
                     # Check if the exception is not a network related one
                     if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):

From 645f814544f9d40386e504a1eb8cf3558f2c109e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:15:33 +0600
Subject: [PATCH 4/8] [test/helper] Allow dicts for mincount

---
 test/helper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/helper.py b/test/helper.py
index e1129e58f..c8b34654d 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict):
             elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
                 got = got_dict.get(info_field)
                 self.assertTrue(
-                    isinstance(got, list),
-                    'Expected field %s to be a list, but it is of type %s' % (
+                    isinstance(got, (list, dict)),
+                    'Expected field %s to be a list or a dict, but it is of type %s' % (
                         info_field, type(got).__name__))
                 expected_num = int(expected.partition(':')[2])
                 assertGreaterEqual(

From 8765222d2211cd6f2a40611249181af0bbb2d531 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:16:21 +0600
Subject: [PATCH 5/8] [extractor/generic] Add generic SMIL tests

---
 youtube_dl/extractor/generic.py | 68 +++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 6900ed96f..27584c44c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -130,6 +130,74 @@ class GenericIE(InfoExtractor):
                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
             }
         },
+        # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+        {
+            'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+            'info_dict': {
+                'id': 'smil',
+                'ext': 'mp4',
+                'title': 'Automatics, robotics and biocybernetics',
+                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                'formats': 'mincount:16',
+                'subtitles': 'mincount:1',
+            },
+            'params': {
+                'force_generic_extractor': True,
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+        {
+            'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+            'info_dict': {
+                'id': 'hds',
+                'ext': 'flv',
+                'title': 'hds',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from https://www.restudy.dk/video/play/id/1637
+        {
+            'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+            'info_dict': {
+                'id': 'video_1637',
+                'ext': 'flv',
+                'title': 'video_1637',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+        {
+            'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+            'info_dict': {
+                'id': 'smil-service',
+                'ext': 'flv',
+                'title': 'smil-service',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+        {
+            'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+            'info_dict': {
+                'id': '4719370',
+                'ext': 'mp4',
+                'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # google redirect
         {
             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',

From 41c3a5a7beebbf5f60c5edb5093d564f0829c5c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:20:49 +0600
Subject: [PATCH 6/8] [extractor/common] Fix python 3

---
 youtube_dl/extractor/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index f9578b838..c123d9fca 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1101,7 +1101,7 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                         'plugin': 'flowplayer-3.2.0.1',
                     }
                 f4m_url += '&' if '?' in f4m_url else '?'
-                f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8')
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                 continue
 

From 17712eeb1933f53696c1fc53606174e988a96472 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 01:31:17 +0600
Subject: [PATCH 7/8] [extractor/common] Extract namespace parse routine

---
 youtube_dl/extractor/common.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index c123d9fca..717dcec7b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -999,8 +999,7 @@ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None)
             assert not fatal
             return []
 
-        namespace = self._search_regex(
-            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
+        namespace = self._parse_smil_namespace(smil)
 
         return self._parse_smil_formats(
             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
@@ -1017,8 +1016,7 @@ def _download_smil(self, smil_url, video_id, fatal=True):
             'Unable to download SMIL file', fatal=fatal)
 
     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
-        namespace = self._search_regex(
-            r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
+        namespace = self._parse_smil_namespace(smil)
 
         formats = self._parse_smil_formats(
             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
@@ -1045,6 +1043,10 @@ def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
             'subtitles': subtitles,
         }
 
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
         base = smil_url
         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):

From d41d04c0f513ad3b83ab6aee60cf2201710b6063 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Sun, 2 Aug 2015 06:35:35 +0600
Subject: [PATCH 8/8] [videolectures] Fix _VALID_URL

---
 youtube_dl/extractor/videolecturesnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
index d6a7eb203..24584dc80 100644
--- a/youtube_dl/extractor/videolecturesnet.py
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -12,7 +12,7 @@
 
 
 class VideoLecturesNetIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$'
     IE_NAME = 'videolectures.net'
 
     _TEST = {