From 4e262a8838d487362a85eb8b8693d2fa84899f17 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister
Date: Wed, 26 Nov 2014 10:44:39 +0100
Subject: [PATCH] [generic] Detect direct video links (Fixes #4149, #4313)

---
 youtube_dl/extractor/common.py  |  4 +++-
 youtube_dl/extractor/generic.py | 42 +++++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 3c2d46dd5..3e84d0e62 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -296,9 +296,11 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
         return (content, urlh)
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 6a95e2952..328301de3 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -452,7 +452,23 @@ class GenericIE(InfoExtractor):
                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
             },
             'playlist_mincount': 2,
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
         }
+
     ]
 
     def report_following_redirect(self, new_url):
@@ -606,10 +622,28 @@ def _real_extract(self, url):
         if not self._downloader.params.get('test', False) and not is_intentional:
             self._downloader.report_warning('Falling back on generic information extractor.')
 
-        if full_response:
-            webpage = self._webpage_read_content(full_response, url, video_id)
-        else:
-            webpage = self._download_webpage(url, video_id)
+        if not full_response:
+            full_response = self._request_webpage(url, video_id)
+
+        # Maybe it's a direct link to a video?
+        # Be careful not to download the whole thing!
+        first_bytes = full_response.read(512)
+        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+            self._downloader.report_warning(
+                'URL could be a direct video link, returning it as such.')
+            upload_date = unified_strdate(
+                head_response.headers.get('Last-Modified'))
+            return {
+                'id': video_id,
+                'title': os.path.splitext(url_basename(url))[0],
+                'direct': True,
+                'url': url,
+                'upload_date': upload_date,
+            }
+
+        webpage = self._webpage_read_content(
+            full_response, url, video_id, prefix=first_bytes)
+
         self.report_extraction(video_id)
 
         # Is it an RSS feed?