From db1f388878db8ce2ae6473a5447a5aa6c9ea86f1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 27 Jan 2014 05:47:30 +0100 Subject: [PATCH] [huffpost] Add support --- youtube_dl/downloader/__init__.py | 5 ++- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/generic.py | 8 +++- youtube_dl/extractor/huffpost.py | 70 +++++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 youtube_dl/extractor/huffpost.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 0d9eb0001..aaa92bc75 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + from .common import FileDownloader from .hls import HlsFD from .http import HttpFD @@ -12,10 +14,11 @@ def get_suitable_downloader(info_dict): """Get the downloader class that can handle the info dict.""" url = info_dict['url'] + protocol = info_dict.get('protocol') if url.startswith('rtmp'): return RtmpFD - if determine_ext(url) == u'm3u8': + if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'): return HlsFD if url.startswith('mms') or url.startswith('rtsp'): return MplayerFD diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8daf995b9..5de90d6d9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE +from .huffpost import HuffPostIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE from .imdb import ( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cf742a3b..db1ca9edb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,7 +71,7 @@ class InfoExtractor(object): * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp" or so. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted by this field. diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e1933837d..829e5894f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -332,10 +332,16 @@ def _real_extract(self, url): # Look for embedded Facebook player mobj = re.search( - r']+?src=(["\'])(?Phttps://www.facebook.com/video/embed.+?)\1', webpage) + r']+?src=(["\'])(?Phttps://www\.facebook\.com/video/embed.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'Facebook') + # Look for embedded Huffington Post player + mobj = re.search( + r']+?src=(["\'])(?Phttps?://embed\.live.huffingtonpost\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'HuffPost') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py new file mode 100644 index 000000000..b47114ab4 --- /dev/null +++ b/youtube_dl/extractor/huffpost.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class HuffPostIE(InfoExtractor): + IE_DESC = 'Huffington Post' + _VALID_URL = r'''(?x) + https?://(embed\.)?live\.huffingtonpost\.com/ + (?: + r/segment/[^/]+/| + HPLEmbedPlayer/\?segmentId= + ) + (?P[0-9a-f]+)''' + + _TEST = { + 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', + 'file': '52dd3e4b02a7602131000677.mp4', + 'md5': 'TODO', + 'info_dict': { + 'title': 'TODO', + 'description': 'TODO', + 'duration': 1549, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id + data = self._download_json(api_url, video_id)['data'] + + video_title = data['title'] + duration = parse_duration(data['running_time']) + upload_date = unified_strdate(data['schedule']['started_at']) + + thumbnails = [] + for url in data['images'].values(): + m = re.match('.*-([0-9]+x[0-9]+)\.', url) + if not m: + continue + thumbnails.append({ + 'url': url, + 'resolution': m.group(1), + }) + + formats = [{ + 'format': key, + 'format_id': key.replace('/', '.'), + 'ext': 'mp4', + 'url': url, + 'vcodec': 'none' if key.startswith('audio/') else None, + } for key, url in data['sources']['live'].items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnails': thumbnails, + }