From a7e5f274123dcfe9afd53134a33f83d53d1e497e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 18:48:09 +0100 Subject: [PATCH] [bbc] improve extraction - extract f4m and dash formats - improve format sorting and listing - improve extraction of articles with `otherSettings.playlist` --- youtube_dl/extractor/bbc.py | 175 +++++++++++++++++------------------- 1 file changed, 83 insertions(+), 92 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9cb7630a1..45c562bd8 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -229,51 +229,6 @@ def _extract_asx_playlist(self, connection, programme_id): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] - def _extract_connection(self, connection, programme_id): - formats = [] - kind = connection.get('kind') - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - transfer_format = connection.get('transferFormat') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Skip DASH until supported - elif transfer_format == 'dash': - pass - elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=supplier, fatal=False)) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier or kind or protocol, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) @@ -294,46 +249,6 @@ def _extract_medias(self, media_selection): def _extract_connections(self, media): return self._findall_ns(media, './{%s}connection') - def _extract_video(self, media, programme_id): - formats = [] - vbr = int_or_none(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - if service: - format['format_id'] = '%s_%s' % (service, format['format_id']) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int_or_none(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - 'vcodec': 'none', - }) - formats.extend(conn_formats) - return formats - def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): @@ -382,10 +297,77 @@ def _process_media_selector(self, media_selection, programme_id): for media in self._extract_medias(media_selection): kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + href = connection.get('href') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'vbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol == 'http': + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -820,13 +802,19 @@ def _real_extract(self, url): # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: - for key in ('progressiveDownload', 'streaming'): + entry = None + for key in ('streaming', 'progressiveDownload'): playlist_url = playlist.get('%sUrl' % key) if not playlist_url: continue try: - entries.append(self._extract_from_playlist_sxml( - playlist_url, playlist_id, timestamp)) + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) except Exception as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. @@ -834,6 +822,9 @@ def _real_extract(self, url): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: continue raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)