[bbc] Support playlists of data-playable

2024-11-07 20:30:41 -05:00 · 2015-10-10 23:40:20 +06:00 · 2015-10-10 23:40:20 +06:00 · 78f9d84318
commit 78f9d84318
parent b1ec70e4a9
1 changed files with 44 additions and 29 deletions
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@ -716,6 +716,8 @@ def _real_extract(self, url):
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],
            webpage, 'date', default=None))
        entries = []
        # article with multiple videos embedded with playlist.sxml (e.g.
        # http://www.bbc.com/sport/0/football/34475836)
        playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
@ -723,6 +725,48 @@ def _real_extract(self, url):
            entries = [
                self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
                for playlist_url in playlists]
        # news article with multiple videos embedded with data-playable
        data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
        if data_playables:
            for _, data_playable_json in data_playables:
                data_playable = self._parse_json(
                    unescapeHTML(data_playable_json), playlist_id, fatal=False)
                if not data_playable:
                    continue
                settings = data_playable.get('settings', {})
                if settings:
                    # data-playable with video vpid in settings.playlistObject.items (e.g.
                    # http://www.bbc.com/news/world-us-canada-34473351)
                    playlist_object = settings.get('playlistObject', {})
                    if playlist_object:
                        items = playlist_object.get('items')
                        if items and isinstance(items, list):
                            title = playlist_object['title']
                            description = playlist_object.get('summary')
                            duration = int_or_none(items[0].get('duration'))
                            programme_id = items[0].get('vpid')
                            formats, subtitles = self._download_media_selector(programme_id)
                            self._sort_formats(formats)
                            entries.append({
                                'id': programme_id,
                                'title': title,
                                'description': description,
                                'timestamp': timestamp,
                                'duration': duration,
                                'formats': formats,
                                'subtitles': subtitles,
                            })
                    else:
                        # data-playable without vpid but with a playlist.sxml URLs
                        # in otherSettings.playlist (e.g.
                        # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                        playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                        if playlist:
                            entries.append(self._extract_from_playlist_sxml(
                                playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
        if entries:
            playlist_title = self._og_search_title(webpage)
            playlist_description = self._og_search_description(webpage, default=None)
            return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
@ -733,35 +777,6 @@ def _real_extract(self, url):
             r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
            webpage, 'vpid', default=None)
        duration = None
        if not programme_id:
            # single video in news article embedded with data-playable (e.g.
            # http://www.bbc.com/news/world-us-canada-34473351)
            data_playable = self._parse_json(
                unescapeHTML(self._search_regex(
                    r'data-playable=(["\'])(?P<json>{.+?})\1', webpage,
                    'data playable', default='{}', group='json')),
                programme_id, fatal=False)
            if data_playable:
                # data-playable has video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                settings = data_playable.get('settings', {})
                if settings:
                    playlist_object = settings.get('playlistObject', {})
                    if playlist_object:
                        items = playlist_object.get('items')
                        if items and isinstance(items, list):
                            duration = int_or_none(items[0].get('duration'))
                            programme_id = items[0].get('vpid')
                if not programme_id:
                    # data-playable has no vpid but has a playlist.sxml URLs
                    # in otherSettings.playlist (e.g.
                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                    playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                    if playlist:
                        return self._extract_from_playlist_sxml(
                            playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)
        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)