[bbc] Add support for vxp-playlist-data embeds (Closes #6453)

2024-11-21 20:46:36 -05:00 · 2015-08-04 20:44:22 +06:00 · 2015-08-04 20:44:22 +06:00 · a346b1ff57
commit a346b1ff57
parent d96d604e53
1 changed files with 40 additions and 5 deletions
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -526,6 +526,18 @@ class BBCIE(BBCCoUkIE):
        'params': {
            'skip_download': True,
        }
+    }, {
+        # single video from video playlist embedded with vxp-playlist-data JSON
+        'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+        'info_dict': {
+            'id': 'p02w6qjc',
+            'ext': 'mp4',
+            'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+            'duration': 56,
+        },
+        'params': {
+            'skip_download': True,
+        }
    }, {
        # single video story with digitalData
        'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
@@ -695,14 +707,37 @@ def extract_all(pattern):

        if not medias:
            # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
-            media_asset_page = self._parse_json(
-                self._search_regex(
-                    r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'),
-                playlist_id)
+            media_asset = self._search_regex(
+                r'mediaAssetPage\.init\(\s*({.+?}), "/',
+                webpage, 'media asset', default=None)
+            if media_asset:
+                media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
                medias = []
                for video in media_asset_page.get('videos', {}).values():
                    medias.extend(video.values())

+        if not medias:
+            # Multiple video playlist with single `now playing` entry (e.g.
+            # http://www.bbc.com/news/video_and_audio/must_see/33767813)
+            vxp_playlist = self._parse_json(
+                self._search_regex(
+                    r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
+                    webpage, 'playlist data'),
+                playlist_id)
+            playlist_medias = []
+            for item in vxp_playlist:
+                media = item.get('media')
+                if not media:
+                    continue
+                playlist_medias.append(media)
+                # Download only a single video if a media item's asset id matches the video id from the URL
+                if item.get('advert', {}).get('assetId') == playlist_id:
+                    medias = [media]
+                    break
+            # Fallback to the whole playlist
+            if not medias:
+                medias = playlist_medias
+
        entries = []
        for num, media_meta in enumerate(medias, start=1):
            formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)