yt-dlp/yt_dlp/extractor/webofstories.py

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    orderedSet,
)


class WebOfStoriesIE(InfoExtractor):
    """Extract a single story from a webofstories.com ``/play/`` page.

    The media locations are not present in the page as URLs; they are built
    from the arguments of the page's ``getEmbedCode(...)`` JavaScript call.
    """

    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
    # Base URL for the progressive-download MP4 files
    _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
    # RTMP endpoints: one for "great life" series stories, one for all others
    _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
    _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
    _TESTS = [{
        'url': 'http://www.webofstories.com/play/hans.bethe/71',
        'md5': '373e4dd915f60cfe3116322642ddf364',
        'info_dict': {
            'id': '4536',
            'ext': 'mp4',
            'title': 'The temperature of the sun',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Hans Bethe talks about calculating the temperature of the sun',
            'duration': 238,
        },
    }, {
        'url': 'http://www.webofstories.com/play/55908',
        'md5': '2985a698e1fe3211022422c4b5ed962c',
        'info_dict': {
            'id': '55908',
            'ext': 'mp4',
            'title': 'The story of Gemmata obscuriglobus',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
            'duration': 169,
        },
        'skip': 'notfound',
    }, {
        # malformed og:title meta
        'url': 'http://www.webofstories.com/play/54215?o=MS',
        'info_dict': {
            'id': '54215',
            'ext': 'mp4',
            'title': '"A Leg to Stand On"',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
            'duration': 97,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Download the story page and return its info dict.

        The returned ``id`` is the story id taken from the embed parameters,
        not the numeric id matched from the URL (the two can differ, as in
        the first test case).

        Raises:
            ExtractorError: if the ``getEmbedCode(...)`` call does not carry
                the expected number of arguments.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        # Sometimes og:title meta is malformed, so fall back to the visible
        # "Title:" label in the page body
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
        description = self._html_search_meta('description', webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        # Argument list of getEmbedCode(...), split on commas and stripped of
        # surrounding whitespace and single quotes
        embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
            r'(?s)\$\("#embedCode"\)\.html\(getEmbedCode\((.*?)\)',
            webpage, 'embed params').split(',')]

        # Fail with a descriptive extractor error rather than a bare
        # ValueError from the tuple unpacking below
        if len(embed_params) != 10:
            raise ExtractorError(
                f'Unexpected number of embed params: expected 10, got {len(embed_params)}',
                video_id=video_id)

        (
            _, speaker_id, story_id, story_duration,
            speaker_type, great_life, _thumbnail, _has_subtitles,
            story_filename, _story_order) = embed_params

        is_great_life_series = great_life == 'true'
        duration = int_or_none(story_duration)

        # URL building, see: http://www.webofstories.com/scripts/player.js
        ms_prefix = ''
        if speaker_type.lower() == 'ms':
            ms_prefix = 'mini_sites/'

        if is_great_life_series:
            mp4_url = f'{self._VIDEO_DOMAIN}lives/{speaker_id}/{story_filename}.mp4'
            rtmp_ext = 'flv'
            streamer = self._GREAT_LIFE_STREAMER
            play_path = f'stories/{speaker_id}/{story_filename}'
        else:
            mp4_url = f'{self._VIDEO_DOMAIN}{ms_prefix}{speaker_id}/{story_filename}.mp4'
            rtmp_ext = 'mp4'
            streamer = self._USER_STREAMER
            play_path = f'mp4:{ms_prefix}{speaker_id}/{story_filename}.mp4'

        # One direct HTTP format and one RTMP format for the same story
        formats = [{
            'format_id': 'mp4_sd',
            'url': mp4_url,
        }, {
            'format_id': 'rtmp_sd',
            'page_url': url,
            'url': streamer,
            'ext': rtmp_ext,
            'play_path': play_path,
        }]

        return {
            'id': story_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
        }


class WebOfStoriesPlaylistIE(InfoExtractor):
    """Extract every story listed on a webofstories.com ``/playAll/`` page."""

    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.webofstories.com/playAll/donald.knuth',
        'info_dict': {
            'id': 'donald.knuth',
            'title': 'Donald Knuth (Scientist)',
        },
        'playlist_mincount': 97,
    }

    def _real_extract(self, url):
        """Return a playlist result with one URL entry per story on the page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Story ids are taken from element ids of the form "td_<digits>";
        # orderedSet drops repeats while keeping first-seen order
        story_ids = orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
        entries = []
        for story_id in story_ids:
            entries.append(self.url_result(
                f'http://www.webofstories.com/play/{story_id}',
                'WebOfStories', video_id=story_id))

        speaker = self._search_regex(
            r'<div id="speakerName">\s*<span>([^<]+)</span>',
            webpage, 'speaker', default=None)

        if speaker:
            # Preferred title: speaker name, plus the primary field in
            # parentheses when the page provides one
            field = self._search_regex(
                r'<span id="primaryField">([^<]+)</span>',
                webpage, 'field', default=None)
            title = f'{speaker} ({field})' if field else speaker
        else:
            # No speaker block found: fall back to the <title> tag, which is
            # a required match here (no default given)
            title = self._search_regex(
                r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
                webpage, 'title')

        return self.playlist_result(entries, playlist_id, title)
Added a Playlist Info Extractor for WebOfStories 2015-06-26 11:57:43 -04:00			`import re`

[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`from .common import InfoExtractor`
[webofstories:playlist] Fix extraction (closes #16914) 2018-08-26 10:41:55 -04:00			`from ..utils import (`
			`int_or_none,`
			`orderedSet,`
			`)`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00

			`class WebOfStoriesIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'`
			`_VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'`
			`_GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'`
			`_USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`_TESTS = [{`
			`'url': 'http://www.webofstories.com/play/hans.bethe/71',`
			`'md5': '373e4dd915f60cfe3116322642ddf364',`
			`'info_dict': {`
			`'id': '4536',`
			`'ext': 'mp4',`
			`'title': 'The temperature of the sun',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 07:08:07 -05:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`'description': 'Hans Bethe talks about calculating the temperature of the sun',`
			`'duration': 238,`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`},`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`}, {`
			`'url': 'http://www.webofstories.com/play/55908',`
			`'md5': '2985a698e1fe3211022422c4b5ed962c',`
			`'info_dict': {`
			`'id': '55908',`
			`'ext': 'mp4',`
			`'title': 'The story of Gemmata obscuriglobus',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 07:08:07 -05:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',`
			`'duration': 169,`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`},`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`'skip': 'notfound',`
			`}, {`
			`# malformed og:title meta`
			`'url': 'http://www.webofstories.com/play/54215?o=MS',`
			`'info_dict': {`
			`'id': '54215',`
			`'ext': 'mp4',`
			`'title': '"A Leg to Stand On"',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 07:08:07 -05:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`'description': 'Oliver Sacks talks about the death and resurrection of a limb',`
			`'duration': 97,`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`},`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`'params': {`
			`'skip_download': True,`
			`},`
			`}]`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00
			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`

			`webpage = self._download_webpage(url, video_id)`
[webofstories] Tolerate malforder og:title (Closes #8417) 2016-02-27 16:37:48 -05:00			`# Sometimes og:title meta is malformed`
			`title = self._og_search_title(webpage, default=None) or self._html_search_regex(`
			`r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`description = self._html_search_meta('description', webpage)`
			`thumbnail = self._og_search_thumbnail(webpage)`

[webofstories] Fix extraction 2015-02-18 20:12:08 -05:00			`embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(`
			`r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',`
			`webpage, 'embed params').split(',')]`

			`(`
			`_, speaker_id, story_id, story_duration,`
			`speaker_type, great_life, _thumbnail, _has_subtitles,`
			`story_filename, _story_order) = embed_params`

[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`is_great_life_series = great_life == 'true'`
[webofstories] Fix extraction 2015-02-18 20:12:08 -05:00			`duration = int_or_none(story_duration)`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00
			`# URL building, see: http://www.webofstories.com/scripts/player.js`
			`ms_prefix = ''`
			`if speaker_type.lower() == 'ms':`
			`ms_prefix = 'mini_sites/'`

			`if is_great_life_series:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`mp4_url = f'{self._VIDEO_DOMAIN}lives/{speaker_id}/{story_filename}.mp4'`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`rtmp_ext = 'flv'`
			`streamer = self._GREAT_LIFE_STREAMER`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`play_path = f'stories/{speaker_id}/{story_filename}'`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`else:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`mp4_url = f'{self._VIDEO_DOMAIN}{ms_prefix}{speaker_id}/{story_filename}.mp4'`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00			`rtmp_ext = 'mp4'`
			`streamer = self._USER_STREAMER`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`play_path = f'mp4:{ms_prefix}{speaker_id}/{story_filename}.mp4'`
[webofstories] Add new extractor (Closes #4585) 2015-01-04 19:22:01 -05:00
			`formats = [{`
			`'format_id': 'mp4_sd',`
			`'url': mp4_url,`
			`}, {`
			`'format_id': 'rtmp_sd',`
			`'page_url': url,`
			`'url': streamer,`
			`'ext': rtmp_ext,`
			`'play_path': play_path,`
			`}]`

			`return {`
			`'id': story_id,`
			`'title': title,`
			`'formats': formats,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`'duration': duration,`
			`}`
Added a Playlist Info Extractor for WebOfStories 2015-06-26 11:57:43 -04:00

			`class WebOfStoriesPlaylistIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'`
[webofstories:playlist] Improve and add test 2015-07-10 18:43:29 -04:00			`_TEST = {`
			`'url': 'http://www.webofstories.com/playAll/donald.knuth',`
			`'info_dict': {`
			`'id': 'donald.knuth',`
			`'title': 'Donald Knuth (Scientist)',`
			`},`
			`'playlist_mincount': 97,`
			`}`
Added a Playlist Info Extractor for WebOfStories 2015-06-26 11:57:43 -04:00
			`def _real_extract(self, url):`
			`playlist_id = self._match_id(url)`

			`webpage = self._download_webpage(url, playlist_id)`

			`entries = [`
[webofstories:playlist] Fix extraction (closes #16914) 2018-08-26 10:41:55 -04:00			`self.url_result(`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`f'http://www.webofstories.com/play/{video_id}',`
[webofstories:playlist] Fix extraction (closes #16914) 2018-08-26 10:41:55 -04:00			`'WebOfStories', video_id=video_id)`
			`for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))`
Added a Playlist Info Extractor for WebOfStories 2015-06-26 11:57:43 -04:00			`]`

[webofstories:playlist] Improve and add test 2015-07-10 18:43:29 -04:00			`title = self._search_regex(`
			`r'<div id="speakerName">\s*<span>([^<]+)</span>',`
			`webpage, 'speaker', default=None)`
			`if title:`
			`field = self._search_regex(`
			`r'<span id="primaryField">([^<]+)</span>',`
			`webpage, 'field', default=None)`
			`if field:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`title += f' ({field})'`
[webofstories:playlist] Improve and add test 2015-07-10 18:43:29 -04:00
			`if not title:`
			`title = self._search_regex(`
			`r'<title>Play\s+all\s+stories\s-\s([^<]+)\s-\sWeb\s+of\s+Stories</title>',`
			`webpage, 'title')`

			`return self.playlist_result(entries, playlist_id, title)`