# yt-dlp/yt_dlp/extractor/amp.py

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    determine_ext,
    int_or_none,
    mimetype2ext,
    parse_iso8601,
    strip_jsonp,
    unified_timestamp,
    url_or_none,
)


class AMPIE(InfoExtractor):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # parse Akamai Adaptive Media Player feed
    def _extract_feed_info(self, url):
        """Download an Akamai AMP feed and build an info dict from its item.

        The feed at ``url`` is fetched as JSON (a JSONP wrapper is stripped
        first) and the ``channel.item`` node is read for the id, title,
        description, thumbnails, subtitles, formats, timestamp and duration.

        Raises ExtractorError if the feed contains no item; the feed's own
        ``error`` value is included in the message when it is present.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``thumbnails``, ``timestamp``, ``duration``, ``subtitles`` and
        ``formats``.
        """
        feed = self._download_json(
            url, None, 'Downloading Akamai AMP feed',
            'Unable to download Akamai AMP feed', transform_source=strip_jsonp)
        # `channel` may be missing or null, so fall back to an empty dict
        item = (feed.get('channel') or {}).get('item')
        if not item:
            error = feed.get('error')
            if error:
                raise ExtractorError(f'{self.IE_NAME} said: {error}')
            # Neither an item nor an error message: still fail with a
            # proper extractor error rather than a KeyError
            raise ExtractorError('Unable to find item in Akamai AMP feed')

        video_id = item['guid']

        def get_media_node(name, default=None):
            # Look for `media-<name>` in the media group first, then on the
            # item itself, and finally for the plain `<name>` key
            media_name = f'media-{name}'
            media_group = item.get('media-group') or item
            return media_group.get(media_name) or item.get(media_name) or item.get(name, default)

        def as_node_list(node):
            # A node is either a single dict or a list of dicts; normalize
            # both to a list and drop anything that is not a dict so that
            # missing or malformed nodes yield an empty list
            if isinstance(node, dict):
                return [node]
            if isinstance(node, list):
                return [entry for entry in node if isinstance(entry, dict)]
            return []

        thumbnails = []
        for thumbnail_data in as_node_list(get_media_node('thumbnail')):
            thumbnail = thumbnail_data.get('@attributes') or {}
            thumbnail_url = url_or_none(thumbnail.get('url'))
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': self._proto_relative_url(thumbnail_url, 'http:'),
                'width': int_or_none(thumbnail.get('width')),
                'height': int_or_none(thumbnail.get('height')),
            })

        subtitles = {}
        for subtitle_data in as_node_list(get_media_node('subTitle')):
            subtitle = subtitle_data.get('@attributes') or {}
            subtitle_href = url_or_none(subtitle.get('href'))
            if not subtitle_href:
                continue
            # Subtitles without a language attribute are filed under 'en'
            subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
                'url': subtitle_href,
                'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
            })

        formats = []
        # An item without any content node yields an empty format list
        # instead of a TypeError
        media_content = as_node_list(get_media_node('content'))
        for media_data in media_content:
            media = media_data.get('@attributes') or {}
            media_url = url_or_none(media.get('url'))
            if not media_url:
                continue
            # Prefer the declared MIME type over the URL's file extension
            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                    video_id, f4m_id='hds', fatal=False))
            elif ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                media_category = media_data.get('media-category') or {}
                formats.append({
                    'format_id': (media_category.get('@attributes') or {}).get('label'),
                    'url': media_url,
                    'tbr': int_or_none(media.get('bitrate')),
                    'filesize': int_or_none(media.get('fileSize')),
                    'ext': ext,
                })

        timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))

        # Duration is read from the first content node, if there is one
        first_media = media_content[0] if media_content else {}

        return {
            'id': video_id,
            'title': get_media_node('title'),
            'description': get_media_node('description'),
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'duration': int_or_none((first_media.get('@attributes') or {}).get('duration')),
            'subtitles': subtitles,
            'formats': formats,
        }
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`from .common import InfoExtractor`
			`from ..utils import (`
[amp] extract error message(closes #12795) 2017-04-20 00:16:41 -04:00			`ExtractorError,`
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409) Authored by: bashonly, seproDev, Grub4K Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> 2024-05-26 15:27:21 -04:00			`determine_ext,`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`int_or_none,`
			`mimetype2ext,`
			`parse_iso8601,`
[extractor/foxnews] Fix extractors (#7222) Closes #6050 Authored by: bashonly 2023-06-04 09:37:59 -04:00			`strip_jsonp,`
Update to ytdl-2021.02.04.1 except youtube 2021-02-04 02:56:01 -05:00			`unified_timestamp,`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`url_or_none,`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`)`


[cleanup] Misc Closes #5541 2022-11-15 19:57:43 -05:00			`class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`# parse Akamai Adaptive Media Player feed`
			`def _extract_feed_info(self, url):`
[amp] extract error message(closes #12795) 2017-04-20 00:16:41 -04:00			`feed = self._download_json(`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`url, None, 'Downloading Akamai AMP feed',`
[extractor/foxnews] Fix extractors (#7222) Closes #6050 Authored by: bashonly 2023-06-04 09:37:59 -04:00			`'Unable to download Akamai AMP feed', transform_source=strip_jsonp)`
[amp] extract error message(closes #12795) 2017-04-20 00:16:41 -04:00			`item = feed.get('channel', {}).get('item')`
			`if not item:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`raise ExtractorError('{} said: {}'.format(self.IE_NAME, feed['error']))`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00
			`video_id = item['guid']`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00
			`def get_media_node(name, default=None):`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 19:09:58 -04:00			`media_name = f'media-{name}'`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`media_group = item.get('media-group') or item`
			`return media_group.get(media_name) or item.get(media_name) or item.get(name, default)`

[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`thumbnails = []`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`media_thumbnail = get_media_node('thumbnail')`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`if media_thumbnail:`
			`if isinstance(media_thumbnail, dict):`
			`media_thumbnail = [media_thumbnail]`
			`for thumbnail_data in media_thumbnail:`
[amp] imporove thumbnail and subtitle extraction 2017-05-01 19:06:19 -04:00			`thumbnail = thumbnail_data.get('@attributes', {})`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`thumbnail_url = url_or_none(thumbnail.get('url'))`
[amp] imporove thumbnail and subtitle extraction 2017-05-01 19:06:19 -04:00			`if not thumbnail_url:`
			`continue`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`thumbnails.append({`
[amp] imporove thumbnail and subtitle extraction 2017-05-01 19:06:19 -04:00			`'url': self._proto_relative_url(thumbnail_url, 'http:'),`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`'width': int_or_none(thumbnail.get('width')),`
			`'height': int_or_none(thumbnail.get('height')),`
			`})`

			`subtitles = {}`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`media_subtitle = get_media_node('subTitle')`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`if media_subtitle:`
			`if isinstance(media_subtitle, dict):`
			`media_subtitle = [media_subtitle]`
			`for subtitle_data in media_subtitle:`
[amp] imporove thumbnail and subtitle extraction 2017-05-01 19:06:19 -04:00			`subtitle = subtitle_data.get('@attributes', {})`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`subtitle_href = url_or_none(subtitle.get('href'))`
[amp] imporove thumbnail and subtitle extraction 2017-05-01 19:06:19 -04:00			`if not subtitle_href:`
			`continue`
			`subtitles.setdefault(subtitle.get('lang') or 'en', []).append({`
			`'url': subtitle_href,`
			`'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),`
			`})`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00
			`formats = []`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`media_content = get_media_node('content')`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`if isinstance(media_content, dict):`
			`media_content = [media_content]`
			`for media_data in media_content:`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`media = media_data.get('@attributes', {})`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`media_url = url_or_none(media.get('url'))`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`if not media_url:`
			`continue`
[amp] Fix a typo 2016-07-06 08:10:47 -04:00			`ext = mimetype2ext(media.get('type')) or determine_ext(media_url)`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`if ext == 'f4m':`
Simplify formats accumulation for f4m/m3u8/smil formats Now all _extract_*_formats routines return a list 2015-12-28 13:58:24 -05:00			`formats.extend(self._extract_f4m_formats(`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',`
Simplify formats accumulation for f4m/m3u8/smil formats Now all _extract_*_formats routines return a list 2015-12-28 13:58:24 -05:00			`video_id, f4m_id='hds', fatal=False))`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`elif ext == 'm3u8':`
[extractor/foxnews] Fix extractors (#7222) Closes #6050 Authored by: bashonly 2023-06-04 09:37:59 -04:00			`fmts, subs = self._extract_m3u8_formats_and_subtitles(`
			`media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)`
			`formats.extend(fmts)`
			`self._merge_subtitles(subs, target=subtitles)`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`else:`
			`formats.append({`
[abcnews] Added a new extractor (closes #3992) Related: #6108, #8664, #9459 2016-05-17 03:38:57 -04:00			`'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),`
Improve URL extraction 2018-07-21 08:08:28 -04:00			`'url': media_url,`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`'tbr': int_or_none(media.get('bitrate')),`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`'filesize': int_or_none(media.get('fileSize')),`
use mimetype2ext to determine manifest ext in multiple extractors 2016-07-06 04:11:46 -04:00			`'ext': ext,`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`})`

Update to ytdl-2021.02.04.1 except youtube 2021-02-04 02:56:01 -05:00			`timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))`
[amp] Fix upload timestamp extraction (Closes #9007) 2016-03-27 15:13:47 -04:00
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`return {`
			`'id': video_id,`
[bleacherreport] fix style issues and simplify 2015-12-21 05:12:58 -05:00			`'title': get_media_node('title'),`
			`'description': get_media_node('description'),`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`'thumbnails': thumbnails,`
[amp] Fix upload timestamp extraction (Closes #9007) 2016-03-27 15:13:47 -04:00			`'timestamp': timestamp,`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')),`
[amp] Add missing subtitles to info dict 2016-01-04 14:05:37 -05:00			`'subtitles': subtitles,`
[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors 2015-11-07 10:54:35 -05:00			`'formats': formats,`
			`}`