[extractor/anvato] Fix extractor and refactor (#5074)

Authored by: bashonly
2024-11-07 20:30:41 -05:00 · 2022-10-03 16:15:22 +00:00 · 2022-10-03 16:15:22 +00:00 · 4a61501db9
commit 4a61501db9
parent 7244895bde
6 changed files with 116 additions and 113 deletions
--- a/3
+++ b/3
@ -74,8 +74,7 @@ offlinetest: codetest
 	$(PYTHON) -m pytest -k "not download"
 # XXX: This is hard to maintain
-CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat \
+CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat
               yt_dlp/extractor/anvato_token_generator
 yt-dlp: yt_dlp/*.py yt_dlp/*/*.py
 	mkdir -p zip
 	for d in $(CODE_FOLDERS) ; do \
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,6 @@ def packages():
    return [
        'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat',
        'yt_dlp.extractor.anvato_token_generator',
    ]
--- a/yt_dlp/extractor/anvato.py
+++ b/yt_dlp/extractor/anvato.py
@ -5,10 +5,8 @@
 import re
 import time
 from .anvato_token_generator import NFLTokenGenerator
 from .common import InfoExtractor
 from ..aes import aes_encrypt
 from ..compat import compat_str
 from ..utils import (
    bytes_to_intlist,
    determine_ext,
@ -16,20 +14,61 @@
    int_or_none,
    join_nonempty,
    strip_jsonp,
    smuggle_url,
    traverse_obj,
    unescapeHTML,
    unsmuggle_url,
 )
 def md5_text(s):
-    if not isinstance(s, compat_str):
+    return hashlib.md5(str(s).encode()).hexdigest()
        s = compat_str(s)
    return hashlib.md5(s.encode('utf-8')).hexdigest()
 class AnvatoIE(InfoExtractor):
    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
    _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2'
    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'  # from anvplayer.min.js
    _TESTS = [{
        # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
        'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
        'md5': '921919dab3cd0b849ff3d624831ae3e2',
        'info_dict': {
            'id': '899441',
            'ext': 'mp4',
            'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
            'description': 'md5:85e05a3cc163f8c344340f220521136d',
            'upload_date': '20201215',
            'timestamp': 1608009755,
            'thumbnail': r're:^https?://.*\.jpg',
            'uploader': 'NFL',
            'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
                     'Player Highlights', 'Cleveland Browns', 'league'],
            'duration': 157,
            'categories': ['Entertainment', 'Game', 'Highlights'],
        },
    }, {
        # from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
        'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
        'md5': '837718bcfb3a7778d022f857f7a9b19e',
        'info_dict': {
            'id': '8032455',
            'ext': 'mp4',
            'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream',
            'description': 'md5:0a12bab8159445e78f52a297a35c6609',
            'upload_date': '20220928',
            'timestamp': 1664408881,
            'thumbnail': r're:^https?://.*\.jpg',
            'uploader': 'LIN',
            'tags': ['video', 'news', '5live'],
            'duration': 155,
            'categories': ['News'],
        },
    }]
    # Copied from anvplayer.min.js
    _ANVACK_TABLE = {
        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@ -202,86 +241,74 @@ class AnvatoIE(InfoExtractor):
        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
    }
    def _generate_nfl_token(self, anvack, mcp_id):
        reroute = self._download_json(
            'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
            headers={'X-Domain-Id': 100}, note='Fetching token info')
        token_type = reroute.get('token_type') or 'Bearer'
        auth_token = f'{token_type} {reroute["access_token"]}'
        response = self._download_json(
            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
                'query': '''{
  viewer {
    mediaToken(anvack: "%s", id: %s) {
      token
    }
  }
 }''' % (anvack, mcp_id),
            }).encode(), headers={
                'Authorization': auth_token,
                'Content-Type': 'application/json',
            }, note='Fetching NFL API token')
        return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
    _TOKEN_GENERATORS = {
-        'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
+        'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
    }
    _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
    _TESTS = [{
        # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
        'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
        'info_dict': {
            'id': '4465496',
            'ext': 'mp4',
            'title': 'VIDEO: Humpback whale breaches right next to NH boat',
            'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
            'duration': 22,
            'timestamp': 1534855680,
            'upload_date': '20180821',
            'uploader': 'ANV',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
        'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
        'only_matching': True,
    }]
    def __init__(self, *args, **kwargs):
        super(AnvatoIE, self).__init__(*args, **kwargs)
        self.__server_time = None
    def _server_time(self, access_key, video_id):
-        if self.__server_time is not None:
+        return int_or_none(traverse_obj(self._download_json(
-            return self.__server_time
+            f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
            note='Fetching server time', fatal=False), 'server_time')) or int(time.time())
-        self.__server_time = int(self._download_json(
+    def _get_video_json(self, access_key, video_id, extracted_token):
            self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
            note='Fetching server time')['server_time'])
        return self.__server_time
    def _api_prefix(self, access_key):
        return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
    def _get_video_json(self, access_key, video_id):
        # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
-        video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+        video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}'
        server_time = self._server_time(access_key, video_id)
-        input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+        input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}'
        auth_secret = intlist_to_bytes(aes_encrypt(
            bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
-
+        query = {
-        video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+            'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'),
            'rtyp': 'fp',
        }
        anvrid = md5_text(time.time() * 1000 * random.random())[:30]
        api = {
            'anvrid': anvrid,
            'anvts': server_time,
        }
-        if self._TOKEN_GENERATORS.get(access_key) is not None:
+        if extracted_token is not None:
-            api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
+            api['anvstk2'] = extracted_token
        elif self._TOKEN_GENERATORS.get(access_key) is not None:
            api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
        elif self._ANVACK_TABLE.get(access_key) is not None:
            api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
        else:
-            api['anvstk'] = md5_text('%s|%s|%d|%s' % (
+            api['anvstk2'] = 'default'
                access_key, anvrid, server_time,
                self._ANVACK_TABLE.get(access_key, self._API_KEY)))
        return self._download_json(
-            video_data_url, video_id, transform_source=strip_jsonp,
+            video_data_url, video_id, transform_source=strip_jsonp, query=query,
-            data=json.dumps({'api': api}).encode('utf-8'))
+            data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8'))
-    def _get_anvato_videos(self, access_key, video_id):
+    def _get_anvato_videos(self, access_key, video_id, token):
-        video_data = self._get_video_json(access_key, video_id)
+        video_data = self._get_video_json(access_key, video_id, token)
        formats = []
        for published_url in video_data['published_urls']:
-            video_url = published_url['embed_url']
+            video_url = published_url.get('embed_url')
            if not video_url:
                continue
            media_format = published_url.get('format')
            ext = determine_ext(video_url)
@ -296,15 +323,27 @@ def _get_anvato_videos(self, access_key, video_id):
                'tbr': tbr or None,
            }
-            if media_format == 'm3u8' and tbr is not None:
+            vtt_subs, hls_subs = {}, {}
            if media_format == 'vtt':
                _, vtt_subs = self._extract_m3u8_formats_and_subtitles(
                    video_url, video_id, m3u8_id='vtt', fatal=False)
                continue
            elif media_format == 'm3u8' and tbr is not None:
                a_format.update({
                    'format_id': join_nonempty('hls', tbr),
                    'ext': 'mp4',
                })
            elif media_format == 'm3u8-variant' or ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                # For some videos the initial m3u8 URL returns JSON instead
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                manifest_json = self._download_json(
-                    m3u8_id='hls', fatal=False))
+                    video_url, video_id, note='Downloading manifest JSON', errnote=False)
                if manifest_json:
                    video_url = manifest_json.get('master_m3u8')
                    if not video_url:
                        continue
                hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles(
                    video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)
                formats.extend(hls_fmts)
                continue
            elif ext == 'mp3' or media_format == 'mp3':
                a_format['vcodec'] = 'none'
@ -324,6 +363,7 @@ def _get_anvato_videos(self, access_key, video_id):
                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
            }
            subtitles.setdefault(caption['language'], []).append(a_caption)
        subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs)
        return {
            'id': video_id,
@ -349,7 +389,10 @@ def _extract_from_webpage(cls, url, webpage):
                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
            if not (video_id or '').isdigit() or not access_key:
                continue
-            yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)
+            url = f'anvato:{access_key}:{video_id}'
            if anvplayer_data.get('token'):
                url = smuggle_url(url, {'token': anvplayer_data['token']})
            yield cls.url_result(url, AnvatoIE, video_id)
    def _extract_anvato_videos(self, webpage, video_id):
        anvplayer_data = self._parse_json(
@ -357,7 +400,7 @@ def _extract_anvato_videos(self, webpage, video_id):
                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
            video_id)
        return self._get_anvato_videos(
-            anvplayer_data['accessKey'], anvplayer_data['video'])
+            anvplayer_data['accessKey'], anvplayer_data['video'], 'default')  # cbslocal token = 'default'
    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})
@ -365,9 +408,7 @@ def _real_extract(self, url):
            'countries': smuggled_data.get('geo_countries'),
        })
-        mobj = self._match_valid_url(url)
+        access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id')
        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
        if access_key not in self._ANVACK_TABLE:
-            access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key
-                access_key) or access_key
+        return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token'))
        return self._get_anvato_videos(access_key, video_id)
--- a/yt_dlp/extractor/anvato_token_generator/init.py
+++ b/yt_dlp/extractor/anvato_token_generator/init.py
@ -1,5 +0,0 @@
 from .nfl import NFLTokenGenerator
 __all__ = [
    'NFLTokenGenerator',
 ]
--- a/yt_dlp/extractor/anvato_token_generator/common.py
+++ b/yt_dlp/extractor/anvato_token_generator/common.py
@ -1,3 +0,0 @@
 class TokenGenerator:
    def generate(self, anvack, mcp_id):
        raise NotImplementedError('This method must be implemented by subclasses')
--- a/yt_dlp/extractor/anvato_token_generator/nfl.py
+++ b/yt_dlp/extractor/anvato_token_generator/nfl.py
@ -1,28 +0,0 @@
 import json
 from .common import TokenGenerator
 class NFLTokenGenerator(TokenGenerator):
    _AUTHORIZATION = None
    def generate(ie, anvack, mcp_id):
        if not NFLTokenGenerator._AUTHORIZATION:
            reroute = ie._download_json(
                'https://api.nfl.com/v1/reroute', mcp_id,
                data=b'grant_type=client_credentials',
                headers={'X-Domain-Id': 100})
            NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
        return ie._download_json(
            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
                'query': '''{
  viewer {
    mediaToken(anvack: "%s", id: %s) {
      token
    }
  }
 }''' % (anvack, mcp_id),
            }).encode(), headers={
                'Authorization': NFLTokenGenerator._AUTHORIZATION,
                'Content-Type': 'application/json',
            })['data']['viewer']['mediaToken']['token']