[youtube] Quick extraction tempfix (closes #22367, closes #22163)

2024-12-28 08:21:41 +00:00 · 2019-09-11 22:44:47 +07:00 · 2019-09-11 22:44:47 +07:00 · bf1317d257
commit bf1317d257
parent bff90fc518
1 changed files with 106 additions and 78 deletions
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1915,6 +1915,9 @@ def _extract_filesize(media_url):
            return int_or_none(self._search_regex(
                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
        streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
        streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
@ -1923,10 +1926,11 @@ def _extract_filesize(media_url):
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
-        elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
+        elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
            formats = []
            formats_spec = {}
            fmt_list = video_info.get('fmt_list', [''])[0]
            if fmt_list:
@ -1941,90 +1945,105 @@ def _extract_filesize(media_url):
                                'height': int_or_none(width_height[1]),
                            }
            q = qualities(['small', 'medium', 'hd720'])
-            streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
+            for fmt in streaming_formats:
-            if streaming_formats:
+                itag = str_or_none(fmt.get('itag'))
-                for fmt in streaming_formats:
+                if not itag:
                    itag = str_or_none(fmt.get('itag'))
                    if not itag:
                        continue
                    quality = fmt.get('quality')
                    quality_label = fmt.get('qualityLabel') or quality
                    formats_spec[itag] = {
                        'asr': int_or_none(fmt.get('audioSampleRate')),
                        'filesize': int_or_none(fmt.get('contentLength')),
                        'format_note': quality_label,
                        'fps': int_or_none(fmt.get('fps')),
                        'height': int_or_none(fmt.get('height')),
                        'quality': q(quality),
                        # bitrate for itag 43 is always 2147483647
                        'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
                        'width': int_or_none(fmt.get('width')),
                    }
            formats = []
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
                    continue
                quality = fmt.get('quality')
                quality_label = fmt.get('qualityLabel') or quality
                formats_spec[itag] = {
                    'asr': int_or_none(fmt.get('audioSampleRate')),
                    'filesize': int_or_none(fmt.get('contentLength')),
                    'format_note': quality_label,
                    'fps': int_or_none(fmt.get('fps')),
                    'height': int_or_none(fmt.get('height')),
                    'quality': q(quality),
                    # bitrate for itag 43 is always 2147483647
                    'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
                    'width': int_or_none(fmt.get('width')),
                }
            for fmt in streaming_formats:
                if fmt.get('drm_families'):
                    continue
                url = url_or_none(fmt.get('url'))
                if not url:
                    cipher = fmt.get('cipher')
                    if not cipher:
                        continue
                    url_data = compat_parse_qs(cipher)
                    url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
                    if not url:
                        continue
                else:
                    cipher = None
                    url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
                stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
                # Unsupported FORMAT_STREAM_TYPE_OTF
                if stream_type == 3:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]
-                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
+                format_id = fmt.get('itag') or url_data['itag'][0]
-                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+                if not format_id:
-                    jsplayer_url_json = self._search_regex(
+                    continue
-                        ASSETS_RE,
+                format_id = compat_str(format_id)
-                        embed_webpage if age_gate else video_webpage,
+
-                        'JS player URL (1)', default=None)
+                if cipher:
-                    if not jsplayer_url_json and not age_gate:
+                    if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
-                        # We need the embed website after all
+                        ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
                        if embed_webpage is None:
                            embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                            embed_webpage = self._download_webpage(
                                embed_url, video_id, 'Downloading embed webpage')
                        jsplayer_url_json = self._search_regex(
-                            ASSETS_RE, embed_webpage, 'JS player URL')
+                            ASSETS_RE,
                            embed_webpage if age_gate else video_webpage,
                            'JS player URL (1)', default=None)
                        if not jsplayer_url_json and not age_gate:
                            # We need the embed website after all
                            if embed_webpage is None:
                                embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                                embed_webpage = self._download_webpage(
                                    embed_url, video_id, 'Downloading embed webpage')
                            jsplayer_url_json = self._search_regex(
                                ASSETS_RE, embed_webpage, 'JS player URL')
-                    player_url = json.loads(jsplayer_url_json)
+                        player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if player_url is None:
-                            player_version = 'unknown'
+                            player_url_json = self._search_regex(
-                            player_desc = 'unknown'
+                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
-                        else:
+                                video_webpage, 'age gate player URL')
-                            if player_url.endswith('swf'):
+                            player_url = json.loads(player_url_json)
-                                player_version = self._search_regex(
+
-                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
+                    if 'sig' in url_data:
-                                    'flash player', fatal=False)
+                        url += '&signature=' + url_data['sig'][0]
-                                player_desc = 'flash player %s' % player_version
+                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if player_url is None:
                                player_version = 'unknown'
                                player_desc = 'unknown'
                            else:
-                                player_version = self._search_regex(
+                                if player_url.endswith('swf'):
-                                    [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+                                    player_version = self._search_regex(
-                                     r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
+                                        r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
-                                    player_url,
+                                        'flash player', fatal=False)
-                                    'html5 player', fatal=False)
+                                    player_desc = 'flash player %s' % player_version
-                                player_desc = 'html5 player %s' % player_version
+                                else:
                                    player_version = self._search_regex(
                                        [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
                                         r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
                                        player_url,
                                        'html5 player', fatal=False)
                                    player_desc = 'html5 player %s' % player_version
-                        parts_sizes = self._signature_cache_id(encrypted_sig)
+                            parts_sizes = self._signature_cache_id(encrypted_sig)
-                        self.to_screen('{%s} signature length %s, %s' %
+                            self.to_screen('{%s} signature length %s, %s' %
-                                       (format_id, parts_sizes, player_desc))
+                                           (format_id, parts_sizes, player_desc))
-                    signature = self._decrypt_signature(
+                        signature = self._decrypt_signature(
-                        encrypted_sig, video_id, player_url, age_gate)
+                            encrypted_sig, video_id, player_url, age_gate)
-                    sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
+                        sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
-                    url += '&%s=%s' % (sp, signature)
+                        url += '&%s=%s' % (sp, signature)
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
@ -2044,24 +2063,33 @@ def _extract_filesize(media_url):
                mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
                width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
                if width is None:
                    width = int_or_none(fmt.get('width'))
                if height is None:
                    height = int_or_none(fmt.get('height'))
                filesize = int_or_none(url_data.get(
                    'clen', [None])[0]) or _extract_filesize(url)
-                quality = url_data.get('quality', [None])[0]
+                quality = url_data.get('quality', [None])[0] or fmt.get('quality')
                quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
                tbr = float_or_none(url_data.get('bitrate', [None])[0], 1000) or float_or_none(fmt.get('bitrate'), 1000)
                fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
                more_fields = {
                    'filesize': filesize,
-                    'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
+                    'tbr': tbr,
                    'width': width,
                    'height': height,
-                    'fps': int_or_none(url_data.get('fps', [None])[0]),
+                    'fps': fps,
-                    'format_note': url_data.get('quality_label', [None])[0] or quality,
+                    'format_note': quality_label or quality,
                    'quality': q(quality),
                }
                for key, value in more_fields.items():
                    if value:
                        dct[key] = value
-                type_ = url_data.get('type', [None])[0]
+                type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
                if type_:
                    type_split = type_.split(';')
                    kind_ext = type_split[0].split('/')