From 62bff2c170a8f12d937d62a2ca74586c5e0eff1f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 1 Mar 2021 05:18:37 +0530 Subject: [PATCH] Add option `--extractor-retries` to retry on known extractor errors * Currently only used by youtube Fixes https://github.com/ytdl-org/youtube-dl/issues/28194 Possibly also fixes: https://github.com/ytdl-org/youtube-dl/issues/28289 (can not confirm since the issue isn't reliably reproducible) --- yt_dlp/YoutubeDL.py | 17 ++++++------- yt_dlp/__init__.py | 9 ++++--- yt_dlp/extractor/youtube.py | 48 ++++++++++++++++++++++--------------- yt_dlp/options.py | 4 ++++ 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e9cb7e187..d1f365814 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -381,17 +381,18 @@ class YoutubeDL(object): Use 'default' as the name for arguments to passed to all PP The following options are used by the extractors: - dynamic_mpd: Whether to process dynamic DASH manifests (default: True) + extractor_retries: Number of times to retry for known errors + dynamic_mpd: Whether to process dynamic DASH manifests (default: True) hls_split_discontinuity: Split HLS playlists to different formats at - discontinuities such as ad breaks (default: False) + discontinuities such as ad breaks (default: False) youtube_include_dash_manifest: If True (default), DASH manifests and related - data will be downloaded and processed by extractor. - You can reduce network I/O by disabling it if you don't - care about DASH. (only for youtube) + data will be downloaded and processed by extractor. + You can reduce network I/O by disabling it if you don't + care about DASH. (only for youtube) youtube_include_hls_manifest: If True (default), HLS manifests and related - data will be downloaded and processed by extractor. - You can reduce network I/O by disabling it if you don't - care about HLS. (only for youtube) + data will be downloaded and processed by extractor. 
+ You can reduce network I/O by disabling it if you don't + care about HLS. (only for youtube) """ _NUMERIC_FIELDS = set(( diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 2fd49cc8f..b8b8495e6 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -181,19 +181,21 @@ def _real_main(argv=None): # --yes-overwrites implies --no-continue opts.continue_dl = False - def parse_retries(retries): + def parse_retries(retries, name=''): if retries in ('inf', 'infinite'): parsed_retries = float('inf') else: try: parsed_retries = int(retries) except (TypeError, ValueError): - parser.error('invalid retry count specified') + parser.error('invalid %sretry count specified' % name) return parsed_retries if opts.retries is not None: opts.retries = parse_retries(opts.retries) if opts.fragment_retries is not None: - opts.fragment_retries = parse_retries(opts.fragment_retries) + opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ') + if opts.extractor_retries is not None: + opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ') if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -458,6 +460,7 @@ def report_args_compat(arg, name): 'overwrites': opts.overwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, + 'extractor_retries': opts.extractor_retries, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'keep_fragments': opts.keep_fragments, 'buffersize': opts.buffersize, diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 804186b85..2e4ce4c12 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2762,28 +2762,36 @@ def extract_entries(parent_renderer): # this needs to called again for continua for page_num in itertools.count(1): if not continuation: break - count = 0 - retries = 3 - while count <= retries: + retries = self._downloader.params.get('extractor_retries', 3) + 
count = -1 + last_error = None + while count < retries: + count += 1 + if last_error: + self.report_warning('%s. Retrying ...' % last_error) try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry browse = self._download_json( 'https://www.youtube.com/browse_ajax', None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), headers=headers, query=continuation) - break except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404): + # Downloading page may result in intermittent 5xx HTTP error + # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + last_error = 'HTTP Error %s' % e.cause.code + if count < retries: continue raise - if not browse: - break - response = try_get(browse, lambda x: x[1]['response'], dict) - if not response: + else: + response = try_get(browse, lambda x: x[1]['response'], dict) + + # Youtube sometimes sends incomplete data + # See: https://github.com/ytdl-org/youtube-dl/issues/28194 + if response.get('continuationContents') or response.get('onResponseReceivedActions'): + break + last_error = 'Incomplete data recieved' + if not browse or not response: break known_continuation_renderers = { @@ -3004,11 +3012,16 @@ def _real_extract(self, url): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - count = 0 - retries = 3 + retries = self._downloader.params.get('extractor_retries', 3) + count = -1 while count < retries: + count += 1 # Sometimes youtube returns a webpage with incomplete ytInitialData - webpage = self._download_webpage(url, item_id) + # See: https://github.com/yt-dlp/yt-dlp/issues/116 + if count: + 
self.report_warning('Incomplete yt initial data recieved. Retrying ...') + webpage = self._download_webpage(url, item_id, + 'Downloading webpage%s' % ' (retry #%d)' % count if count else '') identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) err_msg = None @@ -3023,9 +3036,6 @@ def _real_extract(self, url): raise ExtractorError('YouTube said: %s' % err_msg, expected=True) if data.get('contents') or data.get('currentVideoEndpoint'): break - count += 1 - self.to_screen( - 'Incomplete yt initial data recieved. Retrying (attempt %d of %d)...' % (count, retries)) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 59c08fb18..866c50cb9 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1218,6 +1218,10 @@ def _dict_from_multiple_values_options_callback( help=optparse.SUPPRESS_HELP) extractor = optparse.OptionGroup(parser, 'Extractor Options') + extractor.add_option( + '--extractor-retries', + dest='extractor_retries', metavar='RETRIES', default=10, + help='Number of retries for known extractor errors (default is %default), or "infinite"') extractor.add_option( '--allow-dynamic-mpd', '--no-ignore-dynamic-mpd', action='store_true', dest='dynamic_mpd', default=True,