Add option --extractor-retries to retry on known extractor errors

* Currently only used by the YouTube extractor

Fixes https://github.com/ytdl-org/youtube-dl/issues/28194
Possibly also fixes: https://github.com/ytdl-org/youtube-dl/issues/28289 (cannot confirm, since the issue isn't reliably reproducible)
This commit is contained in:
pukkandan 2021-03-01 05:18:37 +05:30
parent f0884c8b3f
commit 62bff2c170
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698
4 changed files with 48 additions and 30 deletions

View file

@ -381,17 +381,18 @@ class YoutubeDL(object):
Use 'default' as the name for arguments to passed to all PP Use 'default' as the name for arguments to passed to all PP
The following options are used by the extractors: The following options are used by the extractors:
dynamic_mpd: Whether to process dynamic DASH manifests (default: True) extractor_retries: Number of times to retry for known errors
dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
hls_split_discontinuity: Split HLS playlists to different formats at hls_split_discontinuity: Split HLS playlists to different formats at
discontinuities such as ad breaks (default: False) discontinuities such as ad breaks (default: False)
youtube_include_dash_manifest: If True (default), DASH manifests and related youtube_include_dash_manifest: If True (default), DASH manifests and related
data will be downloaded and processed by extractor. data will be downloaded and processed by extractor.
You can reduce network I/O by disabling it if you don't You can reduce network I/O by disabling it if you don't
care about DASH. (only for youtube) care about DASH. (only for youtube)
youtube_include_hls_manifest: If True (default), HLS manifests and related youtube_include_hls_manifest: If True (default), HLS manifests and related
data will be downloaded and processed by extractor. data will be downloaded and processed by extractor.
You can reduce network I/O by disabling it if you don't You can reduce network I/O by disabling it if you don't
care about HLS. (only for youtube) care about HLS. (only for youtube)
""" """
_NUMERIC_FIELDS = set(( _NUMERIC_FIELDS = set((

View file

@ -181,19 +181,21 @@ def _real_main(argv=None):
# --yes-overwrites implies --no-continue # --yes-overwrites implies --no-continue
opts.continue_dl = False opts.continue_dl = False
def parse_retries(retries): def parse_retries(retries, name=''):
if retries in ('inf', 'infinite'): if retries in ('inf', 'infinite'):
parsed_retries = float('inf') parsed_retries = float('inf')
else: else:
try: try:
parsed_retries = int(retries) parsed_retries = int(retries)
except (TypeError, ValueError): except (TypeError, ValueError):
parser.error('invalid retry count specified') parser.error('invalid %sretry count specified' % name)
return parsed_retries return parsed_retries
if opts.retries is not None: if opts.retries is not None:
opts.retries = parse_retries(opts.retries) opts.retries = parse_retries(opts.retries)
if opts.fragment_retries is not None: if opts.fragment_retries is not None:
opts.fragment_retries = parse_retries(opts.fragment_retries) opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ')
if opts.extractor_retries is not None:
opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ')
if opts.buffersize is not None: if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
if numeric_buffersize is None: if numeric_buffersize is None:
@ -458,6 +460,7 @@ def report_args_compat(arg, name):
'overwrites': opts.overwrites, 'overwrites': opts.overwrites,
'retries': opts.retries, 'retries': opts.retries,
'fragment_retries': opts.fragment_retries, 'fragment_retries': opts.fragment_retries,
'extractor_retries': opts.extractor_retries,
'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'skip_unavailable_fragments': opts.skip_unavailable_fragments,
'keep_fragments': opts.keep_fragments, 'keep_fragments': opts.keep_fragments,
'buffersize': opts.buffersize, 'buffersize': opts.buffersize,

View file

@ -2762,28 +2762,36 @@ def extract_entries(parent_renderer): # this needs to called again for continua
for page_num in itertools.count(1): for page_num in itertools.count(1):
if not continuation: if not continuation:
break break
count = 0 retries = self._downloader.params.get('extractor_retries', 3)
retries = 3 count = -1
while count <= retries: last_error = None
while count < retries:
count += 1
if last_error:
self.report_warning('%s. Retrying ...' % last_error)
try: try:
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
browse = self._download_json( browse = self._download_json(
'https://www.youtube.com/browse_ajax', None, 'https://www.youtube.com/browse_ajax', None,
'Downloading page %d%s' 'Downloading page %d%s'
% (page_num, ' (retry #%d)' % count if count else ''), % (page_num, ' (retry #%d)' % count if count else ''),
headers=headers, query=continuation) headers=headers, query=continuation)
break
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
count += 1 # Downloading page may result in intermittent 5xx HTTP error
if count <= retries: # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
last_error = 'HTTP Error %s' % e.cause.code
if count < retries:
continue continue
raise raise
if not browse: else:
break response = try_get(browse, lambda x: x[1]['response'], dict)
response = try_get(browse, lambda x: x[1]['response'], dict)
if not response: # Youtube sometimes sends incomplete data
# See: https://github.com/ytdl-org/youtube-dl/issues/28194
if response.get('continuationContents') or response.get('onResponseReceivedActions'):
break
last_error = 'Incomplete data recieved'
if not browse or not response:
break break
known_continuation_renderers = { known_continuation_renderers = {
@ -3004,11 +3012,16 @@ def _real_extract(self, url):
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
count = 0 retries = self._downloader.params.get('extractor_retries', 3)
retries = 3 count = -1
while count < retries: while count < retries:
count += 1
# Sometimes youtube returns a webpage with incomplete ytInitialData # Sometimes youtube returns a webpage with incomplete ytInitialData
webpage = self._download_webpage(url, item_id) # See: https://github.com/yt-dlp/yt-dlp/issues/116
if count:
self.report_warning('Incomplete yt initial data recieved. Retrying ...')
webpage = self._download_webpage(url, item_id,
'Downloading webpage%s' % ' (retry #%d)' % count if count else '')
identity_token = self._extract_identity_token(webpage, item_id) identity_token = self._extract_identity_token(webpage, item_id)
data = self._extract_yt_initial_data(item_id, webpage) data = self._extract_yt_initial_data(item_id, webpage)
err_msg = None err_msg = None
@ -3023,9 +3036,6 @@ def _real_extract(self, url):
raise ExtractorError('YouTube said: %s' % err_msg, expected=True) raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
if data.get('contents') or data.get('currentVideoEndpoint'): if data.get('contents') or data.get('currentVideoEndpoint'):
break break
count += 1
self.to_screen(
'Incomplete yt initial data recieved. Retrying (attempt %d of %d)...' % (count, retries))
tabs = try_get( tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)

View file

@ -1218,6 +1218,10 @@ def _dict_from_multiple_values_options_callback(
help=optparse.SUPPRESS_HELP) help=optparse.SUPPRESS_HELP)
extractor = optparse.OptionGroup(parser, 'Extractor Options') extractor = optparse.OptionGroup(parser, 'Extractor Options')
extractor.add_option(
'--extractor-retries',
dest='extractor_retries', metavar='RETRIES', default=10,
help='Number of retries for known extractor errors (default is %default), or "infinite"')
extractor.add_option( extractor.add_option(
'--allow-dynamic-mpd', '--no-ignore-dynamic-mpd', '--allow-dynamic-mpd', '--no-ignore-dynamic-mpd',
action='store_true', dest='dynamic_mpd', default=True, action='store_true', dest='dynamic_mpd', default=True,