[extractor/youtube] Fix continuation loop with no comments (#7148)

Deep-check the comment continuation response for incomplete data, since YouTube sometimes returns a continuation with no comments.

Authored by: coletdjnz
This commit is contained in:
coletdjnz 2023-05-31 19:08:28 +12:00 committed by GitHub
parent c2502cfed9
commit 18f8fba7c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -3314,7 +3314,7 @@ def extract_header(contents):
expected_comment_count = self._get_count( expected_comment_count = self._get_count(
comments_header_renderer, 'countText', 'commentsCount') comments_header_renderer, 'countText', 'commentsCount')
if expected_comment_count: if expected_comment_count is not None:
tracker['est_total'] = expected_comment_count tracker['est_total'] = expected_comment_count
self.to_screen(f'Downloading ~{expected_comment_count} comments') self.to_screen(f'Downloading ~{expected_comment_count} comments')
comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top
@ -3385,7 +3385,7 @@ def extract_thread(contents):
if not tracker: if not tracker:
tracker = dict( tracker = dict(
running_total=0, running_total=0,
est_total=0, est_total=None,
current_page_thread=0, current_page_thread=0,
total_parent_comments=0, total_parent_comments=0,
total_reply_comments=0, total_reply_comments=0,
@ -3418,11 +3418,13 @@ def extract_thread(contents):
continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id)) continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
is_forced_continuation = True is_forced_continuation = True
continuation_items_path = (
'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems')
for page_num in itertools.count(0): for page_num in itertools.count(0):
if not continuation: if not continuation:
break break
headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response)) headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})" comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})"
if page_num == 0: if page_num == 0:
if is_first_continuation: if is_first_continuation:
note_prefix = 'Downloading comment section API JSON' note_prefix = 'Downloading comment section API JSON'
@ -3433,11 +3435,18 @@ def extract_thread(contents):
note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
' ' if parent else '', ' replies' if parent else '', ' ' if parent else '', ' replies' if parent else '',
page_num, comment_prog_str) page_num, comment_prog_str)
# Do a deep check for incomplete data, as YouTube sometimes returns no comments for a continuation.
# Skip the check if YouTube reports a comment count of 0 and no comments have been downloaded yet.
check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
try: try:
response = self._extract_response( response = self._extract_response(
item_id=None, query=continuation, item_id=None, query=continuation,
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) check_get_keys=check_get_keys)
except ExtractorError as e: except ExtractorError as e:
# Ignore incomplete data error for replies if retries didn't work. # Ignore incomplete data error for replies if retries didn't work.
# This is to allow any other parent comments and comment threads to be downloaded. # This is to allow any other parent comments and comment threads to be downloaded.
@ -3449,15 +3458,8 @@ def extract_thread(contents):
else: else:
raise raise
is_forced_continuation = False is_forced_continuation = False
continuation_contents = traverse_obj(
response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
continuation = None continuation = None
for continuation_section in continuation_contents: for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
continuation_items = traverse_obj(
continuation_section,
(('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
get_all=False, expected_type=list) or []
if is_first_continuation: if is_first_continuation:
continuation = extract_header(continuation_items) continuation = extract_header(continuation_items)
is_first_continuation = False is_first_continuation = False