[extractor/youtube] Fix continuation loop with no comments (#7148)

Deep check the response for incomplete data.

Authored by: coletdjnz
2024-11-21 20:46:36 -05:00 · 2023-05-31 19:08:28 +12:00 · 2023-05-31 19:08:28 +12:00 · 18f8fba7c8
commit 18f8fba7c8
parent c2502cfed9
1 changed file with 14 additions and 12 deletions
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3314,7 +3314,7 @@ def extract_header(contents):
                expected_comment_count = self._get_count(
                    comments_header_renderer, 'countText', 'commentsCount')
-                if expected_comment_count:
+                if expected_comment_count is not None:
                    tracker['est_total'] = expected_comment_count
                    self.to_screen(f'Downloading ~{expected_comment_count} comments')
                comment_sort_index = int(get_single_config_arg('comment_sort') != 'top')  # 1 = new, 0 = top
@@ -3385,7 +3385,7 @@ def extract_thread(contents):
        if not tracker:
            tracker = dict(
                running_total=0,
-                est_total=0,
+                est_total=None,
                current_page_thread=0,
                total_parent_comments=0,
                total_reply_comments=0,
@@ -3418,11 +3418,13 @@ def extract_thread(contents):
            continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
            is_forced_continuation = True
+        continuation_items_path = (
+            'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems')
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
-            comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
+            comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})"
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
@@ -3433,11 +3435,18 @@ def extract_thread(contents):
                note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)
+            # Do a deep check for incomplete data as sometimes YouTube may return no comments for a continuation
+            # Ignore check if YouTube says the comment count is 0.
+            check_get_keys = None
+            if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
+                check_get_keys = [[*continuation_items_path, ..., (
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
            try:
                response = self._extract_response(
                    item_id=None, query=continuation,
                    ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
-                    check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None)
+                    check_get_keys=check_get_keys)
            except ExtractorError as e:
                # Ignore incomplete data error for replies if retries didn't work.
                # This is to allow any other parent comments and comment threads to be downloaded.
@@ -3449,15 +3458,8 @@ def extract_thread(contents):
                else:
                    raise
            is_forced_continuation = False
-            continuation_contents = traverse_obj(
-                response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
            continuation = None
-            for continuation_section in continuation_contents:
+            for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
-                continuation_items = traverse_obj(
-                    continuation_section,
-                    (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
-                    get_all=False, expected_type=list) or []
                if is_first_continuation:
                    continuation = extract_header(continuation_items)
                    is_first_continuation = False