From a2160aa45f4019e02ced01c9030aa9519b40b24f Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Tue, 12 Oct 2021 15:20:50 +0530
Subject: [PATCH] [extractor] Generalize `getcomments` implementation

---
 yt_dlp/extractor/bannedvideo.py | 17 +++++------------
 yt_dlp/extractor/common.py      | 26 ++++++++++++++++++++++++++
 yt_dlp/extractor/youtube.py     | 37 +++++++------------------------------
 3 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/yt_dlp/extractor/bannedvideo.py b/yt_dlp/extractor/bannedvideo.py
index 8f8f5ef5f..3db1151f6 100644
--- a/yt_dlp/extractor/bannedvideo.py
+++ b/yt_dlp/extractor/bannedvideo.py
@@ -97,21 +97,16 @@ def _call_api(self, video_id, id, operation, note):
             'query': self._GRAPHQL_QUERIES[operation]
         }).encode('utf8')).get('data')

-    def _extract_comments(self, video_id, comments, comment_data):
+    def _get_comments(self, video_id, comments, comment_data):
+        yield from comments
         for comment in comment_data.copy():
             comment_id = comment.get('_id')
             if comment.get('replyCount') > 0:
                 reply_json = self._call_api(
                     video_id, comment_id, 'GetCommentReplies',
                     f'Downloading replies for comment {comment_id}')
-                comments.extend(
-                    self._parse_comment(reply, comment_id)
-                    for reply in reply_json.get('getCommentReplies'))
-
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+                for reply in reply_json.get('getCommentReplies'):
+                    yield self._parse_comment(reply, comment_id)

     @staticmethod
     def _parse_comment(comment_data, parent):
@@ -159,7 +154,5 @@ def _real_extract(self, url):
             'tags': [tag.get('name') for tag in video_info.get('tags')],
             'availability': self._availability(is_unlisted=video_info.get('unlisted')),
             'comments': comments,
-            '__post_extractor': (
-                (lambda: self._extract_comments(video_id, comments, video_json.get('getVideoComments')))
-                if self.get_param('getcomments') else None)
+            '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments'))
         }
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d02a808b6..5b7b8891a 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3502,6 +3502,32 @@ def extract_subtitles(self, *args, **kwargs):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')

+    def extract_comments(self, *args, **kwargs):
+        if not self.get_param('getcomments'):
+            return None
+        generator = self._get_comments(*args, **kwargs)
+
+        def extractor():
+            comments = []
+            try:
+                while True:
+                    comments.append(next(generator))
+            except KeyboardInterrupt:
+                interrupted = True
+                self.to_screen('Interrupted by user')
+            except StopIteration:
+                interrupted = False
+            comment_count = len(comments)
+            self.to_screen(f'Extracted {comment_count} comments')
+            return {
+                'comments': comments,
+                'comment_count': None if interrupted else comment_count
+            }
+        return extractor
+
+    def _get_comments(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
     @staticmethod
     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
         """ Merge subtitle items for one language. Items with duplicated URLs
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 41fd0aef7..3e93c9934 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2241,7 +2241,6 @@ def _extract_comment(self, comment_renderer, parent=None):

     def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
         def extract_header(contents):
-            _total_comments = 0
             _continuation = None
             for content in contents:
                 comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
@@ -2251,7 +2250,6 @@ def extract_header(contents):
                 if expected_comment_count:
                     comment_counts[1] = expected_comment_count
                     self.to_screen('Downloading ~%d comments' % expected_comment_count)
-                    _total_comments = comment_counts[1]
                 sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                 comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top

@@ -2271,7 +2269,7 @@ def extract_header(contents):
                     sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                 self.to_screen('Sorting comments by %s' % sort_text)
                 break
-            return _total_comments, _continuation
+            return _continuation

         def extract_thread(contents):
             if not parent:
@@ -2359,9 +2357,7 @@ def extract_thread(contents):
                          lambda x: x['appendContinuationItemsAction']['continuationItems']),
                         list) or []
                     if is_first_continuation:
-                        total_comments, continuation = extract_header(continuation_items)
-                        if total_comments:
-                            yield total_comments
+                        continuation = extract_header(continuation_items)
                         is_first_continuation = False
                         if continuation:
                             break
@@ -2389,9 +2385,7 @@ def extract_thread(contents):
                         continue
                     if is_first_continuation:
                         header_continuation_items = [continuation_renderer.get('header') or {}]
-                        total_comments, continuation = extract_header(header_continuation_items)
-                        if total_comments:
-                            yield total_comments
+                        continuation = extract_header(header_continuation_items)
                         is_first_continuation = False
                         if continuation:
                             break
@@ -2419,35 +2413,19 @@ def _generate_comment_continuation(video_id):
             [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
         return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')

-    def _extract_comments(self, ytcfg, video_id, contents, webpage):
+    def _get_comments(self, ytcfg, video_id, contents, webpage):
         """Entry for comment extraction"""
         def _real_comment_extract(contents):
             yield from self._comment_entries(
                 traverse_obj(contents, (..., 'itemSectionRenderer'), get_all=False), ytcfg, video_id)

-        comments = []
-        estimated_total = 0
-        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
+        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
         # Force English regardless of account setting to prevent parsing issues
         # See: https://github.com/yt-dlp/yt-dlp/issues/532
         ytcfg = copy.deepcopy(ytcfg)
         traverse_obj(
             ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
-        try:
-            for comment in _real_comment_extract(contents):
-                if len(comments) >= max_comments:
-                    break
-                if isinstance(comment, int):
-                    estimated_total = comment
-                    continue
-                comments.append(comment)
-        except KeyboardInterrupt:
-            self.to_screen('Interrupted by user')
-        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+        return itertools.islice(_real_comment_extract(contents), 0, max_comments)

     @staticmethod
     def _get_checkok_params():
@@ -3209,8 +3187,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
             needs_auth=info['age_limit'] >= 18,
             is_unlisted=None if is_private is None else is_unlisted)

-        if self.get_param('getcomments', False):
-            info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
+        info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage)

         self.mark_watched(video_id, player_responses)
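
Below is a minimal sketch, not part of the commit, of how an extractor plugs into the generalized interface added to common.py above: the subclass implements `_get_comments` as a generator and assigns the callable returned by `extract_comments` to `__post_extractor`, mirroring the bannedvideo.py and youtube.py changes in this patch. The extractor class, URL pattern, API endpoint and response shape are hypothetical and shown only for illustration.

# Hypothetical example extractor; ExampleCommentsIE, the URL pattern and the
# comments API endpoint are made up to illustrate the generalized interface.
import itertools

from .common import InfoExtractor


class ExampleCommentsIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\w+)'

    def _get_comments(self, video_id):
        # Called lazily by extract_comments(); yield one comment dict at a time
        # so that an interrupted run still returns the comments fetched so far.
        for page in itertools.count(1):
            data = self._download_json(
                f'https://example.com/api/comments/{video_id}', video_id,
                note=f'Downloading comment page {page}', query={'page': page}) or {}
            for comment in data.get('comments') or []:
                yield {
                    'id': comment.get('id'),
                    'text': comment.get('text'),
                    'author': comment.get('author'),
                    'parent': comment.get('parent') or 'root',
                }
            if not data.get('has_more'):
                break

    def _real_extract(self, url):
        video_id = self._match_id(url)
        return {
            'id': video_id,
            'title': self._og_search_title(self._download_webpage(url, video_id)),
            # extract_comments() returns None unless comment extraction was
            # requested, so the field can be set unconditionally.
            '__post_extractor': self.extract_comments(video_id),
        }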