Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-11-21 20:46:36 -05:00)

[extractor] Generalize getcomments implementation

commit a2160aa45f (parent cc16383ff3)
3 changed files with 38 additions and 42 deletions

yt_dlp/extractor/trovo.py

@@ -97,21 +97,16 @@ def _call_api(self, video_id, id, operation, note):
             'query': self._GRAPHQL_QUERIES[operation]
         }).encode('utf8')).get('data')
 
-    def _extract_comments(self, video_id, comments, comment_data):
+    def _get_comments(self, video_id, comments, comment_data):
+        yield from comments
         for comment in comment_data.copy():
             comment_id = comment.get('_id')
             if comment.get('replyCount') > 0:
                 reply_json = self._call_api(
                     video_id, comment_id, 'GetCommentReplies',
                     f'Downloading replies for comment {comment_id}')
-                comments.extend(
-                    self._parse_comment(reply, comment_id)
-                    for reply in reply_json.get('getCommentReplies'))
-
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+                for reply in reply_json.get('getCommentReplies'):
+                    yield self._parse_comment(reply, comment_id)
 
     @staticmethod
     def _parse_comment(comment_data, parent):

@@ -159,7 +154,5 @@ def _real_extract(self, url):
             'tags': [tag.get('name') for tag in video_info.get('tags')],
             'availability': self._availability(is_unlisted=video_info.get('unlisted')),
             'comments': comments,
-            '__post_extractor': (
-                (lambda: self._extract_comments(video_id, comments, video_json.get('getVideoComments')))
-                if self.get_param('getcomments') else None)
+            '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments'))
         }
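
The Trovo extractor's comment fetching now has the shape of a plain generator: already-parsed comments are yielded first, then replies are requested per comment and yielded as they are parsed, instead of being accumulated into a list and wrapped in a result dict. A minimal, self-contained sketch of that shape follows; fetch_replies and get_comments are illustrative stand-ins (not yt-dlp code), with fetch_replies playing the role of the extractor's _call_api(..., 'GetCommentReplies', ...) call.

# Illustrative stand-ins only; just the generator shape from the diff above.
def fetch_replies(comment_id):
    # pretend remote call returning raw reply objects
    return [{'_id': f'{comment_id}-r{i}', 'text': 'reply'} for i in range(2)]

def get_comments(parsed_comments, raw_comments):
    yield from parsed_comments              # comments parsed earlier
    for comment in raw_comments:
        if comment.get('replyCount', 0) > 0:
            for reply in fetch_replies(comment['_id']):
                yield reply                 # yield lazily instead of list.extend()

print(list(get_comments([{'id': 'c1'}], [{'_id': 'c1', 'replyCount': 2}])))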

yt_dlp/extractor/common.py

@@ -3502,6 +3502,32 @@ def extract_subtitles(self, *args, **kwargs):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def extract_comments(self, *args, **kwargs):
+        if not self.get_param('getcomments'):
+            return None
+        generator = self._get_comments(*args, **kwargs)
+
+        def extractor():
+            comments = []
+            try:
+                while True:
+                    comments.append(next(generator))
+            except KeyboardInterrupt:
+                interrupted = True
+                self.to_screen('Interrupted by user')
+            except StopIteration:
+                interrupted = False
+            comment_count = len(comments)
+            self.to_screen(f'Extracted {comment_count} comments')
+            return {
+                'comments': comments,
+                'comment_count': None if interrupted else comment_count
+            }
+        return extractor
+
+    def _get_comments(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
     @staticmethod
     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
         """ Merge subtitle items for one language. Items with duplicated URLs
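
The new base-class helper is what extractors now delegate to: extract_comments returns a callable (or None when the getcomments parameter is not set), and that callable drains the subclass's _get_comments generator with next(), so a Ctrl-C keeps whatever comments were already fetched and comment_count is reported as None to mark the partial result. Below is a stripped-down, standalone sketch of that draining logic (drain_comments is a hypothetical name; only the control flow mirrors the patch).

def drain_comments(generator):
    def extractor():
        comments = []
        try:
            while True:
                comments.append(next(generator))
        except KeyboardInterrupt:
            interrupted = True
            print('Interrupted by user')
        except StopIteration:
            interrupted = False
        comment_count = len(comments)
        print(f'Extracted {comment_count} comments')
        return {
            'comments': comments,
            # None marks an unknown/partial count after an interrupt
            'comment_count': None if interrupted else comment_count,
        }
    return extractor

result = drain_comments(c for c in ({'id': 1}, {'id': 2}, {'id': 3}))()
assert result['comment_count'] == 3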

yt_dlp/extractor/youtube.py

@@ -2241,7 +2241,6 @@ def _extract_comment(self, comment_renderer, parent=None):
     def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
 
         def extract_header(contents):
-            _total_comments = 0
             _continuation = None
             for content in contents:
                 comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])

@@ -2251,7 +2250,6 @@ def extract_header(contents):
                 if expected_comment_count:
                     comment_counts[1] = expected_comment_count
                     self.to_screen('Downloading ~%d comments' % expected_comment_count)
-                    _total_comments = comment_counts[1]
                 sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
                 comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top
 

@@ -2271,7 +2269,7 @@ def extract_header(contents):
                     sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
                 self.to_screen('Sorting comments by %s' % sort_text)
                 break
-            return _total_comments, _continuation
+            return _continuation
 
         def extract_thread(contents):
             if not parent:

@@ -2359,9 +2357,7 @@ def extract_thread(contents):
                     lambda x: x['appendContinuationItemsAction']['continuationItems']),
                 list) or []
             if is_first_continuation:
-                total_comments, continuation = extract_header(continuation_items)
-                if total_comments:
-                    yield total_comments
+                continuation = extract_header(continuation_items)
                 is_first_continuation = False
                 if continuation:
                     break

@@ -2389,9 +2385,7 @@ def extract_thread(contents):
                     continue
                 if is_first_continuation:
                     header_continuation_items = [continuation_renderer.get('header') or {}]
-                    total_comments, continuation = extract_header(header_continuation_items)
-                    if total_comments:
-                        yield total_comments
+                    continuation = extract_header(header_continuation_items)
                     is_first_continuation = False
                     if continuation:
                         break

@@ -2419,35 +2413,19 @@ def _generate_comment_continuation(video_id):
             [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
        return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
 
-    def _extract_comments(self, ytcfg, video_id, contents, webpage):
+    def _get_comments(self, ytcfg, video_id, contents, webpage):
         """Entry for comment extraction"""
         def _real_comment_extract(contents):
             yield from self._comment_entries(
                 traverse_obj(contents, (..., 'itemSectionRenderer'), get_all=False), ytcfg, video_id)
 
-        comments = []
-        estimated_total = 0
-        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
+        max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
         # Force English regardless of account setting to prevent parsing issues
         # See: https://github.com/yt-dlp/yt-dlp/issues/532
         ytcfg = copy.deepcopy(ytcfg)
         traverse_obj(
             ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
-        try:
-            for comment in _real_comment_extract(contents):
-                if len(comments) >= max_comments:
-                    break
-                if isinstance(comment, int):
-                    estimated_total = comment
-                    continue
-                comments.append(comment)
-        except KeyboardInterrupt:
-            self.to_screen('Interrupted by user')
-        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
-        return {
-            'comments': comments,
-            'comment_count': len(comments),
-        }
+        return itertools.islice(_real_comment_extract(contents), 0, max_comments)
 
     @staticmethod
     def _get_checkok_params():

@@ -3209,8 +3187,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
             needs_auth=info['age_limit'] >= 18,
             is_unlisted=None if is_private is None else is_unlisted)
 
-        if self.get_param('getcomments', False):
-            info['__post_extractor'] = lambda: self._extract_comments(master_ytcfg, video_id, contents, webpage)
+        info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage)
 
         self.mark_watched(video_id, player_responses)
 
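
On the YouTube side, _get_comments now returns a sliced generator rather than looping with a manual counter, so the float('inf') sentinel and the isinstance(comment, int) channel for the estimated total both disappear. itertools.islice accepts None as its stop value (meaning no limit), and int_or_none of an empty string is None, so the single islice call covers both the capped and the unlimited case. A small standalone sketch of that capping behaviour (capped and the simplified int_or_none below are illustrative stand-ins, not the yt-dlp helpers):

import itertools

def int_or_none(value):
    # simplified stand-in for yt_dlp.utils.int_or_none
    try:
        return int(value)
    except (TypeError, ValueError):
        return None

def capped(generator, raw_max):
    max_comments = int_or_none(raw_max)                   # None when the arg is empty
    return itertools.islice(generator, 0, max_comments)   # stop=None means no cap

print(list(capped((f'comment {i}' for i in range(10)), '3')))       # first 3 only
print(len(list(capped((f'comment {i}' for i in range(10)), ''))))   # all 10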