mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-21 20:46:36 -05:00
[extractor/youtube:tab] Fix video metadata from tabs (#5489)
Closes #5488. Authored by: coletdjnz
This commit is contained in:
parent
495322b95b
commit
4dc23a8051
1 changed file with 84 additions and 12 deletions
|
@ -912,7 +912,12 @@ def is_music_url(url):
|
||||||
|
|
||||||
def _extract_video(self, renderer):
|
def _extract_video(self, renderer):
|
||||||
video_id = renderer.get('videoId')
|
video_id = renderer.get('videoId')
|
||||||
title = self._get_text(renderer, 'title')
|
|
||||||
|
reel_header_renderer = traverse_obj(renderer, (
|
||||||
|
'navigationEndpoint', 'reelWatchEndpoint', 'overlay', 'reelPlayerOverlayRenderer',
|
||||||
|
'reelPlayerHeaderSupportedRenderers', 'reelPlayerHeaderRenderer'))
|
||||||
|
|
||||||
|
title = self._get_text(renderer, 'title', 'headline') or self._get_text(reel_header_renderer, 'reelTitleText')
|
||||||
description = self._get_text(renderer, 'descriptionSnippet')
|
description = self._get_text(renderer, 'descriptionSnippet')
|
||||||
|
|
||||||
duration = int_or_none(renderer.get('lengthSeconds'))
|
duration = int_or_none(renderer.get('lengthSeconds'))
|
||||||
|
@ -920,24 +925,23 @@ def _extract_video(self, renderer):
|
||||||
duration = parse_duration(self._get_text(
|
duration = parse_duration(self._get_text(
|
||||||
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
|
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
|
||||||
if duration is None:
|
if duration is None:
|
||||||
|
# XXX: should write a parser to be more general to support more cases (e.g. shorts in shorts tab)
|
||||||
duration = parse_duration(self._search_regex(
|
duration = parse_duration(self._search_regex(
|
||||||
r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
|
r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
|
||||||
traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
|
traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
|
||||||
video_id, default=None, group='duration'))
|
video_id, default=None, group='duration'))
|
||||||
|
|
||||||
# videoInfo is a string like '50K views • 10 years ago'.
|
|
||||||
view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo')
|
|
||||||
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
|
|
||||||
channel_id = traverse_obj(
|
channel_id = traverse_obj(
|
||||||
renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
|
renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
|
||||||
expected_type=str, get_all=False)
|
expected_type=str, get_all=False)
|
||||||
time_text = self._get_text(renderer, 'publishedTimeText', 'videoInfo') or ''
|
if not channel_id:
|
||||||
scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
|
channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId'))
|
||||||
|
|
||||||
overlay_style = traverse_obj(
|
overlay_style = traverse_obj(
|
||||||
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
|
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
|
||||||
get_all=False, expected_type=str)
|
get_all=False, expected_type=str)
|
||||||
badges = self._extract_badges(renderer)
|
badges = self._extract_badges(renderer)
|
||||||
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
|
|
||||||
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
|
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
|
||||||
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
|
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
|
||||||
expected_type=str)) or ''
|
expected_type=str)) or ''
|
||||||
|
@ -945,12 +949,22 @@ def _extract_video(self, renderer):
|
||||||
if overlay_style == 'SHORTS' or '/shorts/' in navigation_url:
|
if overlay_style == 'SHORTS' or '/shorts/' in navigation_url:
|
||||||
url = f'https://www.youtube.com/shorts/{video_id}'
|
url = f'https://www.youtube.com/shorts/{video_id}'
|
||||||
|
|
||||||
|
time_text = (self._get_text(renderer, 'publishedTimeText', 'videoInfo')
|
||||||
|
or self._get_text(reel_header_renderer, 'timestampText') or '')
|
||||||
|
scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
|
||||||
|
|
||||||
live_status = (
|
live_status = (
|
||||||
'is_upcoming' if scheduled_timestamp is not None
|
'is_upcoming' if scheduled_timestamp is not None
|
||||||
else 'was_live' if 'streamed' in time_text.lower()
|
else 'was_live' if 'streamed' in time_text.lower()
|
||||||
else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW)
|
else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW)
|
||||||
else None)
|
else None)
|
||||||
|
|
||||||
|
# videoInfo is a string like '50K views • 10 years ago'.
|
||||||
|
view_count_text = self._get_text(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo') or ''
|
||||||
|
view_count = (0 if 'no views' in view_count_text.lower()
|
||||||
|
else self._get_count({'simpleText': view_count_text}))
|
||||||
|
view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count'
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'_type': 'url',
|
'_type': 'url',
|
||||||
'ie_key': YoutubeIE.ie_key(),
|
'ie_key': YoutubeIE.ie_key(),
|
||||||
|
@ -959,9 +973,11 @@ def _extract_video(self, renderer):
|
||||||
'title': title,
|
'title': title,
|
||||||
'description': description,
|
'description': description,
|
||||||
'duration': duration,
|
'duration': duration,
|
||||||
'uploader': uploader,
|
|
||||||
'channel_id': channel_id,
|
'channel_id': channel_id,
|
||||||
'thumbnails': thumbnails,
|
'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText')
|
||||||
|
or self._get_text(reel_header_renderer, 'channelTitleText')),
|
||||||
|
'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
|
||||||
|
'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
|
||||||
'timestamp': (self._parse_time_text(time_text)
|
'timestamp': (self._parse_time_text(time_text)
|
||||||
if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
|
if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
|
||||||
else None),
|
else None),
|
||||||
|
@ -973,7 +989,7 @@ def _extract_video(self, renderer):
|
||||||
needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
|
needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
|
||||||
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
|
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
|
||||||
is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None),
|
is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None),
|
||||||
'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count': view_count,
|
view_count_field: view_count,
|
||||||
'live_status': live_status
|
'live_status': live_status
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5484,7 +5500,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
|
||||||
'title': '#cctv9',
|
'title': '#cctv9',
|
||||||
'tags': [],
|
'tags': [],
|
||||||
},
|
},
|
||||||
'playlist_mincount': 350,
|
'playlist_mincount': 300, # not consistent but should be over 300
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
|
'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
|
@ -5671,7 +5687,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
|
||||||
'tags': [],
|
'tags': [],
|
||||||
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||||
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||||
'description': '',
|
'description': 'test description',
|
||||||
'title': 'cole-dlp-test-acc - 再生リスト',
|
'title': 'cole-dlp-test-acc - 再生リスト',
|
||||||
'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||||
'uploader': 'cole-dlp-test-acc',
|
'uploader': 'cole-dlp-test-acc',
|
||||||
|
@ -5828,6 +5844,62 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
|
||||||
'tags': [],
|
'tags': [],
|
||||||
},
|
},
|
||||||
'playlist_mincount': 30,
|
'playlist_mincount': 30,
|
||||||
|
}, {
|
||||||
|
# Shorts url result in shorts tab
|
||||||
|
'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'title': 'cole-dlp-test-acc - Shorts',
|
||||||
|
'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'channel': 'cole-dlp-test-acc',
|
||||||
|
'channel_follower_count': int,
|
||||||
|
'description': 'test description',
|
||||||
|
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'tags': [],
|
||||||
|
'uploader': 'cole-dlp-test-acc',
|
||||||
|
'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
|
||||||
|
},
|
||||||
|
'playlist': [{
|
||||||
|
'info_dict': {
|
||||||
|
'_type': 'url',
|
||||||
|
'ie_key': 'Youtube',
|
||||||
|
'url': 'https://www.youtube.com/shorts/sSM9J5YH_60',
|
||||||
|
'id': 'sSM9J5YH_60',
|
||||||
|
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'title': 'SHORT short',
|
||||||
|
'channel': 'cole-dlp-test-acc',
|
||||||
|
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||||
|
'view_count': int,
|
||||||
|
'thumbnails': list,
|
||||||
|
}
|
||||||
|
}],
|
||||||
|
'params': {'extract_flat': True},
|
||||||
|
}, {
|
||||||
|
# Live video status should be extracted
|
||||||
|
'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'UCQvWX73GQygcwXOTSf_VDVg',
|
||||||
|
'title': 'UCQvWX73GQygcwXOTSf_VDVg - Live', # TODO, should be Minecraft - Live or Minecraft - Topic - Live
|
||||||
|
'tags': []
|
||||||
|
},
|
||||||
|
'playlist': [{
|
||||||
|
'info_dict': {
|
||||||
|
'_type': 'url',
|
||||||
|
'ie_key': 'Youtube',
|
||||||
|
'url': 'startswith:https://www.youtube.com/watch?v=',
|
||||||
|
'id': str,
|
||||||
|
'title': str,
|
||||||
|
'live_status': 'is_live',
|
||||||
|
'channel_id': str,
|
||||||
|
'channel_url': str,
|
||||||
|
'concurrent_view_count': int,
|
||||||
|
'channel': str,
|
||||||
|
}
|
||||||
|
}],
|
||||||
|
'params': {'extract_flat': True},
|
||||||
|
'playlist_mincount': 1
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
Loading…
Reference in a new issue