[extractor/youtube] Download post_live videos from start (#5091)

* The fragments are generated as a `LazyList`, so only the required formats are expanded during download, but all fragment lists are still printed/written in the infojson.
* The m3u8 formats which cannot be downloaded from the start are not extracted by default, but their extraction can be enabled with an extractor-arg. The extractor-arg `include_live_dash` is renamed to `include_incomplete_formats` to account for this new use-case.

Closes #1564
Authored by: Lesmiscore, pukkandan
This commit is contained in:
Lesmiscore 2022-10-04 11:48:31 +09:00 committed by GitHub
parent dd4411aac2
commit 4d37720a0c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 98 additions and 63 deletions

View file

@ -1704,7 +1704,7 @@ #### youtube
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8)
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests * `innertube_key`: Innertube API key to use for all API requests

View file

@ -24,6 +24,7 @@
from ..utils import ( from ..utils import (
NO_DEFAULT, NO_DEFAULT,
ExtractorError, ExtractorError,
LazyList,
UserNotLive, UserNotLive,
bug_reports_message, bug_reports_message,
classproperty, classproperty,
@ -2493,10 +2494,8 @@ def __init__(self, *args, **kwargs):
self._code_cache = {} self._code_cache = {}
self._player_cache = {} self._player_cache = {}
def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live):
lock = threading.Lock() lock = threading.Lock()
is_live = True
start_time = time.time() start_time = time.time()
formats = [f for f in formats if f.get('is_from_start')] formats = [f for f in formats if f.get('is_from_start')]
@ -2511,7 +2510,8 @@ def refetch_manifest(format_id, delay):
microformats = traverse_obj( microformats = traverse_obj(
prs, (..., 'microformat', 'playerMicroformatRenderer'), prs, (..., 'microformat', 'playerMicroformatRenderer'),
expected_type=dict, default=[]) expected_type=dict, default=[])
_, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
is_live = live_status == 'is_live'
start_time = time.time() start_time = time.time()
def mpd_feed(format_id, delay): def mpd_feed(format_id, delay):
@ -2532,12 +2532,17 @@ def mpd_feed(format_id, delay):
return f['manifest_url'], f['manifest_stream_number'], is_live return f['manifest_url'], f['manifest_stream_number'], is_live
for f in formats: for f in formats:
f['is_live'] = True f['is_live'] = is_live
gen = functools.partial(self._live_dash_fragments, video_id, f['format_id'],
live_start_time, mpd_feed, not is_live and f.copy())
if is_live:
f['fragments'] = gen
f['protocol'] = 'http_dash_segments_generator' f['protocol'] = 'http_dash_segments_generator'
f['fragments'] = functools.partial( else:
self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) f['fragments'] = LazyList(gen({}))
del f['is_from_start']
def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, manifestless_orig_fmt, ctx):
FETCH_SPAN, MAX_DURATION = 5, 432000 FETCH_SPAN, MAX_DURATION = 5, 432000
mpd_url, stream_number, is_live = None, None, True mpd_url, stream_number, is_live = None, None, True
@ -2568,6 +2573,9 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
return False, last_seq return False, last_seq
elif old_mpd_url == mpd_url: elif old_mpd_url == mpd_url:
return True, last_seq return True, last_seq
if manifestless_orig_fmt:
fmt_info = manifestless_orig_fmt
else:
try: try:
fmts, _ = self._extract_mpd_formats_and_subtitles( fmts, _ = self._extract_mpd_formats_and_subtitles(
mpd_url, None, note=False, errnote=False, fatal=False) mpd_url, None, note=False, errnote=False, fatal=False)
@ -2584,6 +2592,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
_last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
return True, _last_seq return True, _last_seq
self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
while is_live: while is_live:
fetch_time = time.time() fetch_time = time.time()
if no_fragment_score > 30: if no_fragment_score > 30:
@ -2637,6 +2646,11 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
except ExtractorError: except ExtractorError:
continue continue
if manifestless_orig_fmt:
# Stop at the first iteration if running for post-live manifestless;
# the fragment count no longer increases once it has started
break
time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) time.sleep(max(0, FETCH_SPAN + fetch_time - time.time()))
def _extract_player_url(self, *ytcfgs, webpage=None): def _extract_player_url(self, *ytcfgs, webpage=None):
@ -3397,7 +3411,12 @@ def append_client(*client_names):
self.report_warning(last_error) self.report_warning(last_error)
return prs, player_url return prs, player_url
def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration): def _needs_live_processing(self, live_status, duration):
if (live_status == 'is_live' and self.get_param('live_from_start')
or live_status == 'post_live' and (duration or 0) > 4 * 3600):
return live_status
def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
itags, stream_ids = {}, [] itags, stream_ids = {}, []
itag_qualities, res_qualities = {}, {0: None} itag_qualities, res_qualities = {}, {0: None}
q = qualities([ q = qualities([
@ -3544,15 +3563,22 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i
dct['container'] = dct['ext'] + '_dash' dct['container'] = dct['ext'] + '_dash'
yield dct yield dct
live_from_start = is_live and self.get_param('live_from_start') needs_live_processing = self._needs_live_processing(live_status, duration)
skip_manifests = self._configuration_arg('skip') skip_bad_formats = not self._configuration_arg('include_incomplete_formats')
if not self.get_param('youtube_include_hls_manifest', True):
skip_manifests.append('hls') skip_manifests = set(self._configuration_arg('skip'))
if (not self.get_param('youtube_include_hls_manifest', True)
or needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway
or needs_live_processing and skip_bad_formats):
skip_manifests.add('hls')
if not self.get_param('youtube_include_dash_manifest', True): if not self.get_param('youtube_include_dash_manifest', True):
skip_manifests.append('dash') skip_manifests.add('dash')
get_dash = 'dash' not in skip_manifests and ( if self._configuration_arg('include_live_dash'):
not is_live or live_from_start or self._configuration_arg('include_live_dash')) self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. '
get_hls = not live_from_start and 'hls' not in skip_manifests 'Use include_incomplete_formats extractor argument instead')
elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
skip_manifests.add('dash')
def process_manifest_format(f, proto, itag): def process_manifest_format(f, proto, itag):
if itag in itags: if itag in itags:
@ -3570,16 +3596,17 @@ def process_manifest_format(f, proto, itag):
subtitles = {} subtitles = {}
for sd in streaming_data: for sd in streaming_data:
hls_manifest_url = get_hls and sd.get('hlsManifestUrl') hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
if hls_manifest_url: if hls_manifest_url:
fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live) fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
subtitles = self._merge_subtitles(subs, subtitles) subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts: for f in fmts:
if process_manifest_format(f, 'hls', self._search_regex( if process_manifest_format(f, 'hls', self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)): r'/itag/(\d+)', f['url'], 'itag', default=None)):
yield f yield f
dash_manifest_url = get_dash and sd.get('dashManifestUrl') dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
if dash_manifest_url: if dash_manifest_url:
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
@ -3587,7 +3614,7 @@ def process_manifest_format(f, proto, itag):
if process_manifest_format(f, 'dash', f['format_id']): if process_manifest_format(f, 'dash', f['format_id']):
f['filesize'] = int_or_none(self._search_regex( f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
if live_from_start: if needs_live_processing:
f['is_from_start'] = True f['is_from_start'] = True
yield f yield f
@ -3653,11 +3680,23 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
is_live = get_first(video_details, 'isLive') is_live = get_first(video_details, 'isLive')
if is_live is None: if is_live is None:
is_live = get_first(live_broadcast_details, 'isLiveNow') is_live = get_first(live_broadcast_details, 'isLiveNow')
live_content = get_first(video_details, 'isLiveContent')
is_upcoming = get_first(video_details, 'isUpcoming')
if is_live is None and is_upcoming or live_content is False:
is_live = False
if is_upcoming is None and (live_content or is_live):
is_upcoming = False
post_live = get_first(video_details, 'isPostLiveDvr')
live_status = ('post_live' if post_live
else 'is_live' if is_live
else 'is_upcoming' if is_upcoming
else None if None in (is_live, is_upcoming, live_content)
else 'was_live' if live_content else 'not_live')
streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
*formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration) *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration)
return live_broadcast_details, is_live, streaming_data, formats, subtitles return live_broadcast_details, live_status, streaming_data, formats, subtitles
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
@ -3749,8 +3788,10 @@ def feed_entry(name):
or get_first(microformats, 'lengthSeconds') or get_first(microformats, 'lengthSeconds')
or parse_duration(search_meta('duration'))) or None or parse_duration(search_meta('duration'))) or None
live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \ live_broadcast_details, live_status, streaming_data, formats, automatic_captions = \
self._list_formats(video_id, microformats, video_details, player_responses, player_url) self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration)
if live_status == 'post_live':
self.write_debug(f'{video_id}: Video is in Post-Live Manifestless mode')
if not formats: if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@ -3809,7 +3850,7 @@ def feed_entry(name):
thumbnails.extend({ thumbnails.extend({
'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
video_id=video_id, name=name, ext=ext, video_id=video_id, name=name, ext=ext,
webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''), webp='_webp' if ext == 'webp' else '', live='_live' if live_status == 'is_live' else ''),
} for name in thumbnail_names for ext in ('webp', 'jpg')) } for name in thumbnail_names for ext in ('webp', 'jpg'))
for thumb in thumbnails: for thumb in thumbnails:
i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
@ -3824,20 +3865,27 @@ def feed_entry(name):
or search_meta('channelId')) or search_meta('channelId'))
owner_profile_url = get_first(microformats, 'ownerProfileUrl') owner_profile_url = get_first(microformats, 'ownerProfileUrl')
live_content = get_first(video_details, 'isLiveContent')
is_upcoming = get_first(video_details, 'isUpcoming')
if is_live is None:
if is_upcoming or live_content is False:
is_live = False
if is_upcoming is None and (live_content or is_live):
is_upcoming = False
live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
if not duration and live_end_time and live_start_time: if not duration and live_end_time and live_start_time:
duration = live_end_time - live_start_time duration = live_end_time - live_start_time
if is_live and self.get_param('live_from_start'): needs_live_processing = self._needs_live_processing(live_status, duration)
self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data)
def is_bad_format(fmt):
if needs_live_processing and not fmt.get('is_from_start'):
return True
elif (live_status == 'is_live' and needs_live_processing != 'is_live'
and fmt.get('protocol') == 'http_dash_segments'):
return True
for fmt in filter(is_bad_format, formats):
fmt['preference'] = (fmt.get('preference') or -1) - 10
fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ')
if needs_live_processing:
self._prepare_live_from_start_formats(
formats, video_id, live_start_time, url, webpage_url, smuggled_data, live_status == 'is_live')
formats.extend(self._extract_storyboard(player_responses, duration)) formats.extend(self._extract_storyboard(player_responses, duration))
@ -3872,22 +3920,10 @@ def feed_entry(name):
'categories': [category] if category else None, 'categories': [category] if category else None,
'tags': keywords, 'tags': keywords,
'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
'is_live': is_live, 'live_status': live_status,
'was_live': (False if is_live or is_upcoming or live_content is False
else None if is_live is None or is_upcoming is None
else live_content),
'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
'release_timestamp': live_start_time, 'release_timestamp': live_start_time,
} }
if get_first(video_details, 'isPostLiveDvr'):
self.write_debug('Video is in Post-Live Manifestless mode')
info['live_status'] = 'post_live'
if (duration or 0) > 4 * 3600:
self.report_warning(
'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
'This is a known issue and patches are welcome')
subtitles = {} subtitles = {}
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
if pctr: if pctr:
@ -4017,7 +4053,8 @@ def process_language(container, base_url, lang_code, sub_name, query):
'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1', 'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1',
'video_id': video_id, 'video_id': video_id,
'ext': 'json', 'ext': 'json',
'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', 'protocol': ('youtube_live_chat' if live_status in ('is_live', 'is_upcoming')
else 'youtube_live_chat_replay'),
}] }]
if initial_data: if initial_data:
@ -4124,9 +4161,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
unified_strdate(get_first(microformats, 'uploadDate')) unified_strdate(get_first(microformats, 'uploadDate'))
or unified_strdate(search_meta('uploadDate'))) or unified_strdate(search_meta('uploadDate')))
if not upload_date or ( if not upload_date or (
not info.get('is_live') live_status in ('not_live', None)
and not info.get('was_live')
and info.get('live_status') != 'is_upcoming'
and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
): ):
upload_date = strftime_or_none( upload_date = strftime_or_none(