[youtube] Extract formats from multiple DASH manifests (Closes #6093)

The DASH manifest pointed to by dashmpd from the video webpage and the one pointed to by get_video_info
may differ (namely, in their itag sets): some itags are missing from the DASH manifest pointed to by the
webpage's dashmpd, and others are missing from the DASH manifest pointed to by get_video_info's dashmpd.
The general idea is to take the union of the itags of both DASH manifests (for an example of a video
with such 'manifest behavior', see https://github.com/rg3/youtube-dl/issues/6093).
This commit is contained in:
Sergey M․ 2015-06-27 00:36:23 +06:00
parent 2988835af5
commit d8d24a922a

View file

@ -853,6 +853,13 @@ def _real_extract(self, url):
else: else:
player_url = None player_url = None
dash_mpds = []
def add_dash_mpd(video_info):
dash_mpd = video_info.get('dashmpd')
if dash_mpd and dash_mpd[0] not in dash_mpds:
dash_mpds.append(dash_mpd[0])
# Get video info # Get video info
embed_webpage = None embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None: if re.search(r'player-age-gate-content">', video_webpage) is not None:
@ -873,24 +880,27 @@ def _real_extract(self, url):
note='Refetching age-gated info webpage', note='Refetching age-gated info webpage',
errnote='unable to download video info webpage') errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage) video_info = compat_parse_qs(video_info_webpage)
add_dash_mpd(video_info)
else: else:
age_gate = False age_gate = False
try:
# Try looking directly into the video webpage # Try looking directly into the video webpage
mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
if not mobj: if mobj:
raise ValueError('Could not find ytplayer.config') # caught below
json_code = uppercase_escape(mobj.group(1)) json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code) ytplayer_config = json.loads(json_code)
args = ytplayer_config['args'] args = ytplayer_config['args']
if args.get('url_encoded_fmt_stream_map'):
# Convert to the same format returned by compat_parse_qs # Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items()) video_info = dict((k, [v]) for k, v in args.items())
if not args.get('url_encoded_fmt_stream_map'): add_dash_mpd(video_info)
raise ValueError('No stream_map present') # caught below # We also try looking in get_video_info since it may contain different dashmpd
except ValueError: # URL that points to a DASH manifest with possibly different itag set (some itags
# We fallback to the get_video_info pages (used by the embed page) # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
# manifest pointed by get_video_info's dashmpd).
# The general idea is to take the union of the itags of both DASH manifests (for an example
# of a video with such 'manifest behavior', see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id) self.report_video_info_webpage_download(video_id)
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ( video_info_url = (
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (proto, video_id, el_type)) % (proto, video_id, el_type))
@ -898,8 +908,11 @@ def _real_extract(self, url):
video_info_url, video_info_url,
video_id, note=False, video_id, note=False,
errnote='unable to download video info webpage') errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage) get_video_info = compat_parse_qs(video_info_webpage)
if 'token' in video_info: add_dash_mpd(get_video_info)
if not video_info:
video_info = get_video_info
if 'token' in get_video_info:
break break
if 'token' not in video_info: if 'token' not in video_info:
if 'reason' in video_info: if 'reason' in video_info:
@ -1118,24 +1131,26 @@ def _map_to_format_list(urlmap):
# Look for the DASH manifest # Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True): if self._downloader.params.get('youtube_include_dash_manifest', True):
dash_mpd = video_info.get('dashmpd') for dash_manifest_url in dash_mpds:
if dash_mpd: dash_formats = {}
dash_manifest_url = dash_mpd[0]
try: try:
dash_formats = self._parse_dash_manifest( for df in self._parse_dash_manifest(
video_id, dash_manifest_url, player_url, age_gate) video_id, dash_manifest_url, player_url, age_gate):
# Do not overwrite DASH format found in some previous DASH manifest
if df['format_id'] not in dash_formats:
dash_formats[df['format_id']] = df
except (ExtractorError, KeyError) as e: except (ExtractorError, KeyError) as e:
self.report_warning( self.report_warning(
'Skipping DASH manifest: %r' % e, video_id) 'Skipping DASH manifest: %r' % e, video_id)
else: if dash_formats:
# Remove the formats we found through non-DASH, they # Remove the formats we found through non-DASH, they
# contain less info and it can be wrong, because we use # contain less info and it can be wrong, because we use
# fixed values (for example the resolution). See # fixed values (for example the resolution). See
# https://github.com/rg3/youtube-dl/issues/5774 for an # https://github.com/rg3/youtube-dl/issues/5774 for an
# example. # example.
dash_keys = set(df['format_id'] for df in dash_formats) dash_keys = set(df['format_id'] for df in dash_formats.values())
formats = [f for f in formats if f['format_id'] not in dash_keys] formats = [f for f in formats if f['format_id'] not in dash_keys]
formats.extend(dash_formats) formats.extend(dash_formats.values())
# Check for malformed aspect ratio # Check for malformed aspect ratio
stretched_m = re.search( stretched_m = re.search(