mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-21 20:46:36 -05:00
[extractor] Detect stpp
as subtitles in MPD
Closes #656 Solution by: fstirlitz
This commit is contained in:
parent
7be9ccff0b
commit
be2fc5b212
1 changed files with 204 additions and 196 deletions
|
@ -2596,215 +2596,223 @@ def extract_Initialization(source):
|
||||||
mime_type = representation_attrib['mimeType']
|
mime_type = representation_attrib['mimeType']
|
||||||
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
|
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
|
||||||
|
|
||||||
if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
|
codecs = representation_attrib.get('codecs', '')
|
||||||
base_url = ''
|
if content_type not in ('video', 'audio', 'text'):
|
||||||
for element in (representation, adaptation_set, period, mpd_doc):
|
if mime_type == 'image/jpeg':
|
||||||
base_url_e = element.find(_add_ns('BaseURL'))
|
content_type = 'image/jpeg'
|
||||||
if base_url_e is not None:
|
if codecs.split('.')[0] == 'stpp':
|
||||||
base_url = base_url_e.text + base_url
|
content_type = 'text'
|
||||||
if re.match(r'^https?://', base_url):
|
|
||||||
break
|
|
||||||
if mpd_base_url and not re.match(r'^https?://', base_url):
|
|
||||||
if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
|
|
||||||
mpd_base_url += '/'
|
|
||||||
base_url = mpd_base_url + base_url
|
|
||||||
representation_id = representation_attrib.get('id')
|
|
||||||
lang = representation_attrib.get('lang')
|
|
||||||
url_el = representation.find(_add_ns('BaseURL'))
|
|
||||||
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
|
|
||||||
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
|
|
||||||
if representation_id is not None:
|
|
||||||
format_id = representation_id
|
|
||||||
else:
|
else:
|
||||||
format_id = content_type
|
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
||||||
if mpd_id:
|
continue
|
||||||
format_id = mpd_id + '-' + format_id
|
|
||||||
if content_type in ('video', 'audio'):
|
|
||||||
f = {
|
|
||||||
'format_id': format_id,
|
|
||||||
'manifest_url': mpd_url,
|
|
||||||
'ext': mimetype2ext(mime_type),
|
|
||||||
'width': int_or_none(representation_attrib.get('width')),
|
|
||||||
'height': int_or_none(representation_attrib.get('height')),
|
|
||||||
'tbr': float_or_none(bandwidth, 1000),
|
|
||||||
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
|
||||||
'fps': int_or_none(representation_attrib.get('frameRate')),
|
|
||||||
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
|
||||||
'format_note': 'DASH %s' % content_type,
|
|
||||||
'filesize': filesize,
|
|
||||||
'container': mimetype2ext(mime_type) + '_dash',
|
|
||||||
}
|
|
||||||
f.update(parse_codecs(representation_attrib.get('codecs')))
|
|
||||||
elif content_type == 'text':
|
|
||||||
f = {
|
|
||||||
'ext': mimetype2ext(mime_type),
|
|
||||||
'manifest_url': mpd_url,
|
|
||||||
'filesize': filesize,
|
|
||||||
}
|
|
||||||
elif mime_type == 'image/jpeg':
|
|
||||||
# See test case in VikiIE
|
|
||||||
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
|
|
||||||
f = {
|
|
||||||
'format_id': format_id,
|
|
||||||
'ext': 'mhtml',
|
|
||||||
'manifest_url': mpd_url,
|
|
||||||
'format_note': 'DASH storyboards (jpeg)',
|
|
||||||
'acodec': 'none',
|
|
||||||
'vcodec': 'none',
|
|
||||||
}
|
|
||||||
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
|
|
||||||
|
|
||||||
def prepare_template(template_name, identifiers):
|
base_url = ''
|
||||||
tmpl = representation_ms_info[template_name]
|
for element in (representation, adaptation_set, period, mpd_doc):
|
||||||
# First of, % characters outside $...$ templates
|
base_url_e = element.find(_add_ns('BaseURL'))
|
||||||
# must be escaped by doubling for proper processing
|
if base_url_e is not None:
|
||||||
# by % operator string formatting used further (see
|
base_url = base_url_e.text + base_url
|
||||||
# https://github.com/ytdl-org/youtube-dl/issues/16867).
|
if re.match(r'^https?://', base_url):
|
||||||
t = ''
|
break
|
||||||
in_template = False
|
if mpd_base_url and not re.match(r'^https?://', base_url):
|
||||||
for c in tmpl:
|
if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
|
||||||
|
mpd_base_url += '/'
|
||||||
|
base_url = mpd_base_url + base_url
|
||||||
|
representation_id = representation_attrib.get('id')
|
||||||
|
lang = representation_attrib.get('lang')
|
||||||
|
url_el = representation.find(_add_ns('BaseURL'))
|
||||||
|
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
|
||||||
|
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
|
||||||
|
if representation_id is not None:
|
||||||
|
format_id = representation_id
|
||||||
|
else:
|
||||||
|
format_id = content_type
|
||||||
|
if mpd_id:
|
||||||
|
format_id = mpd_id + '-' + format_id
|
||||||
|
if content_type in ('video', 'audio'):
|
||||||
|
f = {
|
||||||
|
'format_id': format_id,
|
||||||
|
'manifest_url': mpd_url,
|
||||||
|
'ext': mimetype2ext(mime_type),
|
||||||
|
'width': int_or_none(representation_attrib.get('width')),
|
||||||
|
'height': int_or_none(representation_attrib.get('height')),
|
||||||
|
'tbr': float_or_none(bandwidth, 1000),
|
||||||
|
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
||||||
|
'fps': int_or_none(representation_attrib.get('frameRate')),
|
||||||
|
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
||||||
|
'format_note': 'DASH %s' % content_type,
|
||||||
|
'filesize': filesize,
|
||||||
|
'container': mimetype2ext(mime_type) + '_dash',
|
||||||
|
}
|
||||||
|
f.update(parse_codecs(codecs))
|
||||||
|
elif content_type == 'text':
|
||||||
|
f = {
|
||||||
|
'ext': mimetype2ext(mime_type),
|
||||||
|
'manifest_url': mpd_url,
|
||||||
|
'filesize': filesize,
|
||||||
|
}
|
||||||
|
elif content_type == 'image/jpeg':
|
||||||
|
# See test case in VikiIE
|
||||||
|
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
|
||||||
|
f = {
|
||||||
|
'format_id': format_id,
|
||||||
|
'ext': 'mhtml',
|
||||||
|
'manifest_url': mpd_url,
|
||||||
|
'format_note': 'DASH storyboards (jpeg)',
|
||||||
|
'acodec': 'none',
|
||||||
|
'vcodec': 'none',
|
||||||
|
}
|
||||||
|
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
|
||||||
|
|
||||||
|
def prepare_template(template_name, identifiers):
|
||||||
|
tmpl = representation_ms_info[template_name]
|
||||||
|
# First of, % characters outside $...$ templates
|
||||||
|
# must be escaped by doubling for proper processing
|
||||||
|
# by % operator string formatting used further (see
|
||||||
|
# https://github.com/ytdl-org/youtube-dl/issues/16867).
|
||||||
|
t = ''
|
||||||
|
in_template = False
|
||||||
|
for c in tmpl:
|
||||||
|
t += c
|
||||||
|
if c == '$':
|
||||||
|
in_template = not in_template
|
||||||
|
elif c == '%' and not in_template:
|
||||||
t += c
|
t += c
|
||||||
if c == '$':
|
# Next, $...$ templates are translated to their
|
||||||
in_template = not in_template
|
# %(...) counterparts to be used with % operator
|
||||||
elif c == '%' and not in_template:
|
if representation_id is not None:
|
||||||
t += c
|
t = t.replace('$RepresentationID$', representation_id)
|
||||||
# Next, $...$ templates are translated to their
|
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
|
||||||
# %(...) counterparts to be used with % operator
|
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
|
||||||
if representation_id is not None:
|
t.replace('$$', '$')
|
||||||
t = t.replace('$RepresentationID$', representation_id)
|
return t
|
||||||
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
|
|
||||||
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
|
|
||||||
t.replace('$$', '$')
|
|
||||||
return t
|
|
||||||
|
|
||||||
# @initialization is a regular template like @media one
|
# @initialization is a regular template like @media one
|
||||||
# so it should be handled just the same way (see
|
# so it should be handled just the same way (see
|
||||||
# https://github.com/ytdl-org/youtube-dl/issues/11605)
|
# https://github.com/ytdl-org/youtube-dl/issues/11605)
|
||||||
if 'initialization' in representation_ms_info:
|
if 'initialization' in representation_ms_info:
|
||||||
initialization_template = prepare_template(
|
initialization_template = prepare_template(
|
||||||
'initialization',
|
'initialization',
|
||||||
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
|
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
|
||||||
# $Time$ shall not be included for @initialization thus
|
# $Time$ shall not be included for @initialization thus
|
||||||
# only $Bandwidth$ remains
|
# only $Bandwidth$ remains
|
||||||
('Bandwidth', ))
|
('Bandwidth', ))
|
||||||
representation_ms_info['initialization_url'] = initialization_template % {
|
representation_ms_info['initialization_url'] = initialization_template % {
|
||||||
'Bandwidth': bandwidth,
|
'Bandwidth': bandwidth,
|
||||||
}
|
}
|
||||||
|
|
||||||
def location_key(location):
|
def location_key(location):
|
||||||
return 'url' if re.match(r'^https?://', location) else 'path'
|
return 'url' if re.match(r'^https?://', location) else 'path'
|
||||||
|
|
||||||
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
|
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
|
||||||
|
|
||||||
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
|
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
|
||||||
media_location_key = location_key(media_template)
|
media_location_key = location_key(media_template)
|
||||||
|
|
||||||
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
|
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
|
||||||
# can't be used at the same time
|
# can't be used at the same time
|
||||||
if '%(Number' in media_template and 's' not in representation_ms_info:
|
if '%(Number' in media_template and 's' not in representation_ms_info:
|
||||||
segment_duration = None
|
segment_duration = None
|
||||||
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
|
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
|
||||||
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
|
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
|
||||||
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
|
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
|
||||||
representation_ms_info['fragments'] = [{
|
representation_ms_info['fragments'] = [{
|
||||||
media_location_key: media_template % {
|
media_location_key: media_template % {
|
||||||
'Number': segment_number,
|
'Number': segment_number,
|
||||||
'Bandwidth': bandwidth,
|
'Bandwidth': bandwidth,
|
||||||
},
|
},
|
||||||
'duration': segment_duration,
|
'duration': segment_duration,
|
||||||
} for segment_number in range(
|
} for segment_number in range(
|
||||||
representation_ms_info['start_number'],
|
representation_ms_info['start_number'],
|
||||||
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
|
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
|
||||||
else:
|
else:
|
||||||
# $Number*$ or $Time$ in media template with S list available
|
# $Number*$ or $Time$ in media template with S list available
|
||||||
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
|
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
|
||||||
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
|
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
|
||||||
representation_ms_info['fragments'] = []
|
representation_ms_info['fragments'] = []
|
||||||
segment_time = 0
|
segment_time = 0
|
||||||
segment_d = None
|
segment_d = None
|
||||||
segment_number = representation_ms_info['start_number']
|
segment_number = representation_ms_info['start_number']
|
||||||
|
|
||||||
def add_segment_url():
|
def add_segment_url():
|
||||||
segment_url = media_template % {
|
segment_url = media_template % {
|
||||||
'Time': segment_time,
|
'Time': segment_time,
|
||||||
'Bandwidth': bandwidth,
|
'Bandwidth': bandwidth,
|
||||||
'Number': segment_number,
|
'Number': segment_number,
|
||||||
}
|
}
|
||||||
representation_ms_info['fragments'].append({
|
representation_ms_info['fragments'].append({
|
||||||
media_location_key: segment_url,
|
media_location_key: segment_url,
|
||||||
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
|
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
|
||||||
})
|
})
|
||||||
|
|
||||||
for num, s in enumerate(representation_ms_info['s']):
|
for num, s in enumerate(representation_ms_info['s']):
|
||||||
segment_time = s.get('t') or segment_time
|
segment_time = s.get('t') or segment_time
|
||||||
segment_d = s['d']
|
segment_d = s['d']
|
||||||
|
add_segment_url()
|
||||||
|
segment_number += 1
|
||||||
|
for r in range(s.get('r', 0)):
|
||||||
|
segment_time += segment_d
|
||||||
add_segment_url()
|
add_segment_url()
|
||||||
segment_number += 1
|
segment_number += 1
|
||||||
for r in range(s.get('r', 0)):
|
segment_time += segment_d
|
||||||
segment_time += segment_d
|
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
|
||||||
add_segment_url()
|
# No media template
|
||||||
segment_number += 1
|
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
|
||||||
segment_time += segment_d
|
# or any YouTube dashsegments video
|
||||||
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
|
fragments = []
|
||||||
# No media template
|
segment_index = 0
|
||||||
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
|
timescale = representation_ms_info['timescale']
|
||||||
# or any YouTube dashsegments video
|
for s in representation_ms_info['s']:
|
||||||
fragments = []
|
duration = float_or_none(s['d'], timescale)
|
||||||
segment_index = 0
|
for r in range(s.get('r', 0) + 1):
|
||||||
timescale = representation_ms_info['timescale']
|
segment_uri = representation_ms_info['segment_urls'][segment_index]
|
||||||
for s in representation_ms_info['s']:
|
fragments.append({
|
||||||
duration = float_or_none(s['d'], timescale)
|
location_key(segment_uri): segment_uri,
|
||||||
for r in range(s.get('r', 0) + 1):
|
'duration': duration,
|
||||||
segment_uri = representation_ms_info['segment_urls'][segment_index]
|
})
|
||||||
fragments.append({
|
segment_index += 1
|
||||||
location_key(segment_uri): segment_uri,
|
representation_ms_info['fragments'] = fragments
|
||||||
'duration': duration,
|
elif 'segment_urls' in representation_ms_info:
|
||||||
})
|
# Segment URLs with no SegmentTimeline
|
||||||
segment_index += 1
|
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
|
||||||
representation_ms_info['fragments'] = fragments
|
# https://github.com/ytdl-org/youtube-dl/pull/14844
|
||||||
elif 'segment_urls' in representation_ms_info:
|
fragments = []
|
||||||
# Segment URLs with no SegmentTimeline
|
segment_duration = float_or_none(
|
||||||
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
|
representation_ms_info['segment_duration'],
|
||||||
# https://github.com/ytdl-org/youtube-dl/pull/14844
|
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
|
||||||
fragments = []
|
for segment_url in representation_ms_info['segment_urls']:
|
||||||
segment_duration = float_or_none(
|
fragment = {
|
||||||
representation_ms_info['segment_duration'],
|
location_key(segment_url): segment_url,
|
||||||
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
|
}
|
||||||
for segment_url in representation_ms_info['segment_urls']:
|
if segment_duration:
|
||||||
fragment = {
|
fragment['duration'] = segment_duration
|
||||||
location_key(segment_url): segment_url,
|
fragments.append(fragment)
|
||||||
}
|
representation_ms_info['fragments'] = fragments
|
||||||
if segment_duration:
|
# If there is a fragments key available then we correctly recognized fragmented media.
|
||||||
fragment['duration'] = segment_duration
|
# Otherwise we will assume unfragmented media with direct access. Technically, such
|
||||||
fragments.append(fragment)
|
# assumption is not necessarily correct since we may simply have no support for
|
||||||
representation_ms_info['fragments'] = fragments
|
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
|
||||||
# If there is a fragments key available then we correctly recognized fragmented media.
|
if 'fragments' in representation_ms_info:
|
||||||
# Otherwise we will assume unfragmented media with direct access. Technically, such
|
f.update({
|
||||||
# assumption is not necessarily correct since we may simply have no support for
|
# NB: mpd_url may be empty when MPD manifest is parsed from a string
|
||||||
# some forms of fragmented media renditions yet, but for now we'll use this fallback.
|
'url': mpd_url or base_url,
|
||||||
if 'fragments' in representation_ms_info:
|
'fragment_base_url': base_url,
|
||||||
f.update({
|
'fragments': [],
|
||||||
# NB: mpd_url may be empty when MPD manifest is parsed from a string
|
'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
|
||||||
'url': mpd_url or base_url,
|
})
|
||||||
'fragment_base_url': base_url,
|
if 'initialization_url' in representation_ms_info:
|
||||||
'fragments': [],
|
initialization_url = representation_ms_info['initialization_url']
|
||||||
'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
|
if not f.get('url'):
|
||||||
})
|
f['url'] = initialization_url
|
||||||
if 'initialization_url' in representation_ms_info:
|
f['fragments'].append({location_key(initialization_url): initialization_url})
|
||||||
initialization_url = representation_ms_info['initialization_url']
|
f['fragments'].extend(representation_ms_info['fragments'])
|
||||||
if not f.get('url'):
|
|
||||||
f['url'] = initialization_url
|
|
||||||
f['fragments'].append({location_key(initialization_url): initialization_url})
|
|
||||||
f['fragments'].extend(representation_ms_info['fragments'])
|
|
||||||
else:
|
|
||||||
# Assuming direct URL to unfragmented media.
|
|
||||||
f['url'] = base_url
|
|
||||||
if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
|
|
||||||
formats.append(f)
|
|
||||||
elif content_type == 'text':
|
|
||||||
subtitles.setdefault(lang or 'und', []).append(f)
|
|
||||||
else:
|
else:
|
||||||
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
# Assuming direct URL to unfragmented media.
|
||||||
|
f['url'] = base_url
|
||||||
|
if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
|
||||||
|
formats.append(f)
|
||||||
|
elif content_type == 'text':
|
||||||
|
subtitles.setdefault(lang or 'und', []).append(f)
|
||||||
|
|
||||||
return formats, subtitles
|
return formats, subtitles
|
||||||
|
|
||||||
def _extract_ism_formats(self, *args, **kwargs):
|
def _extract_ism_formats(self, *args, **kwargs):
|
||||||
|
|
Loading…
Reference in a new issue