[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

This commit is contained in:
Yen Chi Hsuan 2016-08-19 23:53:47 +08:00
parent 55af45fcab
commit 520251c093
No known key found for this signature in database
GPG key ID: 3FDDD575826C5C30
2 changed files with 26 additions and 11 deletions

View file

@ -1,6 +1,7 @@
version <unreleased> version <unreleased>
Core Core
* Support m3u8 manifests in HTML5 multimedia tags
* Fix js_to_json(): correct octal or hexadecimal number detection * Fix js_to_json(): correct octal or hexadecimal number detection
Extractors Extractors

View file

@ -1695,7 +1695,7 @@ def add_segment_url():
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats return formats
def _parse_html5_media_entries(self, base_url, webpage): def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None):
def absolute_url(video_url): def absolute_url(video_url):
return compat_urlparse.urljoin(base_url, video_url) return compat_urlparse.urljoin(base_url, video_url)
@ -1710,6 +1710,21 @@ def parse_content_type(content_type):
return f return f
return {} return {}
def _media_formats(src, cur_media_type=None):
    """Build the format list for one media URL taken from an HTML5 tag.

    Closure helper: relies on ``absolute_url``, ``self``, ``video_id`` and
    ``m3u8_id`` from the enclosing method's scope.

    Args:
        src: URL from a ``src`` attribute; resolved with ``absolute_url``.
        cur_media_type: name of the enclosing media tag. ``'audio'`` marks
            a plain URL as having no video stream (``vcodec == 'none'``).
            Optional, because a call site passes only ``src``; with a
            required parameter that call raises ``TypeError``. When
            omitted, ``vcodec`` is left as ``None``.
            NOTE(review): callers that know the tag type should still pass
            it so audio-only sources are labelled correctly.

    Returns:
        A ``(is_plain_url, formats)`` tuple. ``is_plain_url`` is ``True``
        when ``formats`` is a one-element list describing ``src`` itself,
        and ``False`` when ``formats`` is whatever
        ``self._extract_m3u8_formats`` returned for an m3u8 manifest.
    """
    full_url = absolute_url(src)

    # An m3u8 URL is a manifest, not a media file: expand it into its
    # variant formats instead of exposing the manifest URL as one format.
    if determine_ext(full_url) == 'm3u8':
        return False, self._extract_m3u8_formats(
            full_url, video_id, ext='mp4', entry_protocol='m3u8_native',
            m3u8_id=m3u8_id)

    return True, [{
        'url': full_url,
        'vcodec': 'none' if cur_media_type == 'audio' else None,
    }]
entries = [] entries = []
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
media_info = { media_info = {
@ -1719,10 +1734,8 @@ def parse_content_type(content_type):
media_attributes = extract_attributes(media_tag) media_attributes = extract_attributes(media_tag)
src = media_attributes.get('src') src = media_attributes.get('src')
if src: if src:
media_info['formats'].append({ _, formats = _media_formats(src)
'url': absolute_url(src), media_info['formats'].extend(formats)
'vcodec': 'none' if media_type == 'audio' else None,
})
media_info['thumbnail'] = media_attributes.get('poster') media_info['thumbnail'] = media_attributes.get('poster')
if media_content: if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content): for source_tag in re.findall(r'<source[^>]+>', media_content):
@ -1730,12 +1743,13 @@ def parse_content_type(content_type):
src = source_attributes.get('src') src = source_attributes.get('src')
if not src: if not src:
continue continue
f = parse_content_type(source_attributes.get('type')) is_plain_url, formats = _media_formats(src, media_type)
f.update({ if is_plain_url:
'url': absolute_url(src), f = parse_content_type(source_attributes.get('type'))
'vcodec': 'none' if media_type == 'audio' else None, f.update(formats[0])
}) media_info['formats'].append(f)
media_info['formats'].append(f) else:
media_info['formats'].extend(formats)
for track_tag in re.findall(r'<track[^>]+>', media_content): for track_tag in re.findall(r'<track[^>]+>', media_content):
track_attributes = extract_attributes(track_tag) track_attributes = extract_attributes(track_tag)
kind = track_attributes.get('kind') kind = track_attributes.get('kind')