[extractor/generic] Pass through referer from json-ld

Closes #4941
This commit is contained in:
pukkandan 2022-09-16 23:05:49 +05:30
parent 8817a80d3a
commit 3166e6840c
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39

View file

@ -2621,7 +2621,7 @@ def _real_extract(self, url):
default_search += ':' default_search += ':'
return self.url_result(default_search + url) return self.url_result(default_search + url)
url, smuggled_data = unsmuggle_url(url) url, smuggled_data = unsmuggle_url(url, {})
force_videoid = None force_videoid = None
is_intentional = smuggled_data and smuggled_data.get('to_generic') is_intentional = smuggled_data and smuggled_data.get('to_generic')
if smuggled_data and 'force_videoid' in smuggled_data: if smuggled_data and 'force_videoid' in smuggled_data:
@ -2638,7 +2638,10 @@ def _real_extract(self, url):
# to accept raw bytes and being able to download only a chunk. # to accept raw bytes and being able to download only a chunk.
# It may probably be better to solve this by checking Content-Type for application/octet-stream # It may probably be better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this. # after a HEAD request, but not sure if we can rely on this.
full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'}) full_response = self._request_webpage(url, video_id, headers={
'Accept-Encoding': '*',
**smuggled_data.get('http_headers', {})
})
new_url = full_response.geturl() new_url = full_response.geturl()
if url != new_url: if url != new_url:
self.report_following_redirect(new_url) self.report_following_redirect(new_url)
@ -2657,14 +2660,15 @@ def _real_extract(self, url):
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m: if m:
self.report_detected('direct video link') self.report_detected('direct video link')
headers = smuggled_data.get('http_headers', {})
format_id = str(m.group('format_id')) format_id = str(m.group('format_id'))
subtitles = {} subtitles = {}
if format_id.endswith('mpegurl'): if format_id.endswith('mpegurl'):
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
elif format_id == 'f4m': elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id) formats = self._extract_f4m_formats(url, video_id, headers=headers)
else: else:
formats = [{ formats = [{
'format_id': format_id, 'format_id': format_id,
@ -2673,8 +2677,11 @@ def _real_extract(self, url):
}] }]
info_dict['direct'] = True info_dict['direct'] = True
self._sort_formats(formats) self._sort_formats(formats)
info_dict['formats'] = formats info_dict.update({
info_dict['subtitles'] = subtitles 'formats': formats,
'subtitles': subtitles,
'http_headers': headers,
})
return info_dict return info_dict
if not self.get_param('test', False) and not is_intentional: if not self.get_param('test', False) and not is_intentional:
@ -2919,7 +2926,11 @@ def _real_extract(self, url):
self.report_detected('JSON LD') self.report_detected('JSON LD')
return merge_dicts({ return merge_dicts({
'_type': 'url_transparent', '_type': 'url_transparent',
'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}), 'url': smuggle_url(json_ld['url'], {
'force_videoid': video_id,
'to_generic': True,
'http_headers': {'Referer': url},
}),
}, json_ld, info_dict) }, json_ld, info_dict)
def check_video(vurl): def check_video(vurl):