Mirror of https://github.com/yt-dlp/yt-dlp.git
[youtube] Parse API parameters from initial webpage (#230)

* Obtain innertube_context, api_key and x-goog-visitor-id from the webpage
* Generalize the header & innertube_context extraction across YouTube extractors

Related: 1b0a13f33c
Authored by: colethedj

parent 1988fab7e3
commit f4f751af40

1 changed file (yt_dlp/extractor/youtube.py) with 80 additions and 50 deletions
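In short, instead of relying only on the hard-coded Innertube defaults, the extractors now read INNERTUBE_API_KEY, INNERTUBE_CONTEXT and VISITOR_DATA from the ytcfg.set({...}) blob embedded in the webpage, and fall back to the constants when that fails. A minimal standalone sketch of the idea, using synthetic sample markup and simplified error handling (this is not the patched extractor code itself):

import json
import re

# Synthetic stand-in for a downloaded watch page; real pages embed a much
# larger ytcfg.set({...}) call carrying the same keys.
WEBPAGE = (
    '<script>ytcfg.set({"INNERTUBE_API_KEY": "example-key", '
    '"INNERTUBE_CONTEXT": {"client": {"clientName": "WEB", "clientVersion": "2.20210407.08.00"}}, '
    '"VISITOR_DATA": "example-visitor-data"});</script>'
)


def extract_ytcfg(webpage):
    # Same regex the patch uses for the ytcfg blob
    mobj = re.search(r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage)
    return json.loads(mobj.group(1)) if mobj else {}


ytcfg = extract_ytcfg(WEBPAGE)
api_key = ytcfg.get('INNERTUBE_API_KEY') or 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'  # fallback from the patch
context = ytcfg.get('INNERTUBE_CONTEXT') or {'client': {'clientName': 'WEB', 'clientVersion': '2.20210407.08.00'}}
visitor_data = ytcfg.get('VISITOR_DATA')

print(api_key, visitor_data)
print(json.dumps(context, indent=2))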
@@ -284,21 +284,8 @@ def _real_initialize(self):
         if not self._login():
             return
 
-    _YT_WEB_CLIENT_VERSION = '2.20210301.08.00'
-    _DEFAULT_API_DATA = {
-        'context': {
-            'client': {
-                'clientName': 'WEB',
-                'clientVersion': _YT_WEB_CLIENT_VERSION,
-            }
-        },
-    }
-
-    _DEFAULT_BASIC_API_HEADERS = {
-        'X-YouTube-Client-Name': '1',
-        'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION
-    }
-
+    _YT_WEB_CLIENT_VERSION = '2.20210407.08.00'
+    _YT_INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
     _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
@@ -312,19 +299,23 @@ def _generate_sapisidhash_header(self):
         return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
 
     def _call_api(self, ep, query, video_id, fatal=True, headers=None,
-                  note='Downloading API JSON', errnote='Unable to download API page'):
-        data = self._DEFAULT_API_DATA.copy()
+                  note='Downloading API JSON', errnote='Unable to download API page',
+                  context=None, api_key=None):
+
+        data = {'context': context} if context else {'context': self._extract_context()}
         data.update(query)
-        headers = headers or {}
-        headers.update({'content-type': 'application/json'})
-        auth = self._generate_sapisidhash_header()
-        if auth is not None:
-            headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
+        real_headers = self._generate_api_headers()
+        real_headers.update({'content-type': 'application/json'})
+        if headers:
+            real_headers.update(headers)
         return self._download_json(
             'https://www.youtube.com/youtubei/v1/%s' % ep,
             video_id=video_id, fatal=fatal, note=note, errnote=errnote,
-            data=json.dumps(data).encode('utf8'), headers=headers,
-            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
+            data=json.dumps(data).encode('utf8'), headers=real_headers,
+            query={'key': api_key or self._extract_api_key()})
+
+    def _extract_api_key(self, ytcfg=None):
+        return try_get(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str) or self._YT_INNERTUBE_API_KEY
 
     def _extract_yt_initial_data(self, video_id, webpage):
         return self._parse_json(
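For context, the request that _call_api builds is a plain JSON POST to the Innertube endpoint, with the (possibly webpage-derived) key as a query parameter and the client context in the body. A rough standalone equivalent, without the retries, cookies and SAPISIDHASH authorization the extractor adds (needs network access, and the endpoint's behaviour is not guaranteed to stay stable):

import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen


def call_innertube(ep, payload, api_key, context):
    # Mirrors the shape of _call_api: POST JSON to /youtubei/v1/<ep>?key=...
    data = {'context': context}
    data.update(payload)
    req = Request(
        'https://www.youtube.com/youtubei/v1/%s?%s' % (ep, urlencode({'key': api_key})),
        data=json.dumps(data).encode('utf8'),
        headers={'content-type': 'application/json'})
    return json.loads(urlopen(req).read().decode('utf8'))


# Example (uses the api_key/context parsed from the webpage, as in the sketch above):
# player = call_innertube('player', {'videoId': 'dQw4w9WgXcQ'}, api_key, context)
# print(player.get('videoDetails', {}).get('title'))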
@@ -358,7 +349,47 @@ def _extract_ytcfg(self, video_id, webpage):
         return self._parse_json(
             self._search_regex(
                 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
-                default='{}'), video_id, fatal=False)
+                default='{}'), video_id, fatal=False) or {}
+
+    def __extract_client_version(self, ytcfg):
+        return try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or self._YT_WEB_CLIENT_VERSION
+
+    def _extract_context(self, ytcfg=None):
+        context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict)
+        if context:
+            return context
+
+        # Recreate the client context (required)
+        client_version = self.__extract_client_version(ytcfg)
+        client_name = try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str) or 'WEB'
+        context = {
+            'client': {
+                'clientName': client_name,
+                'clientVersion': client_version,
+            }
+        }
+        visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
+        if visitor_data:
+            context['client']['visitorData'] = visitor_data
+        return context
+
+    def _generate_api_headers(self, ytcfg=None, identity_token=None, account_syncid=None, visitor_data=None):
+        headers = {
+            'X-YouTube-Client-Name': '1',
+            'X-YouTube-Client-Version': self.__extract_client_version(ytcfg),
+        }
+        if identity_token:
+            headers['x-youtube-identity-token'] = identity_token
+        if account_syncid:
+            headers['X-Goog-PageId'] = account_syncid
+            headers['X-Goog-AuthUser'] = 0
+        if visitor_data:
+            headers['x-goog-visitor-id'] = visitor_data
+        auth = self._generate_sapisidhash_header()
+        if auth is not None:
+            headers['Authorization'] = auth
+            headers['X-Origin'] = 'https://www.youtube.com'
+        return headers
 
     def _extract_video(self, renderer):
         video_id = renderer.get('videoId')
@@ -1576,7 +1607,7 @@ def _extract_comment(self, comment_renderer, parent=None):
         }
 
     def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
-                         session_token_list, parent=None, comment_counts=None):
+                         ytcfg, session_token_list, parent=None, comment_counts=None):
 
         def extract_thread(parent_renderer):
             contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@@ -1602,7 +1633,7 @@ def extract_thread(parent_renderer):
                 if comment_replies_renderer:
                     comment_counts[2] += 1
                     comment_entries_iter = self._comment_entries(
-                        comment_replies_renderer, identity_token, account_syncid,
+                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
                         parent=comment.get('id'), session_token_list=session_token_list,
                         comment_counts=comment_counts)
 
@@ -1612,16 +1643,10 @@ def extract_thread(parent_renderer):
         if not comment_counts:
             # comment so far, est. total comments, current comment thread #
             comment_counts = [0, 0, 0]
-        headers = self._DEFAULT_BASIC_API_HEADERS.copy()
 
         # TODO: Generalize the download code with TabIE
-        if identity_token:
-            headers['x-youtube-identity-token'] = identity_token
-
-        if account_syncid:
-            headers['X-Goog-PageId'] = account_syncid
-            headers['X-Goog-AuthUser'] = 0
-
+        context = self._extract_context(ytcfg)
+        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
         continuation = YoutubeTabIE._extract_continuation(root_continuation_data) # TODO
         first_continuation = False
         if parent is None:
@@ -1630,6 +1655,7 @@ def extract_thread(parent_renderer):
         for page_num in itertools.count(0):
             if not continuation:
                 break
+            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
             retries = self._downloader.params.get('extractor_retries', 3)
             count = -1
             last_error = None
@@ -1711,6 +1737,10 @@ def extract_thread(parent_renderer):
 
             if not response:
                 break
+            visitor_data = try_get(
+                response,
+                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
+                compat_str) or visitor_data
 
             known_continuation_renderers = {
                 'itemSectionContinuation': extract_thread,
@@ -1777,6 +1807,7 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
                     renderer,
                     identity_token=self._extract_identity_token(webpage, item_id=video_id),
                     account_syncid=self._extract_account_syncid(ytcfg),
+                    ytcfg=ytcfg,
                     session_token_list=[xsrf_token])
 
                 for comment in comment_iter:
@@ -1804,9 +1835,11 @@ def _real_extract(self, url):
         player_response = self._extract_yt_initial_variable(
             webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
             video_id, 'initial player response')
+
+        ytcfg = self._extract_ytcfg(video_id, webpage)
         if not player_response:
             player_response = self._call_api(
-                'player', {'videoId': video_id}, video_id)
+                'player', {'videoId': video_id}, video_id, api_key=self._extract_api_key(ytcfg))
 
         playability_status = player_response.get('playabilityStatus') or {}
         if playability_status.get('reason') == 'Sign in to confirm your age':
@@ -2190,7 +2223,7 @@ def process_language(container, base_url, lang_code, query):
             'yt initial data')
         if not initial_data:
             initial_data = self._call_api(
-                'next', {'videoId': video_id}, video_id, fatal=False)
+                'next', {'videoId': video_id}, video_id, fatal=False, api_key=self._extract_api_key(ytcfg))
 
         if not is_live:
             try:
@@ -2942,7 +2975,7 @@ def _extract_continuation(cls, renderer):
         ctp = continuation_ep.get('clickTrackingParams')
         return YoutubeTabIE._build_continuation_query(continuation, ctp)
 
-    def _entries(self, tab, item_id, identity_token, account_syncid):
+    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
 
         def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
             contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@@ -2994,21 +3027,13 @@ def extract_entries(parent_renderer): # this needs to called again for continua
         for entry in extract_entries(parent_renderer):
             yield entry
         continuation = continuation_list[0]
-
-        headers = {
-            'x-youtube-client-name': '1',
-            'x-youtube-client-version': '2.20201112.04.01',
-        }
-        if identity_token:
-            headers['x-youtube-identity-token'] = identity_token
-
-        if account_syncid:
-            headers['X-Goog-PageId'] = account_syncid
-            headers['X-Goog-AuthUser'] = 0
+        context = self._extract_context(ytcfg)
+        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
 
         for page_num in itertools.count(1):
             if not continuation:
                 break
+            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
             retries = self._downloader.params.get('extractor_retries', 3)
             count = -1
             last_error = None
@@ -3024,6 +3049,8 @@ def extract_entries(parent_renderer): # this needs to called again for continua
                             'continuation': continuation['continuation'],
                             'clickTracking': {'clickTrackingParams': continuation['itct']},
                         },
+                        context=context,
+                        api_key=self._extract_api_key(ytcfg),
                         note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
                 except ExtractorError as e:
                     if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
@@ -3049,6 +3076,8 @@ def extract_entries(parent_renderer): # this needs to called again for continua
 
             if not response:
                 break
+            visitor_data = try_get(
+                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data
 
             known_continuation_renderers = {
                 'playlistVideoListContinuation': self._playlist_entries,
@@ -3196,7 +3225,8 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs):
             self._entries(
                 selected_tab, playlist_id,
                 self._extract_identity_token(webpage, item_id),
-                self._extract_account_syncid(data)),
+                self._extract_account_syncid(data),
+                self._extract_ytcfg(item_id, webpage)),
             **metadata)
 
     def _extract_mix_playlist(self, playlist, playlist_id):