From 8072ef2bbd1721e4c79156b422e4fccc1e062853 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 20 Jun 2022 03:03:19 +0530 Subject: [PATCH] [extractor/BiliIntl] Fix metadata extraction Closes #4116 --- yt_dlp/extractor/bilibili.py | 13 ++++++------- yt_dlp/extractor/common.py | 18 +++++++----------- yt_dlp/extractor/fourzerostudio.py | 9 +++------ yt_dlp/utils.py | 4 ++++ 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index ead0dd88b..2912e0cad 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -947,12 +947,11 @@ def _real_extract(self, url): video_id = ep_id or aid webpage = self._download_webpage(url, video_id) # Bstation layout - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), video_id, fatal=False) or {} - video_data = ( - traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) - or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) + initial_data = ( + self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={}) + or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None)) + video_data = traverse_obj( + initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) if season_id and not video_data: # Non-Bstation layout, read through episode list @@ -960,7 +959,7 @@ def _real_extract(self, url): video_data = traverse_obj(season_json, ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), expected_type=dict, get_all=False) - return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) + return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 093a9b5cd..3e3e55798 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1588,15 +1588,13 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal webpage, 'next.js data', fatal=fatal, **kw), video_id, transform_source=transform_source, fatal=fatal) - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_full_data=False): - ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' - # not all website do this, but it can be changed - # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) + FUNCTION_RE = r'\(function\((?P.*?)\){return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' js, arg_keys, arg_vals = self._search_regex( - (r'' % rectx, - r'%s\(.*?\(function\((?P.*?)\)\{return\s(?P\{.*?\})\}\((?P.*?)\)' % rectx), - webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + (rf'', rf'{rectx}\(.*?{FUNCTION_RE}'), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal) args = dict(zip(arg_keys.split(','), arg_vals.split(','))) @@ -1604,10 +1602,8 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_f if val in ('undefined', 'void 0'): args[key] = 'null' - ret = self._parse_json(js_to_json(js, args), video_id) - if return_full_data: - return ret - return ret['data'][0] + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} @staticmethod def _hidden_inputs(html): diff --git a/yt_dlp/extractor/fourzerostudio.py b/yt_dlp/extractor/fourzerostudio.py index 3fa159987..e1804e39e 100644 --- a/yt_dlp/extractor/fourzerostudio.py +++ b/yt_dlp/extractor/fourzerostudio.py @@ -1,8 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - traverse_obj, - unified_timestamp, -) +from ..utils import traverse_obj, unified_timestamp class FourZeroStudioArchiveIE(InfoExtractor): @@ -25,7 +22,7 @@ class FourZeroStudioArchiveIE(InfoExtractor): def _real_extract(self, url): video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) uploader_internal_id = traverse_obj(nuxt_data, ( @@ -82,7 +79,7 @@ class FourZeroStudioClipIE(InfoExtractor): def _real_extract(self, url): video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 72223d771..7614839fb 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3216,7 +3216,11 @@ def fix_kv(m): return '"%s"' % v + def create_map(mobj): + return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|