From 63b569bc5e7d461753637a20ad84a575adee4c0a Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 23 May 2024 14:15:56 -0400 Subject: [PATCH] [ie/taptap] Add extractors (#9776) Closes #9643 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 6 + yt_dlp/extractor/taptap.py | 275 ++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 yt_dlp/extractor/taptap.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9dfa28c4b..dcdd24ce5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1905,6 +1905,12 @@ from .syfy import SyfyIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE +from .taptap import ( + TapTapMomentIE, + TapTapAppIE, + TapTapAppIntlIE, + TapTapPostIntlIE, +) from .tass import TassIE from .tbs import TBSIE from .tbsjp import ( diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py new file mode 100644 index 000000000..56f2f0ef4 --- /dev/null +++ b/yt_dlp/extractor/taptap.py @@ -0,0 +1,275 @@ +import re +import uuid + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + join_nonempty, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class TapTapBaseIE(InfoExtractor): + _X_UA = 'V=1&PN=WebApp&LANG=zh_CN&VN_CODE=102&LOC=CN&PLT=PC&DS=Android&UID={uuid}&OS=Windows&OSV=10&DT=PC' + _VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get' + _INFO_API = None + _INFO_QUERY_KEY = 'id' + _DATA_PATH = None + _ID_PATH = None + _META_PATH = None + + def _get_api(self, url, video_id, query, **kwargs): + query = {**query, 'X-UA': self._X_UA.format(uuid=uuid.uuid4())} + return self._download_json(url, video_id, query=query, **kwargs)['data'] + + def _extract_video(self, video_id): + video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['list'][0] + + # h265 playlist contains both h265 and h264 formats + video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}, any)) + formats = self._extract_m3u8_formats(video_url, video_id, fatal=False) + for format in formats: + if re.search(r'^(hev|hvc|hvt)\d', format.get('vcodec', '')): + format['format_id'] = join_nonempty(format.get('format_id'), 'h265', delim='_') + + return { + 'id': str(video_id), + 'formats': formats, + **traverse_obj(video_data, ({ + 'duration': ('info', 'duration', {int_or_none}), + 'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}), + }), get_all=False) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + query = {self._INFO_QUERY_KEY: video_id} + + data = traverse_obj( + self._get_api(self._INFO_API, video_id, query=query), self._DATA_PATH) + + metainfo = traverse_obj(data, self._META_PATH) + entries = [{ + **metainfo, + **self._extract_video(id) + } for id in set(traverse_obj(data, self._ID_PATH))] + + return self.playlist_result(entries, **metainfo, id=video_id) + + +class TapTapMomentIE(TapTapBaseIE): + _VALID_URL = r'https?://www\.taptap\.cn/moment/(?P\d+)' + _INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail' + _ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id') + _META_PATH = ('moment', { + 'timestamp': ('created_time', {int_or_none}), + 'modified_timestamp': ('edited_time', {int_or_none}), + 'uploader': ('author', 'user', 'name', {str}), + 'uploader_id': ('author', 'user', 'id', {int}, {str_or_none}), + 'title': ('topic', 'title', {str}), + 'description': ('topic', 'summary', {str}), + }) + _TESTS = [{ + 'url': 'https://www.taptap.cn/moment/194618230982052443', + 'info_dict': { + 'id': '194618230982052443', + 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', + 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', + 'timestamp': 1633453402, + 'upload_date': '20211005', + 'modified_timestamp': 1633453402, + 'modified_date': '20211005', + 'uploader': '乌酱', + 'uploader_id': '532896', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2202584', + 'ext': 'mp4', + 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', + 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', + 'duration': 66, + 'timestamp': 1633453402, + 'upload_date': '20211005', + 'modified_timestamp': 1633453402, + 'modified_date': '20211005', + 'uploader': '乌酱', + 'uploader_id': '532896', + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.taptap.cn/moment/521630629209573493', + 'info_dict': { + 'id': '521630629209573493', + 'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」', + 'description': 'md5:2c81245da864428c904d53ae4ad2182b', + 'timestamp': 1711425600, + 'upload_date': '20240326', + 'modified_timestamp': 1711425600, + 'modified_date': '20240326', + 'uploader': '崩坏:星穹铁道', + 'uploader_id': '414732580', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4006511', + 'ext': 'mp4', + 'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」', + 'description': 'md5:2c81245da864428c904d53ae4ad2182b', + 'duration': 173, + 'timestamp': 1711425600, + 'upload_date': '20240326', + 'modified_timestamp': 1711425600, + 'modified_date': '20240326', + 'uploader': '崩坏:星穹铁道', + 'uploader_id': '414732580', + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.taptap.cn/moment/540493587511511299', + 'playlist_count': 2, + 'info_dict': { + 'id': '540493587511511299', + 'title': '中式民俗解谜《纸嫁衣7》、新系列《纸不语》公布!', + 'description': 'md5:d60842350e686ddb242291ddfb8e39c9', + 'timestamp': 1715920200, + 'upload_date': '20240517', + 'modified_timestamp': 1715942225, + 'modified_date': '20240517', + 'uploader': 'TapTap 编辑', + 'uploader_id': '7159244', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + +class TapTapAppIE(TapTapBaseIE): + _VALID_URL = r'https?://www\.taptap\.cn/app/(?P\d+)' + _INFO_API = 'https://www.taptap.cn/webapiv2/app/v4/detail' + _ID_PATH = (('app_videos', 'videos'), ..., 'video_id') + _META_PATH = { + 'title': ('title', {str}), + 'description': ('description', 'text', {str}, {clean_html}), + } + _TESTS = [{ + 'url': 'https://www.taptap.cn/app/168332', + 'info_dict': { + 'id': '168332', + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', + }, + 'playlist_count': 2, + 'playlist': [{ + 'info_dict': { + 'id': '4058443', + 'ext': 'mp4', + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', + 'duration': 26, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }, { + 'info_dict': { + 'id': '4058462', + 'ext': 'mp4', + 'title': '原神', + 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', + 'duration': 295, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }] + + +class TapTapIntlBase(TapTapBaseIE): + _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' + _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' + + +class TapTapAppIntlIE(TapTapIntlBase): + _VALID_URL = r'https?://www\.taptap\.io/app/(?P\d+)' + _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' + _DATA_PATH = 'app' + _ID_PATH = (('app_videos', 'videos'), ..., 'video_id') + _META_PATH = { + 'title': ('title', {str}), + 'description': ('description', 'text', {str}, {clean_html}), + } + _TESTS = [{ + 'url': 'https://www.taptap.io/app/233287', + 'info_dict': { + 'id': '233287', + 'title': '《虹彩六號 M》', + 'description': 'md5:418285f9c15347fc3cf3e3a3c649f182', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2149708997', + 'ext': 'mp4', + 'title': '《虹彩六號 M》', + 'description': 'md5:418285f9c15347fc3cf3e3a3c649f182', + 'duration': 78, + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }] + + +class TapTapPostIntlIE(TapTapIntlBase): + _VALID_URL = r'https?://www\.taptap\.io/post/(?P\d+)' + _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' + _INFO_QUERY_KEY = 'id_str' + _DATA_PATH = 'post' + _ID_PATH = ((('videos', ...), 'pin_video'), 'video_id') + _META_PATH = { + 'timestamp': ('published_time', {int_or_none}), + 'modified_timestamp': ('edited_time', {int_or_none}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {int}, {str_or_none}), + 'title': ('title', {str}), + 'description': ('list_fields', 'summary', {str}), + } + _TESTS = [{ + 'url': 'https://www.taptap.io/post/571785', + 'info_dict': { + 'id': '571785', + 'title': 'Arknights x Rainbow Six Siege | Event PV', + 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7', + 'timestamp': 1614664951, + 'upload_date': '20210302', + 'modified_timestamp': 1614664951, + 'modified_date': '20210302', + 'uploader': 'TapTap Editor', + 'uploader_id': '80224473', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '2149491903', + 'ext': 'mp4', + 'title': 'Arknights x Rainbow Six Siege | Event PV', + 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7', + 'duration': 122, + 'timestamp': 1614664951, + 'upload_date': '20210302', + 'modified_timestamp': 1614664951, + 'modified_date': '20210302', + 'uploader': 'TapTap Editor', + 'uploader_id': '80224473', + 'thumbnail': r're:^https?://.*\.(png|jpg)', + } + }], + 'params': {'skip_download': 'm3u8'}, + }]