[ie/douyutv] Fix extractors (#7652)

Closes #2494, Closes #7295
Authored by: c-basalt
c-basalt 2023-09-21 13:34:35 -04:00 committed by GitHub
parent b3febedbeb
commit 21f40e75df
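For quick verification, both extractors touched in the diff below can be exercised through yt-dlp's Python embedding API. The following is a minimal sketch (not part of the commit) using the URLs from the updated test cases; note that the live extractor's signing step additionally requires a PhantomJS binary to be available:

    import yt_dlp

    # URLs taken from the test cases updated in this commit
    URLS = [
        'https://www.douyu.com/pigff',                # DouyuTVIE (live room)
        'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',  # DouyuShowIE (VOD)
    ]

    with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
        for url in URLS:
            # download=False runs only the extractor and returns the info dict
            info = ydl.extract_info(url, download=False)
            print(info['id'], info.get('title'))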

yt_dlp/extractor/douyutv.py

@@ -1,31 +1,72 @@
 import time
 import hashlib
-import re
 import urllib
+import uuid
 
 from .common import InfoExtractor
+from .openload import PhantomJSwrapper
 from ..utils import (
     ExtractorError,
+    UserNotLive,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_resolution,
+    str_or_none,
+    traverse_obj,
     unescapeHTML,
-    unified_strdate,
+    url_or_none,
+    urlencode_postdata,
     urljoin,
 )
 
 
-class DouyuTVIE(InfoExtractor):
-    IE_DESC = '斗鱼'
+class DouyuBaseIE(InfoExtractor):
+    def _download_cryptojs_md5(self, video_id):
+        for url in [
+            'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
+            'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
+        ]:
+            js_code = self._download_webpage(
+                url, video_id, note='Downloading signing dependency', fatal=False)
+            if js_code:
+                self.cache.store('douyu', 'crypto-js-md5', js_code)
+                return js_code
+        raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
+
+    def _get_cryptojs_md5(self, video_id):
+        return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)
+
+    def _calc_sign(self, sign_func, video_id, a):
+        b = uuid.uuid4().hex
+        c = round(time.time())
+        js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
+        phantom = PhantomJSwrapper(self)
+        result = phantom.execute(js_script, video_id,
+                                 note='Executing JS signing script').strip()
+        return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
+
+    def _search_js_sign_func(self, webpage, fatal=True):
+        # The greedy look-behind ensures last possible script tag is matched
+        return self._search_regex(
+            r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal)
+
+
+class DouyuTVIE(DouyuBaseIE):
+    IE_DESC = '斗鱼直播'
     _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
     _TESTS = [{
-        'url': 'http://www.douyutv.com/iseven',
+        'url': 'https://www.douyu.com/pigff',
         'info_dict': {
-            'id': '17732',
-            'display_id': 'iseven',
-            'ext': 'flv',
-            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': r're:.*m7show@163\.com.*',
-            'thumbnail': r're:^https?://.*\.png',
-            'uploader': '7师傅',
+            'id': '24422',
+            'display_id': 'pigff',
+            'ext': 'mp4',
+            'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
+            'thumbnail': str,
+            'uploader': 'pigff',
             'is_live': True,
+            'live_status': 'is_live',
         },
         'params': {
             'skip_download': True,
@@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _get_sign_func(self, room_id, video_id):
+        return self._download_json(
+            f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
+            note='Getting signing script')['data'][f'room{room_id}']
+
+    def _extract_stream_formats(self, stream_formats):
+        formats = []
+        for stream_info in traverse_obj(stream_formats, (..., 'data')):
+            stream_url = urljoin(
+                traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
+            if stream_url:
+                rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
+                rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
+                ext = determine_ext(stream_url)
+                formats.append({
+                    'url': stream_url,
+                    'format_id': str_or_none(rate_id),
+                    'ext': 'mp4' if ext == 'm3u8' else ext,
+                    'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
+                    'quality': rate_id % -10000 if rate_id is not None else None,
+                    **traverse_obj(rate_info, {
+                        'format': ('name', {str_or_none}),
+                        'tbr': ('bit', {int_or_none}),
+                    }),
+                })
+        return formats
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        if video_id.isdigit():
-            room_id = video_id
-        else:
-            page = self._download_webpage(url, video_id)
-            room_id = self._html_search_regex(
-                r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
+        webpage = self._download_webpage(url, video_id)
+        room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')
+
+        if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
+            raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
+        if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
+            raise UserNotLive(video_id=video_id)
 
         # Grab metadata from API
         params = {
@@ -102,110 +171,136 @@ def _real_extract(self, url):
             'time': int(time.time()),
         }
         params['auth'] = hashlib.md5(
-            f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
-        room = self._download_json(
+            f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
+        room = traverse_obj(self._download_json(
             f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
-            note='Downloading room info', query=params)['data']
+            note='Downloading room info', query=params, fatal=False), 'data')
 
         # 1 = live, 2 = offline
-        if room.get('show_status') == '2':
-            raise ExtractorError('Live stream is offline', expected=True)
+        if traverse_obj(room, 'show_status') == '2':
+            raise UserNotLive(video_id=video_id)
 
-        video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
-        formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
-
-        title = unescapeHTML(room['room_name'])
-        description = room.get('show_details')
-        thumbnail = room.get('room_src')
-        uploader = room.get('nickname')
+        js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
+        form_data = {
+            'rate': 0,
+            **self._calc_sign(js_sign_func, video_id, room_id),
+        }
+        stream_formats = [self._download_json(
+            f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
+            video_id, note="Downloading livestream format",
+            data=urlencode_postdata(form_data))]
+
+        for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
+            if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
+                form_data['rate'] = rate_id
+                stream_formats.append(self._download_json(
+                    f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
+                    video_id, note=f'Downloading livestream format {rate_id}',
+                    data=urlencode_postdata(form_data)))
 
         return {
             'id': room_id,
-            'display_id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
+            'formats': self._extract_stream_formats(stream_formats),
             'is_live': True,
-            'subtitles': subs,
-            'formats': formats,
+            **traverse_obj(room, {
+                'display_id': ('url', {str}, {lambda i: i[1:]}),
+                'title': ('room_name', {unescapeHTML}),
+                'description': ('show_details', {str}),
+                'uploader': ('nickname', {str}),
+                'thumbnail': ('room_src', {url_or_none}),
+            })
         }
 
 
-class DouyuShowIE(InfoExtractor):
+class DouyuShowIE(DouyuBaseIE):
     _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
 
     _TESTS = [{
-        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
-        'md5': '0c2cfd068ee2afe657801269b2d86214',
+        'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
         'info_dict': {
-            'id': 'rjNBdvnVXNzvE2yw',
+            'id': 'mPyq7oVNe5Yv1gLY',
             'ext': 'mp4',
-            'title': '陈一发儿:砒霜 我有个室友系列04-01 22点场',
-            'duration': 7150.08,
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'uploader': '陈一发儿',
-            'uploader_id': 'XrZwYelr5wbK',
-            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
-            'upload_date': '20170402',
+            'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃',
+            'duration': 633,
+            'thumbnail': str,
+            'uploader': '美食作家王刚V',
+            'uploader_id': 'OVAO4NVx1m7Q',
+            'timestamp': 1661850002,
+            'upload_date': '20220830',
+            'view_count': int,
+            'tags': ['美食', '美食综合'],
         },
     }, {
         'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
         'only_matching': True,
     }]
 
+    _FORMATS = {
+        'super': '原画',
+        'high': '超清',
+        'normal': '高清',
+    }
+
+    _QUALITIES = {
+        'super': -1,
+        'high': -2,
+        'normal': -3,
+    }
+
+    _RESOLUTIONS = {
+        'super': '1920x1080',
+        'high': '1280x720',
+        'normal': '852x480',
+    }
+
     def _real_extract(self, url):
         url = url.replace('vmobile.', 'v.')
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
-        room_info = self._parse_json(self._search_regex(
-            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
-
-        video_info = None
-
-        for trial in range(5):
-            # Sometimes Douyu rejects our request. Let's try it more times
-            try:
-                video_info = self._download_json(
-                    'https://vmobile.douyu.com/video/getInfo', video_id,
-                    query={'vid': video_id},
-                    headers={
-                        'Referer': url,
-                        'x-requested-with': 'XMLHttpRequest',
-                    })
-                break
-            except ExtractorError:
-                self._sleep(1, video_id)
-
-        if not video_info:
-            raise ExtractorError('Can\'t fetch video info')
-
-        formats = self._extract_m3u8_formats(
-            video_info['data']['video_url'], video_id,
-            entry_protocol='m3u8_native', ext='mp4')
-
-        upload_date = unified_strdate(self._html_search_regex(
-            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
-            'upload date', fatal=False))
-
-        uploader = uploader_id = uploader_url = None
-        mobj = re.search(
-            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
-            webpage)
-        if mobj:
-            uploader_id, uploader = mobj.groups()
-            uploader_url = urljoin(url, '/author/' + uploader_id)
+        video_info = self._search_json(
+            r'<script>\s*window\.\$DATA\s*=', webpage,
+            'video info', video_id, transform_source=js_to_json)
+
+        js_sign_func = self._search_js_sign_func(webpage)
+        form_data = {
+            'vid': video_id,
+            **self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
+        }
+        url_info = self._download_json(
+            'https://v.douyu.com/api/stream/getStreamUrl', video_id,
+            data=urlencode_postdata(form_data), note="Downloading video formats")
+
+        formats = []
+        for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
+            video_url = traverse_obj(url, ('url', {url_or_none}))
+            if video_url:
+                ext = determine_ext(video_url)
+                formats.append({
+                    'format': self._FORMATS.get(name),
+                    'format_id': name,
+                    'url': video_url,
+                    'quality': self._QUALITIES.get(name),
+                    'ext': 'mp4' if ext == 'm3u8' else ext,
+                    'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
+                    **parse_resolution(self._RESOLUTIONS.get(name))
+                })
+            else:
+                self.to_screen(
+                    f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')
 
         return {
             'id': video_id,
-            'title': room_info['name'],
             'formats': formats,
-            'duration': room_info.get('duration'),
-            'thumbnail': room_info.get('pic'),
-            'upload_date': upload_date,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'uploader_url': uploader_url,
+            **traverse_obj(video_info, ('DATA', {
+                'title': ('content', 'title', {str}),
+                'uploader': ('content', 'author', {str}),
+                'uploader_id': ('content', 'up_id', {str_or_none}),
+                'duration': ('content', 'video_duration', {int_or_none}),
+                'thumbnail': ('content', 'video_pic', {url_or_none}),
+                'timestamp': ('content', 'create_time', {int_or_none}),
+                'view_count': ('content', 'view_num', {int_or_none}),
+                'tags': ('videoTag', ..., 'tagName', {str}),
+            }))
         }
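A note on the signing flow added above: _calc_sign executes the page's ub98484234() routine under PhantomJS, which prints a query string, and that string is parsed into the form fields POSTed to getH5Play. A minimal sketch of just that parsing step follows; the field names and values are hypothetical placeholders, since the real ones are produced by Douyu's obfuscated script.

    import urllib.parse

    # Hypothetical output of ub98484234() as printed by PhantomJS; the real
    # field names/values come from Douyu's obfuscated signing script.
    result = 'v=220120230526&did=0a1b2c3d4e5f60718293a4b5c6d7e8f9&tt=1695300000&sign=0123456789abcdef'

    # Same parsing as DouyuBaseIE._calc_sign, merged into the getH5Play POST form
    form_data = {'rate': 0, **{k: v[0] for k, v in urllib.parse.parse_qs(result).items()}}
    print(form_data['sign'])  # -> 0123456789abcdef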