mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-21 20:46:36 -05:00
parent
54bb39065c
commit
5e51f4a8ad
1 changed files with 30 additions and 40 deletions
|
@ -7,6 +7,7 @@
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
determine_ext,
|
determine_ext,
|
||||||
|
extract_attributes,
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
parse_qs,
|
parse_qs,
|
||||||
|
@ -177,49 +178,38 @@ def build_player_url(cls, video_id, integration, origin_url=None):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _extract_urls(cls, webpage, origin_url):
|
def _extract_urls(cls, webpage, origin_url):
|
||||||
VALID_SRC = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
|
|
||||||
|
|
||||||
# https://docs.glomex.com/publisher/video-player-integration/javascript-api/
|
# https://docs.glomex.com/publisher/video-player-integration/javascript-api/
|
||||||
EMBED_RE = r'''(?x)(?:
|
quot_re = r'["\']'
|
||||||
<iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
|
|
||||||
<(?P<html_tag>glomex-player|div)(?:
|
|
||||||
data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
|
|
||||||
data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
|
|
||||||
data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
|
|
||||||
[^>]*?
|
|
||||||
)+>|
|
|
||||||
# naive parsing of inline scripts for hard-coded integration parameters
|
|
||||||
<(?P<script_tag>script)[^<]*?>(?:
|
|
||||||
(?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
|
|
||||||
(?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
|
|
||||||
(?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
|
|
||||||
(?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
|
|
||||||
(?:\s|.)*?
|
|
||||||
)+</script>
|
|
||||||
)''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}
|
|
||||||
|
|
||||||
for mtup in re.findall(EMBED_RE, webpage):
|
regex = fr'''(?x)
|
||||||
# re.finditer causes a memory spike. See https://github.com/yt-dlp/yt-dlp/issues/2512
|
<iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
|
||||||
mdict = dict(zip((
|
(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
|
||||||
'url', '_',
|
)(?P=q)'''
|
||||||
'html_tag', '_', 'integration_html', '_', 'id_html', '_', 'glomex_player',
|
for mobj in re.finditer(regex, webpage):
|
||||||
'script_tag', '_', '_', 'integration_js', '_', 'id_js',
|
url = unescapeHTML(mobj.group('url'))
|
||||||
), mtup))
|
if cls.suitable(url):
|
||||||
if mdict.get('url'):
|
|
||||||
url = unescapeHTML(mdict['url'])
|
|
||||||
if not cls.suitable(url):
|
|
||||||
continue
|
|
||||||
yield cls._smuggle_origin_url(url, origin_url)
|
yield cls._smuggle_origin_url(url, origin_url)
|
||||||
elif mdict.get('html_tag'):
|
|
||||||
if mdict['html_tag'] == 'div' and not mdict.get('glomex_player'):
|
regex = fr'''(?x)
|
||||||
continue
|
<glomex-player [^>]+?>|
|
||||||
if not mdict.get('video_id_html') or not mdict.get('integration_html'):
|
<div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
|
||||||
continue
|
for mobj in re.finditer(regex, webpage):
|
||||||
yield cls.build_player_url(mdict['video_id_html'], mdict['integration_html'], origin_url)
|
attrs = extract_attributes(mobj.group(0))
|
||||||
elif mdict.get('script_tag'):
|
if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
|
||||||
if not mdict.get('video_id_js') or not mdict.get('integration_js'):
|
yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)
|
||||||
continue
|
|
||||||
yield cls.build_player_url(mdict['video_id_js'], mdict['integration_js'], origin_url)
|
# naive parsing of inline scripts for hard-coded integration parameters
|
||||||
|
regex = fr'''(?x)
|
||||||
|
(?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
|
||||||
|
(?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
|
||||||
|
for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
|
||||||
|
script = mobj.group(0)
|
||||||
|
integration_id = re.search(regex % 'integrationId', script)
|
||||||
|
if not integration_id:
|
||||||
|
continue
|
||||||
|
playlist_id = re.search(regex % 'playlistId', script)
|
||||||
|
if playlist_id:
|
||||||
|
yield cls.build_player_url(playlist_id, integration_id, origin_url)
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
url, origin_url = self._unsmuggle_origin_url(url)
|
url, origin_url = self._unsmuggle_origin_url(url)
|
||||||
|
|
Loading…
Reference in a new issue