yt-dlp/yt_dlp/extractor/promodj.py
2024-02-14 16:23:17 +03:00

634 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import datetime
import functools
import re
import urllib.parse
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
OnDemandPagedList,
clean_html,
dict_get,
extract_attributes,
ExtractorError,
float_or_none,
get_element_by_class,
get_elements_html_by_class,
int_or_none,
js_to_json,
merge_dicts,
parse_duration,
str_or_none,
traverse_obj,
urlencode_postdata,
url_or_none,
)
# promodj.com
# Playlist types:
# /:login/:media_type - default
# /:login/groups/:id/:slug - user defined (groups). Can contain audios and/or videos
# By default, a single media item is attached to the default playlist,
# but it can be reattached to a user playlist (group), after which it no longer appears in the default one
# User pages
# /:login - all non-empty playlists
# /:login/music - all non-empty playlists with at least one audio (shows 10 audios per playlist max)
# /:login/video - all non-empty playlists with at least one video (shows 10 videos per playlist max)
# /:login/pages - a list of user pages
# /:login/:page_name - a single user page
# /:login/blog - a list of blog posts
# /:login/blog/:id/:slug - a single blog post
# If default playlist is empty, it redirects to the user's page
# Pages and blog posts can contain: audios, videos, youtube videos
# Tracks and remixes can be paid. See /shop page
class PromoDJBaseIE(InfoExtractor):
    """Shared URL patterns and helpers for the promodj.com extractors."""

    # Media sections of the site; see the playlist notes at the top of the file
    _MEDIA_TYPES = [
        'tracks',
        'remixes',
        'mixes',
        'promos',
        'lives',
        'podcasts',
        'radioshows',
        'tools',
        'realtones',  # doesn't appear on the site menu but still exists
        'acapellas',  # redirects to /tools, creates default playlist
        'samples',  # redirects to /tools, doesn't create default playlist
        'videos',
    ]
    _PAGES = ['featured', 'shop', *_MEDIA_TYPES]

    _BASE_URL_RE = r'https?://(?:www\.)?promodj\.com'
    _MEDIA_TYPES_RE = '|'.join(_MEDIA_TYPES)
    _NOT_PAGE_RE = '|'.join(['radio', *_PAGES])
    # NOTE(review): the negative lookahead is only applied at the first character
    # and has no trailing boundary, so logins that merely start with a page name
    # are rejected as well - confirm this is intended
    _LOGIN_RE = rf'(?:(?!{_NOT_PAGE_RE}).)[\w.-]+'

    def _set_url_page(self, url, page):
        """Return `url` with its `page` query parameter set to `page`."""
        parsed_url = urllib.parse.urlparse(url)
        qs = urllib.parse.parse_qs(parsed_url.query)
        qs['page'] = page
        return parsed_url._replace(query=urllib.parse.urlencode(qs, doseq=True)).geturl()

    def _fetch_page(self, url, allowed_media_cats, playlist_id, page):
        """
        Yield url results for the media listed on one page of a listing.

        @param url                 listing URL (any existing `page` parameter is replaced)
        @param allowed_media_cats  collection containing 'music' and/or 'video'
        @param playlist_id         used only to label the download in the log
        @param page                zero-based page index; the site is queried with page + 1
        """
        page_number = page + 1
        html = self._download_webpage(
            self._set_url_page(url, page_number), f'{playlist_id}-page-{page_number}')
        # The page reports a different current page than requested;
        # presumably the requested page does not exist, so stop here
        if self._get_current_page(html) != page_number:
            return

        for a in get_elements_html_by_class('player_standard_tool__play', html):
            media_url = traverse_obj(extract_attributes(a), ('href', {url_or_none}))
            if not media_url:
                continue
            media_url = media_url.replace('?play=1', '')
            media_cat = 'video' if '/videos/' in media_url else 'music'
            if media_cat in allowed_media_cats:
                yield self.url_result(media_url, PromoDJIE)

    def _parse_playlist_links(self, html):
        """Return the playlist URLs linked from a user page, default video playlist last."""
        PLAYLISTS_RE = r'<a class=\"files_group_title\" href=\"([^\"]+)\">'
        DEFAULT_VIDEO_PLAYLIST_RE = r'<h5><a href=\"https://promodj\.com/([\w.-]+)/video\">Видео</a></h5>'

        playlist_links = re.findall(PLAYLISTS_RE, html)

        login = self._search_regex(
            DEFAULT_VIDEO_PLAYLIST_RE, html, 'video playlist url', default=None)
        if login:
            playlist_links.append(f'https://promodj.com/{login}/videos')

        return playlist_links

    def _parse_page_content(self, html):
        """
        Yield url results for the players and YouTube iframes embedded in `html`.

        Yields nothing when `html` is empty or None (e.g. the content container
        was not found by the caller) instead of failing inside `re`.
        """
        if not html:
            return

        for media_id in re.findall(r'CORE\.Player\(\'[^\']+\', \'(?:standalone|cover)\.big\', (\d+),', html):
            yield self.url_result(f'https://promodj.com/embed/{media_id}/big', PromoDJEmbedIE, media_id)

        for iframe_url in re.findall(r'<iframe[^>]+src=\"([^\"]+)\"', html):
            if YoutubeIE.suitable(iframe_url):
                yield self.url_result(iframe_url, YoutubeIE)

    def _get_playlist_page_size(self, url):
        """Return the page size: 30 for default playlists, 20 for user groups."""
        is_default_playlist = '/groups/' not in url
        return 30 if is_default_playlist else 20

    def _get_current_page(self, html):
        """Return the page number shown by the navigator, or 1 when there is none."""
        return int(clean_html(get_element_by_class('NavigatorCurrentPage', html)) or '1')

    def _fetch_media_data(self, ids, video_id):
        """Request the 'standalone.big' player config for every id in `ids` in one API call."""
        data = {}
        for i, media_id in enumerate(ids):
            data[f'multi[{i}][method]'] = 'players/config'
            data[f'multi[{i}][params][kind]'] = 'standalone.big'
            data[f'multi[{i}][params][fileID]'] = media_id
        return self._download_json(
            'https://promodj.com/api/multi.json', video_id, data=urlencode_postdata(data),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

    def _parse_media_data(self, media_data, id):
        """
        Convert a player config dict into an info dict.

        @param media_data  player config (from the API or embedded in a media page)
        @param id          media id, stored as the 'id' of the result
        @raises ExtractorError  (expected) when the config carries a 'player_error'
        """
        if player_error := media_data.get('player_error'):
            raise ExtractorError(player_error, expected=True)

        if media_data.get('video'):
            # Video configs embed a second JSON document under 'config'
            video = traverse_obj(
                self._parse_json(media_data['config'], id), ('playlist', 'item', 0))
            formats = [{
                'url': traverse_obj(video, ('play', '@url', {url_or_none})),
                **traverse_obj(media_data, {
                    'width': ('width', {int_or_none}),
                    'height': ('height', {int_or_none}),
                }),
            }]
            return {
                'id': id,
                'formats': formats,
                **traverse_obj(video, {
                    'title': ('title', 'line', 1, 0, '$', {str_or_none}),
                    'webpage_url': ('title', '@ico_url', {url_or_none}),
                    'duration': ('play', '@duration', {int_or_none}),
                    'thumbnail': ('background', '@url', {url_or_none}),
                    'channel': ('title', 'line', 0, 0, '$', {str_or_none}),
                    'channel_url': ('title', 'line', 0, 0, '@url', {url_or_none}),
                }),
            }

        # NOTE(review): 'size' is not a standard yt-dlp format field; if the API
        # reports bytes this should probably be 'filesize' - confirm the unit
        # A missing or empty 'sources' yields an empty format list instead of a TypeError
        formats = [traverse_obj(source, {
            'url': ('URL', {url_or_none}),
            'size': ('size', {int_or_none}),
        }) for source in media_data.get('sources') or []]
        return {
            'id': id,
            'title': clean_html(dict_get(media_data, ('title_html', 'title'))),
            'formats': formats,
            'webpage_url': traverse_obj(media_data, ('titleURL', {url_or_none})),
        }
class PromoDJPageIE(PromoDJBaseIE):
    """Site-wide listing pages (featured, shop and the media sections)."""

    _PAGES_RE = '|'.join(PromoDJBaseIE._PAGES)
    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<id>{_PAGES_RE})'
    _TESTS = [{
        'url': 'https://promodj.com/featured',
        'only_matching': True,
    }, {
        # second page
        'url': 'https://promodj.com/featured/rap?download=1&page=2',
        'only_matching': True,
    }, {
        # filtered
        'url': 'https://promodj.com/remixes?top=1',
        'only_matching': True,
    }, {
        # with genre
        'url': 'https://promodj.com/tracks/hip_hop',
        'only_matching': True,
    }, {
        # with search
        'url': 'https://promodj.com/mixes?kind=mixes&styleID=&searchfor=dance',
        'only_matching': True,
    }, {
        # no download button
        'url': 'https://promodj.com/shop',
        'only_matching': True,
    }]

    _PAGE_SIZE = 20

    def _real_extract(self, url):
        # The matched page name doubles as the playlist id
        page_type = self._match_id(url)
        fetch_page = functools.partial(
            self._fetch_page, url, ['music', 'video'], page_type)
        entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)
        return self.playlist_result(entries, playlist_id=page_type)
class PromoDJUserIE(PromoDJBaseIE):
    """A user's main page: a playlist of the playlists linked from it."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})$'
    _TESTS = [{
        'url': 'https://promodj.com/djperetse',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/dj-trojan',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        login = self._match_valid_url(url).group('login')
        html = self._download_webpage(url, login)

        # Each linked playlist is delegated to the playlist extractor
        entries = (
            self.url_result(playlist_url, PromoDJPlaylistIE)
            for playlist_url in self._parse_playlist_links(html))
        return self.playlist_result(entries, playlist_id=login)
class PromoDJUserMediaIE(PromoDJBaseIE):
    """A user's /music or /video page: playlists restricted to one media category."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>music|video)$'
    _TESTS = [{
        'url': 'https://promodj.com/feel/music',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/djmikis/video',
        'only_matching': True,
    }, {
        # a user without any videos
        'url': 'https://promodj.com/worobyev/video',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        login, media_cat = self._match_valid_url(url).groups()
        page_id = f'{login}-{media_cat}'
        html = self._download_webpage(url, page_id)

        # The category is fixed by the URL, so the target extractor is chosen once
        if media_cat == 'music':
            playlist_ie = PromoDJMusicPlaylistIE
        else:
            playlist_ie = PromoDJVideoPlaylistIE

        entries = (
            self.url_result(playlist_url, playlist_ie)
            for playlist_url in self._parse_playlist_links(html))
        return self.playlist_result(entries, playlist_id=page_id)
class PromoDJUserPagesIE(PromoDJBaseIE):
    """A user's list of pages (/pages) or paginated list of blog posts (/blog)."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>pages|blog)$'
    _TESTS = [{
        'url': 'https://promodj.com/djperetse/pages',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/golub/blog',
        'only_matching': True,
    }]

    _PAGE_SIZE = 10

    def _parse_pages(self, url, playlist_id):
        """Yield a url result for every page linked from the user's pages list."""
        html = self._download_webpage(url, playlist_id)
        # Either container may be missing; in that case there is nothing to
        # yield, rather than passing None into the HTML helpers / re
        block_html = get_element_by_class('dj_bblock', html)
        content_html = get_element_by_class('dj_universal', block_html) if block_html else None
        if not content_html:
            return

        for page_url, page_title in re.findall(r'<a href=\"([^\"]+)\">([^<]+)</a>', content_html):
            yield self.url_result(page_url, PromoDJUserPageIE, video_title=page_title)

    def _fetch_blogs_page(self, url, playlist_id, page):
        """Yield url results for the posts on one (zero-based) page of the blog."""
        page_number = page + 1
        html = self._download_webpage(
            self._set_url_page(url, page_number), f'{playlist_id}-page-{page_number}')
        # The page reports a different current page than requested;
        # presumably the requested page does not exist, so stop here
        if self._get_current_page(html) != page_number:
            return

        for a in get_elements_html_by_class('post_title_moderated', html):
            if post_url := traverse_obj(extract_attributes(a), ('href', {url_or_none})):
                yield self.url_result(post_url, PromoDJBlogPageIE)

    def _real_extract(self, url):
        login, list_type = self._match_valid_url(url).groups()
        playlist_id = f'{login}-{list_type}'

        if list_type == 'pages':
            entries = self._parse_pages(url, playlist_id)
        else:
            # _VALID_URL only admits 'pages' or 'blog'
            entries = OnDemandPagedList(
                functools.partial(self._fetch_blogs_page, url, playlist_id),
                self._PAGE_SIZE)

        return self.playlist_result(entries, playlist_id)
class PromoDJUserPageIE(PromoDJBaseIE):
    """A single custom user page; its embedded media form the playlist."""

    # Reserved second-level paths that are not custom user pages
    _USER_PAGES = [
        'pages',
        'music',
        'video',
        'foto',
        'avisha',
        'blog',
        'feedback',
        'contact',
        *PromoDJBaseIE._MEDIA_TYPES,
    ]
    _NOT_USER_PAGE_RE = '|'.join(_USER_PAGES)
    _USER_PAGE_RE = rf'(?:(?!{_NOT_USER_PAGE_RE}).)[\w-]+'

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<slug>{_USER_PAGE_RE})$'
    _TESTS = [{
        'url': 'https://promodj.com/djperetse/MaxMixes',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Playlist id is "<login>-<slug>"; both groups are mandatory in _VALID_URL
        page_id = '-'.join(self._match_valid_url(url).group('login', 'slug'))
        webpage = self._download_webpage(url, page_id)
        entries = self._parse_page_content(get_element_by_class('perfect', webpage))
        return self.playlist_result(entries, playlist_id=page_id)
class PromoDJBlogPageIE(PromoDJBaseIE):
    """A single blog post; its embedded media form the playlist."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/blog/(?P<id>\d+)(?:/(?P<slug>\w+))?'
    _TESTS = [{
        # with small and big audio players and youtube video
        'url': 'https://promodj.com/golub/blog/1163895/DJ_Andrey_Golubev_To_Depeche_Mode_with_love_part_9_special_dj_edits_mix',
        'only_matching': True,
    }, {
        # with audio and video
        'url': 'https://promodj.com/svetmusic/blog/915878/DJ_SVET_pobeditel_konkursa_Burn_City_Sound',
        'only_matching': True,
    }, {
        # without any media
        'url': 'https://promodj.com/svetmusic/blog/1101958/SVET_I_Like_It_Extra_Sound_Recordings',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        login, post_id, slug = self._match_valid_url(url).groups()
        # The slug is optional in _VALID_URL; leave it out of the id when absent
        # instead of producing "<login>-blog-<id>-None"
        page_id = '-'.join(filter(None, (login, 'blog', post_id, slug)))
        html = self._download_webpage(url, page_id)
        content_html = get_element_by_class('post_body', html)

        return self.playlist_result(
            self._parse_page_content(content_html), playlist_id=page_id)
class PromoDJPlaylistIE(PromoDJBaseIE):
    """A user's default playlist (/:login/:media_type) or user-defined group."""

    _VALID_URL = [
        rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>{PromoDJBaseIE._MEDIA_TYPES_RE})$',
        rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>groups)/(?P<id>\d+)(?:/(?P<slug>\w+))?',
    ]
    _TESTS = [{
        # default playlist: tracks (audio)
        'url': 'https://promodj.com/gluk/tracks',
        'only_matching': True,
    }, {
        # default playlist: video
        'url': 'https://promodj.com/djperetse/videos',
        'only_matching': True,
    }, {
        # user playlist: audio
        'url': 'https://promodj.com/fonarev/groups/608158/Digital_Emotions_Night',
        'only_matching': True,
    }, {
        # two pages
        'url': 'https://promodj.com/lavrov/groups/677132/VINYL',
        'only_matching': True,
    }, {
        # user playlist: video
        'url': 'https://promodj.com/deeplecture/groups/672782/LAROCCA_TV',
        'only_matching': True,
    }, {
        # user playlist: audio and video
        'url': 'https://promodj.com/djperetse/groups/637358/Russkie_treki',
        'only_matching': True,
    }, {
        # 900+ items
        'url': 'https://promodj.com/fonarev/groups/17350/Digital_Emotions_Podcast',
        'only_matching': True,
    }]

    # Media categories kept by _fetch_page; narrowed by the subclasses
    _ALLOWED_MEDIA_CATS = ['music', 'video']

    def _real_extract(self, url):
        match = self._match_valid_url(url)
        login = match.group('login')
        playlist_type = match.group('type')

        # Two groups means the default-playlist pattern matched;
        # the group pattern additionally carries an id
        if len(match.groups()) == 2:
            playlist_id = f'{login}-{playlist_type}'
        else:
            playlist_id = f'{login}-{playlist_type}-{match.group("id")}'

        fetch_page = functools.partial(
            self._fetch_page, url, self._ALLOWED_MEDIA_CATS, playlist_id)
        entries = OnDemandPagedList(fetch_page, self._get_playlist_page_size(url))
        return self.playlist_result(entries, playlist_id=playlist_id)
class PromoDJMusicPlaylistIE(PromoDJPlaylistIE):
    """Playlist extractor that keeps only the audio entries."""

    _ALLOWED_MEDIA_CATS = ['music']
class PromoDJVideoPlaylistIE(PromoDJPlaylistIE):
    """Playlist extractor that keeps only the video entries."""

    _ALLOWED_MEDIA_CATS = ['video']
class PromoDJIE(PromoDJBaseIE):
    """A single audio or video page (/:login/:media_type/:id[/:slug])."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/{PromoDJBaseIE._LOGIN_RE}/(?P<type>{PromoDJBaseIE._MEDIA_TYPES_RE})/(?P<id>\d+)(?:/\w+)?'
    _TESTS = [{
        'url': 'https://promodj.com/antonpavlovsky/remixes/6259208/David_Usher_Black_Black_Heart_Anton_Pavlovsky_Cover',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/j-factory/samples/7560171/Amedici_BW1_Intro',
        'only_matching': True,
    }, {
        # music: no download links in html
        'url': 'https://promodj.com/gluk/tracks/4713922/DJ_Glyuk_Folk_ing_DJ_Steven_Smile_Remix_2005',
        'only_matching': True,
    }, {
        # video: no download link in html
        'url': 'https://promodj.com/psywanderer/videos/7559147/Chu_de_sa',
        'only_matching': True,
    }, {
        # no player
        'url': 'https://promodj.com/gluk/tracks/420310/IMpulse_Zakat',
        'only_matching': True,
    }, {
        # without slug
        'url': 'https://promodj.com/djlykov/tracks/7551590',
        'only_matching': True,
    }, {
        # lossless wav
        'url': 'https://promodj.com/modi-glu/tracks/6081339/Modi_Glyu_Anabel',
        'only_matching': True,
    }, {
        # lossless flac
        'url': 'https://promodj.com/sashaorbeat/mixes/7422493/Sasha_Orbeat_Pure_Love_3',
        'only_matching': True,
    }, {
        # paid lossless
        'url': 'https://promodj.com/boyko/tracks/1435682/Dj_Boyko_Katy_Queen_Nad_Oblakami',
        'only_matching': True,
    }, {
        # paid lossy
        'url': 'https://promodj.com/tesla/tracks/342938/Library_Of_Bugs',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/sergeyfedotov306/videos/7457627/V_Matrice_Sboy',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/djperetse/videos/5868236/Fatalist_Project_feat_DJ_Peretse_Den_pobedi_Videoklip',
        'only_matching': True,
    }, {
        # avi
        'url': 'https://promodj.com/djmikis/videos/5311597/Mikis_Live_SDJ_Show',
        'only_matching': True,
    }, {
        # asf
        'url': 'https://promodj.com/gigsiphonic/videos/7559341/Gigsiphonic_PODCAST_309_Extended_video_version',
        'only_matching': True,
    }, {
        # not valid html
        'url': 'https://promodj.com/martin.sehnal/videos/7555841/Martin_Sehnal_CII_33_Plus_CII_32_Clothes_on_the_peg_2_020_2_024_02_01th',
        'only_matching': True,
    }]

    _IS_PAID_RE = r'<b>Цена:</b>'
    # examples: MP3, 320 Кбит | MP4, 20157 Кбит | WAV, 1412 Кбит | AVI, 1731 Кбит | ASF, 6905 Кбит | FLAC, 1509 Кбит
    # https://regex101.com/r/2AuaxB/1
    _FORMATS_RE = r'(?:<a\s+href=\"(?P<url>[^\"]+)\">)?\s*(?P<format>\w+), (?P<bitrate>\d+) Кбит'
    _VIEW_COUNT_RE = r'<b>(?:Прослушиваний|Просмотров):</b>\s*(\d+)'
    # examples: 0:21 | 1:07 | 74:38
    _DURATION_RE = r'<b>Продолжительность:</b>\s*(\d+:\d{2})'
    # examples: 818.4 Кб | 12.9 Мб | 4 Гб | 1.76 Гб | 1001.5 Мб
    _SIZE_RE = r'<b>Размер:</b>\s*(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>Кб|Мб|Гб)'
    # examples: сегодня 2:55 | вчера 23:17 | 1 июня 2016 3:46
    _TIMESTAMP_RE = r'<b>Публикация:</b>\s*(?P<day>вчера|сегодня|\d{1,2})(?: (?P<month>[а-я]+) (?P<year>\d{4}))?\s*(?P<hours>\d{1,2}):(?P<minutes>\d{2})'
    _TAGS_RE = r'<span\s+class=\"styles\">([^\n]+)</span>'
    # https://regex101.com/r/2ZkUmW/1
    _MUSIC_DATA_REGEX = r'({\"no_preroll\":false,\"seekAny\":true,\"sources\":[^\n]+)\);'
    # https://regex101.com/r/b9utBf/1
    _VIDEO_DATA_REGEX = r'({\"video\":true,\"config\":[^\n]+)\);'

    def _parse_ru_date(self, raw_date):
        """
        Convert the groups of _TIMESTAMP_RE into a timestamp.

        @param raw_date  (day, month, year, hours, minutes); day is a number,
                         'сегодня' (today) or 'вчера' (yesterday); month and
                         year are ignored for the relative forms
        @returns         float timestamp, or None if the date cannot be parsed

        NOTE(review): relative days and the final conversion use the local
        timezone of the machine - confirm which timezone the site displays.
        """
        RU_MONTHS = ['января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря']
        RELATIVE_DAYS = {'сегодня': 0, 'вчера': 1}

        day, month, year, hours, minutes = raw_date
        try:
            if day in RELATIVE_DAYS:
                date = datetime.date.today() - datetime.timedelta(days=RELATIVE_DAYS[day])
            elif month in RU_MONTHS and year:
                date = datetime.date(int(year), RU_MONTHS.index(month) + 1, int(day))
            else:
                # numeric day without a recognizable month/year
                return None
            return datetime.datetime(
                date.year, date.month, date.day, int(hours), int(minutes)).timestamp()
        except ValueError:
            # out-of-range day/hour/minute
            return None

    def _parse_ru_size(self, raw_size):
        """Convert a (number, unit) pair such as ('12.9', 'Мб') into bytes."""
        RU_SIZE_UNITS = ['Б', 'Кб', 'Мб', 'Гб']
        size, size_unit = raw_size
        return int(float(size) * 1024 ** RU_SIZE_UNITS.index(size_unit))

    def _parse_media(self, html, id, type):
        """
        Build the info dict for a media page.

        The player config embedded in the page provides the base format
        (videos always have one; for audio with two formats it is the lossy
        one). The metadata block of the page adds bitrates, an optional
        better-quality download link and the remaining fields. Every field
        read from the metadata block is optional.

        @param html  media page HTML
        @param id    media id
        @param type  media type from the URL ('videos' selects the video config)
        """
        media_data = self._search_json(
            '', html, 'media data', id,
            contains_pattern=self._VIDEO_DATA_REGEX if type == 'videos' else self._MUSIC_DATA_REGEX,
            transform_source=js_to_json)
        metadata = self._parse_media_data(media_data, id)
        formats = metadata['formats']

        # html can be invalid, so any failure of the HTML helper deliberately
        # falls back to searching the whole page
        try:
            meta_html = get_elements_html_by_class('dj_universal', html)[1]
        except Exception:
            meta_html = html

        # returns one or two formats but sometimes without download links
        # best quality always comes first
        formats_from_html = re.findall(self._FORMATS_RE, meta_html)
        is_paid = re.search(self._IS_PAID_RE, meta_html)
        bitrate_key = 'tbr' if type == 'videos' else 'abr'

        # The last format listed in the page corresponds to the player format
        player_format = formats[0] if formats else None
        # Format described by the page's size field, if we actually have it
        best_format = player_format if not formats_from_html else None
        last_index = len(formats_from_html) - 1
        for i, (format_url, _, bitrate) in enumerate(formats_from_html):
            current = None
            if i == last_index:
                if player_format is not None:
                    player_format[bitrate_key] = int(bitrate)
                    current = player_format
            elif url_or_none(format_url) and not is_paid:
                current = {
                    'url': format_url,
                    bitrate_key: int(bitrate),
                }
                formats.append(current)
            if i == 0:
                best_format = current

        # size field describes best quality; it is a rounded human-readable
        # value, hence 'filesize_approx' (a field yt-dlp understands)
        size_match = re.search(self._SIZE_RE, meta_html)
        if size_match and best_format is not None:
            best_format['filesize_approx'] = self._parse_ru_size(size_match.groups())

        timestamp_match = re.search(self._TIMESTAMP_RE, meta_html)
        tags = self._html_search_regex(self._TAGS_RE, meta_html, 'tags', default=None)

        return merge_dicts(metadata, {
            'title': clean_html(get_element_by_class('file_title', html)),
            'view_count': int_or_none(self._search_regex(self._VIEW_COUNT_RE, meta_html, 'view_count', default=None)),
            'duration': parse_duration(self._search_regex(self._DURATION_RE, meta_html, 'duration', default=None)),
            'timestamp': self._parse_ru_date(timestamp_match.groups()) if timestamp_match else None,
            'tags': tags.split(', ') if tags else None,
        })

    def _real_extract(self, url):
        media_type, media_id = self._match_valid_url(url).groups()
        html = self._download_webpage(url, media_id)
        return self._parse_media(html, media_id, media_type)
class PromoDJEmbedIE(PromoDJBaseIE):
    """Embedded player URLs; resolved to the media page via the player config API."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/embed/(?P<id>\d+)/(?P<type>cover|big)'
    _TESTS = [{
        'url': 'https://promodj.com/embed/7555440/cover',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/embed/7540163/big',
        'only_matching': True,
    }, {
        # video (can be only big)
        'url': 'https://promodj.com/embed/3922099/big',
        'only_matching': True,
    }, {
        # blocked
        'url': 'https://promodj.com/embed/5586967/big',
        'only_matching': True,
    }, {
        # deleted
        'url': 'https://promodj.com/embed/5606804/big',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        media_id = self._match_id(url)
        # One id is requested, so the first entry of the response is ours
        media_data = self._fetch_media_data([media_id], media_id)[0]
        metadata = self._parse_media_data(media_data, media_id)
        return self.url_result(metadata['webpage_url'], PromoDJIE, media_id)
class PromoDJShortIE(PromoDJBaseIE):
    """pdj.cc short links; resolved to the media page via the page's og:url."""

    # The dot is escaped so that only the pdj.cc host matches
    _VALID_URL = r'https?://pdj\.cc/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://pdj.cc/fv8VD',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        short_id = self._match_id(url)
        html = self._download_webpage(url, short_id)
        try:
            full_url = self._og_search_url(html)
        except ExtractorError as e:
            # Keep the original failure attached instead of discarding it
            raise ExtractorError('Unable to extract full URL') from e
        return self.url_result(full_url, PromoDJIE, short_id)
class PromoDJRadioIE(PromoDJBaseIE):
    """Live radio channels, selected by the URL fragment (/radio#<channel>)."""

    _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/radio#(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://promodj.com/radio#dubstep',
        'only_matching': True,
    }, {
        'url': 'https://promodj.com/radio#oldschool',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        channel = self._match_id(url)
        return {
            'id': channel,
            # 'title' is a mandatory info dict field; the channel name is the
            # only descriptive value available without another request
            'title': channel,
            'formats': [{
                'url': f'https://radio.promodj.com/{channel}-192',
                'abr': 192,
            }],
            'is_live': True,
        }