[bandcamp] Extract more metadata (closes #13197)

This commit is contained in:
Sergey M․ 2018-08-31 03:35:55 +07:00
parent 14b7a24c19
commit 4991e16c2a
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -1,6 +1,5 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import random import random
import re import re
import time import time
@ -16,15 +15,18 @@
int_or_none, int_or_none,
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
parse_filesize, parse_filesize,
str_or_none,
try_get,
unescapeHTML, unescapeHTML,
update_url_query, update_url_query,
unified_strdate, unified_strdate,
unified_timestamp,
url_or_none, url_or_none,
) )
class BandcampIE(InfoExtractor): class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439', 'md5': 'c557841d5e50261777a6585648adf439',
@ -36,24 +38,44 @@ class BandcampIE(InfoExtractor):
}, },
'_skip': 'There is a limit of 200 free downloads / month for the test song' '_skip': 'There is a limit of 200 free downloads / month for the test song'
}, { }, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '0369ace6b939f0927e62c67a1a8d9fa7', 'md5': '853e35bf34aa1d6fe2615ae612564b36',
'info_dict': { 'info_dict': {
'id': '2650410135', 'id': '2650410135',
'ext': 'aiff', 'ext': 'aiff',
'title': 'Ben Prunty - Lanius (Battle)', 'title': 'Ben Prunty - Lanius (Battle)',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty', 'uploader': 'Ben Prunty',
'timestamp': 1396508491,
'upload_date': '20140403',
'release_date': '20140403',
'duration': 260.877,
'track': 'Lanius (Battle)',
'track_number': 1,
'track_id': '2650410135',
'artist': 'Ben Prunty',
'album': 'FTL: Advanced Edition Soundtrack',
}, },
}, { }, {
# no free download, mp3 128
'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
'info_dict': { 'info_dict': {
'id': '2584466013', 'id': '2584466013',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Hail to Fire', 'title': 'Mastodon - Hail to Fire',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Mastodon',
'timestamp': 1322005399,
'upload_date': '20111122',
'release_date': '20040207',
'duration': 120.79,
'track': 'Hail to Fire',
'track_number': 5, 'track_number': 5,
}, 'track_id': '2584466013',
'params': { 'artist': 'Mastodon',
'skip_download': True, 'album': 'Call of the Mastodon',
}, },
}] }]
@ -62,19 +84,23 @@ def _real_extract(self, url):
title = mobj.group('title') title = mobj.group('title')
webpage = self._download_webpage(url, title) webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None) thumbnail = self._html_search_meta('og:image', webpage, default=None)
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if not m_download:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
if m_trackinfo:
json_code = m_trackinfo.group(1)
data = json.loads(json_code)[0]
track_id = compat_str(data['id'])
if not data.get('file'): track_id = None
raise ExtractorError('Not streamable', video_id=track_id, expected=True) track = None
track_number = None
duration = None
formats = [] formats = []
for format_id, format_url in data['file'].items(): track_info = self._parse_json(
self._search_regex(
r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
webpage, 'track info', default='{}'), title)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1) ext, abr_str = format_id.split('-', 1)
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -84,86 +110,110 @@ def _real_extract(self, url):
'acodec': ext, 'acodec': ext,
'abr': int_or_none(abr_str), 'abr': int_or_none(abr_str),
}) })
track = track_info.get('title')
track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
self._sort_formats(formats) def extract(key):
return self._search_regex(
r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
webpage, key, default=None, group='value')
return { artist = extract('artist')
'id': track_id, album = extract('album_title')
'title': data['title'], timestamp = unified_timestamp(
'thumbnail': thumbnail, extract('publish_date') or extract('album_publish_date'))
'formats': formats, release_date = unified_strdate(extract('album_release_date'))
'duration': float_or_none(data.get('duration')),
'track_number': int_or_none(data.get('track_num')),
}
else:
raise ExtractorError('No free songs found')
download_link = m_download.group(1) download_link = self._search_regex(
video_id = self._search_regex( r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', 'download link', default=None, group='url')
webpage, 'video id') if download_link:
track_id = self._search_regex(
r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
webpage, 'track id')
download_webpage = self._download_webpage( download_webpage = self._download_webpage(
download_link, video_id, 'Downloading free downloads page') download_link, track_id, 'Downloading free downloads page')
blob = self._parse_json( blob = self._parse_json(
self._search_regex( self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'), 'blob', group='blob'),
video_id, transform_source=unescapeHTML) track_id, transform_source=unescapeHTML)
info = blob['digital_items'][0] info = try_get(
blob, (lambda x: x['digital_items'][0],
lambda x: x['download_items'][0]), dict)
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
thumbnail = info.get('thumb_url')
downloads = info['downloads'] download_formats = {}
track = info['title'] download_formats_list = blob.get('download_formats')
if isinstance(download_formats_list, list):
for f in blob['download_formats']:
name, ext = f.get('name'), f.get('file_extension')
if all(isinstance(x, compat_str) for x in (name, ext)):
download_formats[name] = ext.strip('.')
artist = info.get('artist') for format_id, f in downloads.items():
title = '%s - %s' % (artist, track) if artist else track format_url = f.get('url')
if not format_url:
continue
# Stat URL generation algorithm is reverse engineered from
# download_*_bundle_*.js
stat_url = update_url_query(
format_url.replace('/download/', '/statdownload/'), {
'.rand': int(time.time() * 1000 * random.random()),
})
format_id = f.get('encoding_name') or format_id
stat = self._download_json(
stat_url, track_id, 'Downloading %s JSON' % format_id,
transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
fatal=False)
if not stat:
continue
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
})
download_formats = {}
for f in blob['download_formats']:
name, ext = f.get('name'), f.get('file_extension')
if all(isinstance(x, compat_str) for x in (name, ext)):
download_formats[name] = ext.strip('.')
formats = []
for format_id, f in downloads.items():
format_url = f.get('url')
if not format_url:
continue
# Stat URL generation algorithm is reverse engineered from
# download_*_bundle_*.js
stat_url = update_url_query(
format_url.replace('/download/', '/statdownload/'), {
'.rand': int(time.time() * 1000 * random.random()),
})
format_id = f.get('encoding_name') or format_id
stat = self._download_json(
stat_url, video_id, 'Downloading %s JSON' % format_id,
transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
fatal=False)
if not stat:
continue
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
})
self._sort_formats(formats) self._sort_formats(formats)
title = '%s - %s' % (artist, track) if artist else track
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return { return {
'id': video_id, 'id': track_id,
'title': title, 'title': title,
'thumbnail': info.get('thumb_url') or thumbnail, 'thumbnail': thumbnail,
'uploader': info.get('artist'), 'uploader': artist,
'artist': artist, 'timestamp': timestamp,
'release_date': release_date,
'duration': duration,
'track': track, 'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': album,
'formats': formats, 'formats': formats,
} }