From a0566bbf5c3dea282447efb2926d71bafe1b7720 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 21 Nov 2020 20:20:42 +0530 Subject: [PATCH] Updated to release 2020.11.21.1 --- test/parameters.json | 2 +- test/test_YoutubeDL.py | 70 +++++++++ youtube_dlc/YoutubeDL.py | 79 ++++++---- youtube_dlc/compat.py | 2 +- youtube_dlc/downloader/http.py | 4 +- youtube_dlc/extractor/amara.py | 103 +++++++++++++ youtube_dlc/extractor/brightcove.py | 8 +- youtube_dlc/extractor/common.py | 2 +- youtube_dlc/extractor/discoverynetworks.py | 5 +- youtube_dlc/extractor/europa.py | 4 +- youtube_dlc/extractor/extractors.py | 2 +- youtube_dlc/extractor/francetv.py | 2 +- youtube_dlc/extractor/generic.py | 2 +- youtube_dlc/extractor/googledrive.py | 58 +++---- youtube_dlc/extractor/infoq.py | 7 +- youtube_dlc/extractor/kusi.py | 4 +- youtube_dlc/extractor/npr.py | 2 +- youtube_dlc/extractor/pbs.py | 2 +- youtube_dlc/extractor/rai.py | 132 +++++----------- youtube_dlc/extractor/soundcloud.py | 2 +- youtube_dlc/extractor/svt.py | 48 ++++-- youtube_dlc/extractor/tagesschau.py | 2 +- youtube_dlc/extractor/theplatform.py | 2 +- youtube_dlc/extractor/turner.py | 6 +- youtube_dlc/extractor/viki.py | 171 ++++++++++++--------- youtube_dlc/extractor/vimeo.py | 4 +- youtube_dlc/extractor/xiami.py | 8 +- youtube_dlc/extractor/youtube.py | 170 +++++++++++--------- youtube_dlc/utils.py | 16 +- 29 files changed, 559 insertions(+), 360 deletions(-) create mode 100644 youtube_dlc/extractor/amara.py diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false, + "listsubtitles": false, "socket_timeout": 20, "fixup": "never" } diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6d02c2a54..a9e649191 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ def _real_extract(self, url): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + 
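        # entry 0 fails inside the extractor itself and entry 1 has no 'extra' format;
        # with ignoreerrors=True both are recorded as None instead of aborting the playlist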
self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 3c2970d9f..ef6fe0a78 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -830,34 +830,23 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in 'and will probably not work.') try: - try: - temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) - except (AssertionError, IndexError, AttributeError): - temp_id = None - if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen("[%s] %s: has already been recorded in archive" % ( - ie_key, temp_id)) - break + temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) + except (AssertionError, IndexError, AttributeError): + temp_id = None + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): + self.to_screen("[%s] %s: has already been recorded in archive" % ( + ie_key, temp_id)) + break - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - if info_dict: - if info_dict.get('id'): - ie_result['id'] = info_dict['id'] - if info_dict.get('title'): - ie_result['title'] = info_dict['title'] - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return self.__extract_info(url, ie, download, extra_info, process, info_dict) + + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -865,20 +854,38 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
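+                # the wrapper reports the error instead of propagating it, so one
+                # failing entry cannot abort extraction of the remaining entries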
self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process, info_dict): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + if info_dict: + if info_dict.get('id'): + ie_result['id'] = info_dict['id'] + if info_dict.get('title'): + ie_result['title'] = info_dict['title'] + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1057,9 +1064,8 @@ def report_download(num_entries): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1088,6 +1094,11 @@ def _fixup(r): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py index 1cf7efed6..ac889ddd7 100644 --- a/youtube_dlc/compat.py +++ b/youtube_dlc/compat.py @@ -2345,7 +2345,7 @@ def __init__(self, version, name, value, *args, **kwargs): # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling + # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass diff --git a/youtube_dlc/downloader/http.py b/youtube_dlc/downloader/http.py index 96379caf1..d8ac41dcc 100644 --- a/youtube_dlc/downloader/http.py +++ b/youtube_dlc/downloader/http.py @@ -109,7 +109,9 @@ def establish_connection(): try: ctx.data = self.ydl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. 
for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked diff --git a/youtube_dlc/extractor/amara.py b/youtube_dlc/extractor/amara.py new file mode 100644 index 000000000..61d469574 --- /dev/null +++ b/youtube_dlc/extractor/amara.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P\w+)' + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, + } + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] + + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) + + info = { + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': title, + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), + } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 638673c31..c6ca939dd 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor): ] @classmethod - def 
_build_brighcove_url(cls, object_str): + def _build_brightcove_url(cls, object_str): """ Build a Brightcove url from a xml string containing {params} @@ -217,7 +217,7 @@ def find_param(name): return cls._make_brightcove_url(params) @classmethod - def _build_brighcove_url_from_js(cls, object_js): + def _build_brightcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove XML @@ -272,12 +272,12 @@ def _extract_brightcove_urls(cls, webpage): ).+?>\s*''', webpage) if matches: - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) if matches: return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) + cls._build_brightcove_url_from_js(custom_bc) for custom_bc in matches])) return [src for _, src in re.findall( r']+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index f90cf36ed..2bc94acdd 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1664,7 +1664,7 @@ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, # just the media without qualities renditions. # Fortunately, master playlist can be easily distinguished from media # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. + # master playlist tags MUST NOT appear in a media playlist and vice versa. # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every # media playlist and MUST NOT appear in master playlist thus we can # clearly detect media playlist with this criterion. 
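A minimal sketch of the detection rule the comment above relies on, assuming only the RFC 8216 tag semantics it cites (the helper name is illustrative, not an InfoExtractor method):

    import re

    def is_media_playlist(m3u8_doc):
        # RFC 8216, 4.3.3.1: #EXT-X-TARGETDURATION is REQUIRED in every media
        # playlist and MUST NOT appear in a master playlist, so its presence
        # alone distinguishes the two document kinds.
        return re.search(r'^#EXT-X-TARGETDURATION\b', m3u8_doc, re.M) is not None

A master playlist (only #EXT-X-STREAM-INF variant entries) returns False here, which is what lets _parse_m3u8_formats treat it as a set of renditions rather than a single media playlist.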
diff --git a/youtube_dlc/extractor/discoverynetworks.py b/youtube_dlc/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/youtube_dlc/extractor/discoverynetworks.py +++ b/youtube_dlc/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P[^/]+)/video/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P[^/]+)/(?:video/)?(?P[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/europa.py b/youtube_dlc/extractor/europa.py index 1efc0b2ec..2c1c747a1 100644 --- a/youtube_dlc/extractor/europa.py +++ b/youtube_dlc/extractor/europa.py @@ -60,7 +60,7 @@ def get_item(type_, preference): title = get_item('title', preferred_langs) or video_id description = get_item('description', preferred_langs) - thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) @@ -85,7 +85,7 @@ def get_item(type_, preference): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnmail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index ee52492dc..15522f942 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -36,6 +36,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .alura import ( AluraIE, AluraCourseIE @@ -1507,7 +1508,6 @@ YoutubeIE, YoutubeFavouritesIE, YoutubeHistoryIE, - YoutubeLiveIE, YoutubeTabIE, YoutubePlaylistIE, YoutubeRecommendedIE, diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index dbedfc091..ab0df1bed 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -211,7 +211,7 @@ def sign(manifest_url, manifest_id): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index ce8cac5c1..db4d3a933 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -842,7 +842,7 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # MTVSercices embed + # MTVServices embed { 'url': 
'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', 'md5': 'ca1aef97695ef2c1d6973256a57e5252', diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py index ec0d58a57..fdb15795a 100644 --- a/youtube_dlc/extractor/googledrive.py +++ b/youtube_dlc/extractor/googledrive.py @@ -3,11 +3,13 @@ import re from .common import InfoExtractor +from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, int_or_none, lowercase_escape, + try_get, update_url_query, ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor): # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', - 'info_dict': { - 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', - 'ext': 'mp4', - 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', - } + 'only_matching': True, }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'info_dict': { - 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', - 'ext': 'mp4', - 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', - 'duration': 189, - }, 'only_matching': True, }, { 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) - title = self._search_regex( - r'"title"\s*,\s*"([^"]+)', webpage, 'title', - default=None) or self._og_search_title(webpage) - duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', - default=None)) + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) formats = [] - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map', default='').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, - 'fmt_list', default='').split(',') + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') if fmt_stream_map and fmt_list: resolutions = {} for fmt in fmt_list: @@ -257,19 +246,14 @@ def add_source_format(urlh): if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) - if not formats: - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason, expected=True) + if not formats and reason: + raise ExtractorError(reason, expected=True) self._sort_formats(formats) - hl = self._search_regex( - r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + hl = get_value('hl') subtitles_id = None - ttsurl = self._search_regex( - r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + ttsurl = get_value('ttsurl') if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string @@ -281,8 +265,8 @@ def add_source_format(urlh): return { 'id': video_id, 'title': title, - 
'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), 'formats': formats, 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'automatic_captions': self.extract_automatic_captions( diff --git a/youtube_dlc/extractor/infoq.py b/youtube_dlc/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/youtube_dlc/extractor/infoq.py +++ b/youtube_dlc/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE): def _extract_rtmp_video(self, webpage): # The server URL is hardcoded - video_url = 'rtmpe://video.infoq.com/cfx/st/' + video_url = 'rtmpe://videof.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( @@ -86,17 +86,18 @@ def _extract_http_video(self, webpage): return [{ 'format_id': 'http_video', 'url': http_video_url, + 'http_headers': {'Referer': 'https://www.infoq.com/'}, }] def _extract_http_audio(self, webpage, video_id): - fields = self._hidden_inputs(webpage) + fields = self._form_hidden_inputs('mp3Form', webpage) http_audio_url = fields.get('filename') if not http_audio_url: return [] # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link diff --git a/youtube_dlc/extractor/kusi.py b/youtube_dlc/extractor/kusi.py index 6a7e3baa7..9833d35eb 100644 --- a/youtube_dlc/extractor/kusi.py +++ b/youtube_dlc/extractor/kusi.py @@ -64,7 +64,7 @@ def _real_extract(self, url): duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) description = xpath_text(doc, 'ABSTRACT') thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') formats = [] @@ -84,5 +84,5 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, 'thumbnail': thumbnail, - 'timestamp': createtion_time, + 'timestamp': creation_time, } diff --git a/youtube_dlc/extractor/npr.py b/youtube_dlc/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dlc/extractor/npr.py +++ b/youtube_dlc/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor): }, }], }, { - # mutlimedia, not media title + # multimedia, not media title 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', 'info_dict': { 'id': '533198237', diff --git a/youtube_dlc/extractor/pbs.py b/youtube_dlc/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dlc/extractor/pbs.py +++ b/youtube_dlc/extractor/pbs.py @@ -477,7 +477,7 @@ def _extract_webpage(self, url): if media_id: return media_id, presumptive_id, upload_date, description - # Fronline video embedded via flp + # Frontline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 
a0836bf58..5eef7c633 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -16,8 +16,9 @@ GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, - unescapeHTML, + try_get, unified_strdate, unified_timestamp, update_url_query, @@ -67,7 +68,7 @@ def _extract_relinker_info(self, relinker_url, video_id): # This does not imply geo restriction (e.g. # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) @@ -122,27 +123,8 @@ def _extract_subtitles(url, subtitle_url): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s)\.(?:html|json))' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - 'skip': 'This content is not available', - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { @@ -166,10 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - url.replace('.html', '.json'), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] video = media['video'] @@ -195,7 +177,8 @@ def _real_extract(self, url): season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), @@ -217,16 +200,16 @@ def _real_extract(self, url): return info -class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+)' - _TEST = { +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+))' + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -234,58 +217,50 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 
'content id') - - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, - 'id': video_id, - 'display_id': display_id, - } + }] class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + base, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) - - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for mobj in re.finditer( - r']+\bhref=(["\'])(?P/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): @@ -328,19 +303,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -352,18 +314,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', @@ -473,7 +423,7 @@ def _real_extract(self, url): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -485,7 +435,7 @@ def _real_extract(self, url): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py index ed70b7169..47f68bf19 100644 --- a/youtube_dlc/extractor/soundcloud.py +++ b/youtube_dlc/extractor/soundcloud.py @@ -649,7 +649,7 @@ def _real_extract(self, url): class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): - # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. 
# https://developers.soundcloud.com/blog/offset-pagination-deprecated COMMON_QUERY = { 'limit': 200, diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py index 2f6887d86..a0b6ef4db 100644 --- a/youtube_dlc/extractor/svt.py +++ b/youtube_dlc/extractor/svt.py @@ -9,6 +9,7 @@ determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ def _extract_video(self, video_info, video_id): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ def _extract_video(self, video_info, video_id): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ def _extract_video(self, video_info, video_id): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det här är himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -236,7 +259,10 @@ def _real_extract(self, url): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): @@ -360,7 +386,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, 
cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dlc/extractor/tagesschau.py b/youtube_dlc/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dlc/extractor/tagesschau.py +++ b/youtube_dlc/extractor/tagesschau.py @@ -86,7 +86,7 @@ def _real_extract(self, url): # return self._extract_via_api(kind, video_id) # JSON api does not provide some audio formats (e.g. ogg) thus - # extractiong audio via webpage + # extracting audio via webpage webpage = self._download_webpage(url, video_id) diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py index 07055513a..41bfbe80f 100644 --- a/youtube_dlc/extractor/theplatform.py +++ b/youtube_dlc/extractor/theplatform.py @@ -208,7 +208,7 @@ def _extract_urls(cls, webpage): if m: return [m.group('url')] - # Are whitesapces ignored in URLs? + # Are whitespaces ignored in URLs? # https://github.com/ytdl-org/youtube-dl/issues/12044 matches = re.findall( r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py index 4a6cbfbb8..2964504a2 100644 --- a/youtube_dlc/extractor/turner.py +++ b/youtube_dlc/extractor/turner.py @@ -56,9 +56,9 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): content_id = xpath_text(video_data, 'contentId') or video_id # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: - # splited_rtmp_src = rtmp_src.split(',') - # if len(splited_rtmp_src) == 2: - # rtmp_src = splited_rtmp_src[1] + # split_rtmp_src = rtmp_src.split(',') + # if len(split_rtmp_src) == 2: + # rtmp_src = split_rtmp_src[1] # aifp = xpath_text(video_data, 'akamai/aifp', default='') urls = [] diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 6bddf8be9..2e3794344 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -166,19 +171,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -195,14 +201,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'info_dict': { 
'id': '158036v', 'ext': 'mp4', @@ -218,71 +225,11 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict.get('url') - format_drms = format_dict.get('drms') - format_stream_id = format_dict.get('id') - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - mpd_formats = self._extract_mpd_formats( - format_url, video_id, - mpd_id='mpd-%s' % protocol, fatal=False) - formats.extend(mpd_formats) - elif format_id == 'mpd': - - formats.extend(mpd_formats) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', - format_url) - if not mobj: - continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - 'drms': format_drms, - 'stream_id': format_stream_id, - }) - else: - urlh = self._request_webpage( - HEADRequest(format_url), video_id, 'Checking file size', fatal=False) - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - 'drms': format_drms, - 'stream_id': format_stream_id, - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - }) - self._sort_formats(formats) + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', + headers={'x-viki-app-ver': '4.0.57'}) + video = resp['video'] self._check_errors(video) @@ -342,12 +289,84 @@ def _real_extract(self, url): 'subtitles': subtitles, } - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result + formats = [] + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + format_drms = format_dict.get('drms') + format_stream_id = format_dict.get('id') + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: + continue + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + 
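+                # accept both spellings: the legacy streams.json endpoint used 'mpd',
+                # while newer API responses may label DASH manifests 'dash'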
formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', + format_url) + if not mobj: + return + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + 'drms': format_drms, + 'stream_id': format_stream_id, + }) + else: + urlh = self._request_webpage( + HEADRequest(format_url), video_id, 'Checking file size', fatal=False) + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (format_id, protocol), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)), + 'drms': format_drms, + 'stream_id': format_stream_id, + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + + for format_id, format_dict in (resp.get('streams') or {}).items(): + add_format(format_id, format_dict) + if not formats: + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + for format_id, stream_dict in streams.items(): + for protocol, format_dict in stream_dict.items(): + add_format(format_id, format_dict, protocol) + self._sort_formats(formats) result['formats'] = formats return result diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py index a0662a369..51a0ab2fa 100644 --- a/youtube_dlc/extractor/vimeo.py +++ b/youtube_dlc/extractor/vimeo.py @@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): }] _PAGE_SIZE = 100 - def _fetch_page(self, album_id, authorizaion, hashed_pass, page): + def _fetch_page(self, album_id, authorization, hashed_pass, page): api_page = page + 1 query = { 'fields': 'link,uri', @@ -934,7 +934,7 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page): videos = self._download_json( 'https://api.vimeo.com/albums/%s/videos' % album_id, album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorizaion, + 'Authorization': 'jwt ' + authorization, })['data'] for video in videos: link = video.get('link') diff --git a/youtube_dlc/extractor/xiami.py b/youtube_dlc/extractor/xiami.py index 618da8382..769aab331 100644 --- a/youtube_dlc/extractor/xiami.py +++ b/youtube_dlc/extractor/xiami.py @@ -54,17 +54,17 @@ def _extract_tracks(self, item_id, referer, typ=None): def _decrypt(origin): n = int(origin[0]) origin = origin[1:] - short_lenth = len(origin) // n - long_num = len(origin) - short_lenth * n + short_length = len(origin) // n + long_num = len(origin) - short_length * n l = tuple() for i in range(0, n): - length = short_lenth + length = short_length if i < long_num: length += 1 l += (origin[0:length], ) origin = origin[length:] ans = '' - for i in range(0, short_lenth + 1): + for i in range(0, short_length + 1): for j in range(0, n): if len(l[j]) > i: ans += l[j][i] diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3f3f9c58b..fb2702d68 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -306,6 +306,8 @@ def _real_initialize(self): }, } + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + def _call_api(self, ep, query, video_id): data = self._DEFAULT_API_DATA.copy() 
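        # work on a copy so per-call query params do not mutate the shared defaults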
data.update(query) @@ -322,8 +324,8 @@ def _call_api(self, ep, query, video_id): def _extract_yt_initial_data(self, video_id, webpage): return self._parse_json( self._search_regex( - r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;', - webpage, 'yt initial data'), + (r'%s\s*\n' % self._YT_INITIAL_DATA_RE, + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), video_id) @@ -1089,6 +1091,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -2138,6 +2156,21 @@ def _extract_filesize(media_url): formats.append(a_format) else: error_message = extract_unavailable_message() + if not error_message: + reason_list = try_get( + player_response, + lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], + list) or [] + for reason in reason_list: + if not isinstance(reason, dict): + continue + reason_text = try_get(reason, lambda x: x['text'], compat_str) + if reason_text: + if not error_message: + error_message = '' + error_message += reason_text + if error_message: + error_message = clean_html(error_message) if not error_message: error_message = clean_html(try_get( player_response, lambda x: x['playabilityStatus']['reason'], @@ -2319,8 +2352,8 @@ def extract_meta(field): def _extract_count(count_name): return str_to_int(self._search_regex( - r'-%s-button[^>]+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' - % re.escape(count_name), + (r'-%s-button[^>]+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' % re.escape(count_name), + r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), video_webpage, count_name, default=None)) like_count = _extract_count('like') @@ -2613,13 +2646,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 138, }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { - 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ', + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', @@ -2666,7 +2699,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, 'playlist_mincount': 11, }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'only_matching': True, }, { # Playlist URL that does not actually serve a playlist @@ -2698,14 +2731,59 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - IGNORE = (YoutubeLiveIE,) - return ( - False if any(ie.suitable(url) for ie in IGNORE) - else super(YoutubeTabIE, cls).suitable(url)) + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '9Auq9mYxFEE', + 'ext': 'mp4', + 'title': 'Watch Sky News live', + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': '20191102', + 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, + # TODO + # { + # 'url': 'https://www.youtube.com/TheYoungTurks/live', + # 'only_matching': True, + # } + ] def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -3147,7 +3225,7 @@ def _real_extract(self, url): self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) identity_token = self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + r'\bID_TOKEN["\']\s*:\s/l*["\'](.+?)["\']', webpage, 'identity token', default=None) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( @@ -3158,7 +3236,11 @@ def _real_extract(self, url): data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: return self._extract_from_playlist(item_id, data, playlist) - # Fallback to video extraction if no playlist alike page is recognized + # Fallback to video extraction if no playlist alike page is recognized. + # First check for the current video then try the v attribute of URL query. 
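+        # (try_get walks the nested renderer dict defensively and returns None
+        # if any key along the path is missing)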
+ video_id = try_get( + data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], + compat_str) or video_id if video_id: return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) # Failed to recognize @@ -3279,58 +3361,6 @@ def _real_extract(self, url): ie=YoutubeTabIE.ie_key(), video_id=user_id) -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P%s)/live' % YoutubeTabIE._VALID_URL - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 975b741c5..68b4ca944 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2460,7 +2460,7 @@ def __init__(self, code=None, msg='Unknown error'): # Parsing code and msg if (self.code in (errno.ENOSPC, errno.EDQUOT) - or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg): self.reason = 'NO_SPACE' elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: self.reason = 'VALUE_TOO_LONG' @@ -4215,10 +4215,10 @@ def parse_codecs(codecs_str): # http://tools.ietf.org/html/rfc6381 if not codecs_str: return {} - splited_codecs = list(filter(None, map( + split_codecs = list(filter(None, map( lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) vcodec, acodec = None, None - for full_codec in splited_codecs: + for full_codec in split_codecs: codec = full_codec.split('.')[0] if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'): if not vcodec: @@ -4229,10 +4229,10 @@ def parse_codecs(codecs_str): else: write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) if not vcodec and not acodec: - if len(splited_codecs) == 2: + if len(split_codecs) 
== 2: return { - 'vcodec': splited_codecs[0], - 'acodec': splited_codecs[1], + 'vcodec': split_codecs[0], + 'acodec': split_codecs[1], } else: return { @@ -5471,7 +5471,7 @@ def encode_base_n(num, n, table=None): def decode_packed_codes(code): mobj = re.search(PACKED_CODES_RE, code) - obfucasted_code, base, count, symbols = mobj.groups() + obfuscated_code, base, count, symbols = mobj.groups() base = int(base) count = int(count) symbols = symbols.split('|') @@ -5484,7 +5484,7 @@ def decode_packed_codes(code): return re.sub( r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) + obfuscated_code) def caesar(s, alphabet, shift):
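For context on the decode_packed_codes cleanup above, a self-contained toy of the same unpacking idea (simplified: real packed-JS output derives its symbol-table keys with encode_base_n, which this sketch replaces with plain decimal indices):

    import re

    def unpack(obfuscated_code, symbols):
        # packed JS replaces every word with a key into a symbol table;
        # decoding is a single regex pass substituting each word back
        symbol_table = {str(i): s or str(i) for i, s in enumerate(symbols)}
        return re.sub(
            r'\b(\w+)\b',
            lambda mobj: symbol_table.get(mobj.group(0), mobj.group(0)),
            obfuscated_code)

    print(unpack('0(1)', ['alert', 'message']))  # -> alert(message)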