From 5cbe7563bece11e52c833a79b0197ca4444ffe37 Mon Sep 17 00:00:00 2001
From: xarantolus
Date: Tue, 23 Jun 2020 11:27:02 +0200
Subject: [PATCH] [youtube] Return to old feed extraction code as that change
 *seems* to have been reverted

The old code now works again, but it downloads without limit. A limit of
1000 videos is therefore added; it can be overridden with the
`--max-downloads` option - that way, only as many ids are extracted as
videos will be downloaded. (A standalone sketch of this capping logic
follows the diff below.)
---
 youtube_dl/extractor/youtube.py | 119 ++++++++------------------
 1 file changed, 30 insertions(+), 89 deletions(-)

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 69cc4a017..745e14fa3 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -3303,7 +3303,7 @@ def _real_extract(self, url):
 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
     """
     Base class for feed extractors
-    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties as well as an _extract_video_info function.
+    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
     """
     _LOGIN_REQUIRED = True
 
@@ -3319,41 +3319,44 @@ def _real_initialize(self):
     def _entries(self, page):
         # The extraction process is the same as for playlists, but the regex
         # for the video ids doesn't contain an index
-        info = []
+        limit = self._downloader.params.get('max_downloads') or 1000
+        ids = []
+        more_widget_html = content_html = page
         for page_num in itertools.count(1):
-            search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)
+            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
 
-            video_info, continuation = self._extract_video_info(search_response)
-
-            new_info = []
-
-            for v in video_info:
-                v_id = try_get(v, lambda x: x['videoId'])
-                if not v_id:
-                    continue
-
-                have_video = False
-                for old in info:
-                    if old['videoId'] == v_id:
-                        have_video = True
-                        break
-
-                if not have_video:
-                    new_info.append(v)
-
-            if not new_info:
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
+            if not new_ids:
                 break
 
-            info.extend(new_info)
+            done = False
+            if len(new_ids) + len(ids) > limit:
+                new_ids = new_ids[:limit - len(ids)]
+                done = True
 
-            for video in new_info:
-                yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText']))
+            ids.extend(new_ids)
 
-            if not continuation:
+            for entry in self._ids_to_results(new_ids):
+                yield entry
+
+            if done:
                 break
 
-            # TODO: Fix continuation request to download more pages
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape,
+                headers=self._YOUTUBE_CLIENT_HEADERS)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
 
     def _real_extract(self, url):
         page = self._download_webpage(
@@ -3402,33 +3405,6 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
     _FEED_NAME = 'recommended'
     _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
-    def _extract_video_info(self, initial_data):
-        videos = []
-        continuation_renderer = None
-
-        renderers = try_get(
-            initial_data,
-            lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents'])
-
-        for renderer in renderers:
-            vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer'])
-            if vid is not None:
-                videos.append(vid)
-                continue
-
-            if 'richSectionRenderer' in renderer:
-                vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents'])
-                for v in vids:
-                    vid = try_get(v, lambda x: x['richItemRenderer']['content']['videoRenderer'])
-                    if vid is not None:
-                        videos.append(vid)
-                continue
-
-            if 'continuationItemRenderer' in renderer:
-                continuation_renderer = renderer
-
-        return videos, continuation_renderer
-
 
 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
@@ -3436,24 +3412,6 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
     _FEED_NAME = 'subscriptions'
     _PLAYLIST_TITLE = 'Youtube Subscriptions'
 
-    def _extract_video_info(self, initial_data):
-        videos = []
-        continuation_renderer = None
-
-        renderers = try_get(
-            initial_data,
-            lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'])
-        for renderer in renderers:
-            for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']):
-                vid = try_get(item, lambda x: x['gridVideoRenderer'])
-                if vid is not None:
-                    videos.append(vid)
-
-            if 'continuationItemRenderer' in renderer:
-                continuation_renderer = renderer
-
-        return videos, continuation_renderer
-
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
@@ -3461,23 +3419,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
     _FEED_NAME = 'history'
     _PLAYLIST_TITLE = 'Youtube History'
 
-    def _extract_video_info(self, initial_data):
-        videos = []
-        continuation_renderer = None
-
-        renderers = try_get(
-            initial_data,
-            lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'])
-        for renderer in renderers:
-            vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer'])
-            if vid is not None:
-                videos.append(vid)
-
-            if 'continuationItemRenderer' in renderer:
-                continuation_renderer = renderer
-
-        return videos, continuation_renderer
-
 
 class YoutubeTruncatedURLIE(InfoExtractor):
     IE_NAME = 'youtube:truncated_url'
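
Standalone sketch (not part of the patch): under stated assumptions, this is how the de-duplication and the 1000-video cap in the new _entries interact. The names capped_unique_ids and fake_pages are invented for the example, dict.fromkeys stands in for youtube-dl's orderedSet helper, and plain video ids stand in for the url_result entries the real generator yields.

def capped_unique_ids(pages, limit=1000):
    # Yield unique video ids from successive feed pages, mirroring the patched loop:
    # stop when a page brings nothing new, or once `limit` ids have been emitted.
    ids = []
    for page in pages:
        # order-preserving de-duplication; the real code uses orderedSet() plus a
        # membership check against the ids collected so far
        new_ids = [video_id for video_id in dict.fromkeys(page) if video_id not in ids]
        if not new_ids:
            break  # a page with no new videos ends the feed

        done = False
        if len(ids) + len(new_ids) > limit:
            new_ids = new_ids[:limit - len(ids)]  # trim the last portion to the cap
            done = True

        ids.extend(new_ids)
        for video_id in new_ids:
            yield video_id

        if done:
            break


# Overlapping pages, capped at 5 ids overall:
fake_pages = [['a', 'b', 'c'], ['c', 'd', 'e'], ['e', 'f', 'g', 'h']]
print(list(capped_unique_ids(fake_pages, limit=5)))  # -> ['a', 'b', 'c', 'd', 'e']

In the patch itself the cap defaults to 1000 and is taken from the --max-downloads setting when present, so no more ids are extracted than will actually be downloaded.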