[youtube] Add support for search result URLs (Fixes #2495)

This commit is contained in:
Philipp Hagemeister 2014-03-04 03:32:28 +01:00
parent 86fb4347f7
commit c9ae7b9565
5 changed files with 64 additions and 11 deletions

View file

@ -71,6 +71,10 @@ def test_youtube_show_matching(self):
def test_youtube_truncated(self): def test_youtube_truncated(self):
self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_justin_tv_channelid_matching(self): def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))

View file

@ -16,6 +16,7 @@
YoutubeChannelIE, YoutubeChannelIE,
YoutubeShowIE, YoutubeShowIE,
YoutubeTopListIE, YoutubeTopListIE,
YoutubeSearchURLIE,
) )
@ -133,5 +134,14 @@ def test_youtube_toplist(self):
entries = result['entries'] entries = result['entries']
self.assertTrue(len(entries) >= 5) self.assertTrue(len(entries) >= 5)
def test_youtube_search_url(self):
dl = FakeYDL()
ie = YoutubeSearchURLIE(dl)
result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
entries = result['entries']
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'youtube-dl test video')
self.assertTrue(len(entries) >= 5)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -675,7 +675,7 @@ def process_video_result(self, info_dict, download=True):
info_dict['playlist'] = None info_dict['playlist'] = None
info_dict['playlist_index'] = None info_dict['playlist_index'] = None
if 'display_id' not in info_dict: if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id'] info_dict['display_id'] = info_dict['id']
# This extractors handle format selection themselves # This extractors handle format selection themselves

View file

@ -285,19 +285,20 @@
from .youporn import YouPornIE from .youporn import YouPornIE
from .youtube import ( from .youtube import (
YoutubeIE, YoutubeIE,
YoutubePlaylistIE,
YoutubeSearchIE,
YoutubeSearchDateIE,
YoutubeUserIE,
YoutubeChannelIE, YoutubeChannelIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeRecommendedIE,
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE, YoutubeFavouritesIE,
YoutubeHistoryIE, YoutubeHistoryIE,
YoutubePlaylistIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeTopListIE, YoutubeTopListIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeWatchLaterIE,
) )
from .zdf import ZDFIE from .zdf import ZDFIE

View file

@ -1645,7 +1645,7 @@ def _real_extract(self, url):
class YoutubeUserIE(InfoExtractor): class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50 _GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@ -1744,12 +1744,50 @@ def _get_n_results(self, query, n):
for video_id in video_ids] for video_id in video_ids]
return self.playlist_result(videos, query) return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE): class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date' IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate' _SEARCH_KEY = 'ytsearchdate'
IE_DESC = u'YouTube.com searches, newest videos first' IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeSearchURLIE(InfoExtractor):
IE_DESC = u'YouTube.com search URLs'
IE_NAME = u'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = compat_urllib_parse.unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
part_codes = re.findall(
r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
entries = []
for part_code in part_codes:
part_title = self._html_search_regex(
r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
part_url_snippet = self._html_search_regex(
r'(?s)href="([^"]+)"', part_code, 'item URL')
part_url = compat_urlparse.urljoin(
'https://www.youtube.com/', part_url_snippet)
entries.append({
'_type': 'url',
'url': part_url,
'title': part_title,
})
return {
'_type': 'playlist',
'entries': entries,
'title': query,
}
class YoutubeShowIE(InfoExtractor): class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows' IE_DESC = u'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)' _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'