From 29eb5174031cfc0b5de556da3da7761ac377de4e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 17 Dec 2013 04:13:36 +0100 Subject: [PATCH] Add webpage_url_basename info_dict field (Fixes #1938) --- test/test_utils.py | 25 ++++++++++++++++--------- youtube_dl/YoutubeDL.py | 4 ++++ youtube_dl/utils.py | 7 +++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0fa66beec..5f4fdb771 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -13,20 +13,21 @@ #from youtube_dl.utils import htmlentity_transform from youtube_dl.utils import ( - timeconvert, - sanitize_filename, - unescapeHTML, - orderedSet, DateRange, - unified_strdate, + encodeFilename, find_xpath_attr, get_meta_content, - xpath_with_ns, - smuggle_url, - unsmuggle_url, + orderedSet, + sanitize_filename, shell_quote, - encodeFilename, + smuggle_url, str_to_int, + timeconvert, + unescapeHTML, + unified_strdate, + unsmuggle_url, + url_basename, + xpath_with_ns, ) if sys.version_info < (3, 0): @@ -181,6 +182,12 @@ def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) + def test_url_basename(self): + self.assertEqual(url_basename(u'http://foo.de/'), u'') + self.assertEqual(url_basename(u'http://foo.de/bar/baz'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz?x=y'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz#x=y'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz/'), u'baz') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f87415b..2a078adfb 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -47,6 +47,7 @@ subtitles_filename, takewhile_inclusive, UnavailableVideoError, + url_basename, write_json_file, write_string, YoutubeDLHandler, @@ -484,6 +485,7 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, { 'extractor': 
def url_basename(url):
    """Return the last non-empty path segment of an http(s) URL.

    The URL must start with ``http://``, ``https://`` or the
    protocol-relative ``//``.  Query string and fragment are ignored, and a
    single trailing slash after the last segment is tolerated:

        http://foo.de/bar/baz?x=y  ->  baz
        http://foo.de/a/b/c/       ->  c

    Returns an empty (unicode) string when the URL has no path segment or
    does not look like an http(s) URL at all.
    """
    m = re.match(
        # authority: stops at the first '/', '?' or '#', so that a slash
        # inside a query string or fragment is never mistaken for the path.
        r'(?:https?:|)//[^/?#]+/'
        # any number of leading directory segments (was: at most one, which
        # made every URL with a deeper path fall through to u'').
        r'(?:[^/?#]+/)*'
        # the basename itself, an optional trailing slash, then the end of
        # the path (query, fragment or end of string).
        r'([^/?#]+)/?(?:[?#]|$)',
        url)
    if not m:
        return u''
    return m.group(1)