PhotobucketIE: accept new format of urls and add a test

This commit is contained in:
Jaime Marquínez Ferrándiz 2013-05-05 13:07:00 +02:00
parent f8602d3242
commit d96680f58d
2 changed files with 31 additions and 9 deletions

View file

@ -411,5 +411,14 @@
"info_dict":{ "info_dict":{
"title":"Sucked on a toilet" "title":"Sucked on a toilet"
} }
},
{
"name": "Photobucket",
"url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0",
"file": "zpsc0c3b9fa.mp4",
"md5": "7dabfb92b0a31f6c16cebc0f8e60ff99",
"info_dict":{
"title":"Tired of Link Building? Try BacklinkMyDomain.com!"
}
} }
] ]

View file

@ -848,7 +848,10 @@ def _real_extract(self, url):
class PhotobucketIE(InfoExtractor): class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com.""" """Information extractor for photobucket.com."""
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' # TODO: the original _VALID_URL was:
# r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
# Check if it's necessary to keep the old extraction process
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
IE_NAME = u'photobucket' IE_NAME = u'photobucket'
def _real_extract(self, url): def _real_extract(self, url):
@ -857,20 +860,30 @@ def _real_extract(self, url):
if mobj is None: if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url) raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1) video_id = mobj.group('id')
video_extension = 'flv' video_extension = mobj.group('ext')
# Retrieve video webpage to extract further information # Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url) webpage = self._download_webpage(url, video_id)
try:
self.report_download_webpage(video_id)
webpage = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Extract URL, uploader, and title from webpage # Extract URL, uploader, and title from webpage
self.report_extraction(video_id) self.report_extraction(video_id)
# We first try looking in the JavaScript code:
mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
if mobj is not None:
info = json.loads(mobj.group('json'))
return [{
'id': video_id,
'url': info[u'downloadUrl'],
'uploader': info[u'username'],
'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
'title': info[u'title'],
'ext': video_extension,
'thumbnail': info[u'thumbUrl'],
}]
# We try looking in other parts of the webpage
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
if mobj is None: if mobj is None:
raise ExtractorError(u'Unable to extract media URL') raise ExtractorError(u'Unable to extract media URL')