[theplatform] Recognize URLs with whitespaces (closes #12044)

This commit is contained in:
Yen Chi Hsuan 2017-02-17 23:13:51 +08:00
parent 4cead6a614
commit fef51645d6
No known key found for this signature in database
GPG key ID: 7F902A182457CA23
3 changed files with 11 additions and 3 deletions

View file

@ -1,6 +1,7 @@
version <unreleased> version <unreleased>
Extractors Extractors
+ [theplatform] Recognize URLs with whitespaces (#12044)
+ [generic] Support complex JWPlayer embedded videos (#12030) + [generic] Support complex JWPlayer embedded videos (#12030)

View file

@ -1501,7 +1501,12 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [VideoPressIE.ie_key()], 'add_ie': [VideoPressIE.ie_key()],
} },
{
# ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
},
# { # {
# # TODO: find another test # # TODO: find another test
# # http://schema.org/VideoObject # # http://schema.org/VideoObject

View file

@ -179,10 +179,12 @@ def _extract_urls(cls, webpage):
if m: if m:
return [m.group('url')] return [m.group('url')]
# Are whitespaces ignored in URLs?
# https://github.com/rg3/youtube-dl/issues/12044
matches = re.findall( matches = re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
if matches: if matches:
return list(zip(*matches))[1] return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
@staticmethod @staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):