2024-05-26 15:27:21 -04:00
from . bokecc import BokeCCBaseIE
2017-02-02 23:10:13 -05:00
from . . compat import (
2018-01-23 10:23:12 -05:00
compat_b64decode ,
2017-02-02 23:10:13 -05:00
compat_urllib_parse_unquote ,
compat_urlparse ,
)
2017-10-12 13:39:51 -04:00
from . . utils import (
2022-04-27 17:30:24 -04:00
ExtractorError ,
2017-10-12 13:39:51 -04:00
determine_ext ,
2022-04-27 17:30:24 -04:00
traverse_obj ,
2024-05-26 15:27:21 -04:00
update_url_query ,
2017-10-12 13:39:51 -04:00
)
2013-06-23 15:14:19 -04:00
2016-02-26 13:55:11 -05:00
class InfoQIE(BokeCCBaseIE):
    # Extractor for infoq.com pages.
    #
    # URLs containing '/cn/' are delegated to _extract_bokecc_formats (not
    # defined in this class; presumably inherited from BokeCCBaseIE). Every
    # other page is probed for three kinds of formats: an RTMP video stream,
    # a signed HTTP video URL and a signed HTTP MP3 download.
    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
        'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
        'info_dict': {
            'id': 'A-Few-of-My-Favorite-Python-Things',
            'ext': 'mp4',
            'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
            'title': 'A Few of My Favorite [Python] Things',
        },
    }, {
        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
        'only_matching': True,
    }, {
        'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery',
        'md5': '4918d0cca1497f2244572caf626687ef',
        'info_dict': {
            'id': 'openstack-continued-delivery',
            'title': 'OpenStack持续交付之路',
            'ext': 'flv',
            'description': 'md5:308d981fb28fa42f49f9568322c683ff',
        },
        'skip': 'Sorry, the page you visited does not exist',
    }, {
        'url': 'https://www.infoq.com/presentations/Simple-Made-Easy',
        'md5': '0e34642d4d9ef44bf86f66f6399672db',
        'info_dict': {
            'id': 'Simple-Made-Easy',
            'title': 'Simple Made Easy',
            'ext': 'mp3',
            'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b',
        },
        'params': {
            'format': 'bestaudio',
        },
    }]

    def _extract_rtmp_video(self, webpage):
        """Return a one-element list with the RTMP format, or [] if absent.

        The play path is taken from the page's ``jsclassref`` value, which is
        base64-decoded, decoded as UTF-8 and then URL-unquoted.
        """
        # The server URL is hardcoded
        video_url = 'rtmpe://videof.infoq.com/cfx/st/'

        # Extract video URL
        encoded_id = self._search_regex(
            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None)
        if not encoded_id:
            # The search uses default=None, so a page without the marker
            # yields None here; decoding that would raise TypeError and
            # abort the extraction before the HTTP formats are even tried.
            # An empty match would only produce a useless bare 'mp4:' path.
            return []

        real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8'))
        playpath = 'mp4:' + real_id

        return [{
            'format_id': 'rtmp_video',
            'url': video_url,
            'ext': determine_ext(playpath),
            'play_path': playpath,
        }]

    def _extract_cf_auth(self, webpage):
        """Collect the three ``InfoQConstants`` values used as URL query auth.

        Returns a dict with the keys ``Policy``, ``Signature`` and
        ``Key-Pair-Id``. None of the searches passes a default, so a missing
        value is reported by ``_search_regex`` itself.
        """
        policy = self._search_regex(r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'', webpage, 'policy')
        signature = self._search_regex(r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'', webpage, 'signature')
        key_pair_id = self._search_regex(r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id')
        return {
            'Policy': policy,
            'Signature': signature,
            'Key-Pair-Id': key_pair_id,
        }

    def _extract_http_video(self, webpage):
        """Return a one-element list with the signed HTTP video format."""
        http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL')
        # Append the auth values from the page as query parameters.
        http_video_url = update_url_query(http_video_url, self._extract_cf_auth(webpage))
        return [{
            'format_id': 'http_video',
            'url': http_video_url,
            'http_headers': {'Referer': 'https://www.infoq.com/'},
        }]

    def _extract_http_audio(self, webpage, video_id):
        """Return a one-element list with the MP3 format, or [] if unavailable.

        The file name comes from the ``filename`` hidden input of the
        ``mp3Form`` form. [] is returned when the form or the field is
        missing, or when the resulting URL fails the validity probe.
        """
        try:
            http_audio_url = traverse_obj(self._form_hidden_inputs('mp3Form', webpage), 'filename')
        except ExtractorError:
            http_audio_url = None
        if not http_audio_url:
            return []

        # base URL is found in the Location header in the response returned by
        # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
        http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url)
        http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))

        # audio file seem to be missing some times even if there is a download link
        # so probe URL to make sure
        if not self._is_valid_url(http_audio_url, video_id):
            return []

        return [{
            'format_id': 'http_audio',
            'url': http_audio_url,
            'vcodec': 'none',
        }]

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for it."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_extract_title(webpage)
        video_description = self._html_search_meta('description', webpage, 'description')

        if '/cn/' in url:
            # for China videos, HTTP video URL exists but always fails with 403
            formats = self._extract_bokecc_formats(webpage, video_id)
        else:
            formats = (
                self._extract_rtmp_video(webpage)
                + self._extract_http_video(webpage)
                + self._extract_http_audio(webpage, video_id))

        return {
            'id': video_id,
            'title': video_title,
            'description': video_description,
            'formats': formats,
        }