2016-11-12 15:52:02 -05:00
import json
2024-05-26 15:27:21 -04:00
import re
2024-06-11 19:09:58 -04:00
import urllib . parse
2016-11-12 15:52:02 -05:00
from . common import InfoExtractor
from . . utils import (
ExtractorError ,
float_or_none ,
2017-12-29 19:28:18 -05:00
mimetype2ext ,
2024-05-26 15:27:21 -04:00
smuggle_url ,
2019-03-30 14:17:30 -04:00
str_or_none ,
2022-03-31 03:49:16 -04:00
try_call ,
2019-03-30 14:17:30 -04:00
try_get ,
2017-12-29 19:28:18 -05:00
unsmuggle_url ,
2018-07-21 08:08:28 -04:00
url_or_none ,
2017-12-29 19:28:18 -05:00
urljoin ,
2016-11-12 15:52:02 -05:00
)
2019-04-01 14:13:52 -04:00
# Regex fragment for a Mediasite resource id: either 32-34 hex digits, or the
# dashed GUID layout 8-4-4-4-12 whose last group may carry up to two extra hex
# digits (12-14). It is a non-capturing group so it can be interpolated into
# larger patterns and wrapped in the caller's own (named) group.
_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})'
2019-03-30 14:17:30 -04:00
2016-11-12 15:52:02 -05:00
class MediasiteIE(InfoExtractor):
    """Extractor for a single Mediasite presentation (Play or Showcase URL)."""

    _VALID_URL = rf'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>{_ID_RE})(?P<query>\?[^#]+|)'
    _EMBED_REGEX = [rf'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/{_ID_RE}(?:\?.*?)?)\1']
    _TESTS = [
        {
            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
            'info_dict': {
                'id': '2db6c271681e4f199af3c60d1f82869b1d',
                'ext': 'mp4',
                'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles',
                'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.',
                'timestamp': 1474268400.0,
                'upload_date': '20160919',
            },
        },
        {
            'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb',
            'info_dict': {
                'id': '90bb363295d945d6b548c867d01181361d',
                'ext': 'mp4',
                'upload_date': '20150429',
                'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity',
                'timestamp': 1430311380.0,
            },
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d',
            'md5': '481fda1c11f67588c0d9d8fbdced4e39',
            'info_dict': {
                'id': '585a43626e544bdd97aeb71a0ec907a01d',
                'ext': 'mp4',
                'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
                'description': '',
                'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$',
                'duration': 7713.088,
                'timestamp': 1413309600,
                'upload_date': '20141014',
            },
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4',
            'md5': 'ef1fdded95bdf19b12c5999949419c92',
            'info_dict': {
                'id': '86a9ea9f53e149079fbdb4202b521ed21d',
                'ext': 'wmv',
                'title': '64ste Vakantiecursus: Afvalwater',
                'description': 'md5:7fd774865cc69d972f542b157c328305',
                'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
                'duration': 10853,
                'timestamp': 1326446400,
                'upload_date': '20120113',
            },
        },
        {
            'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
            'md5': '9422edc9b9a60151727e4b6d8bef393d',
            'info_dict': {
                'id': '24aace4429fc450fb5b38cdbf424a66e1d',
                'ext': 'mp4',
                'title': 'Xyce Software Training - Section 1',
                'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}',
                'upload_date': '20120409',
                'timestamp': 1333983600,
                'duration': 7794,
            },
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d',
            'only_matching': True,
        },
        {
            'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d',
            'only_matching': True,
        },
        {
            # dashed id
            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d',
            'only_matching': True,
        },
    ]

    # look in Mediasite.Core.js (Mediasite.ContentStreamType[*])
    _STREAM_TYPES = {
        0: 'video1',  # the main video
        2: 'slide',
        3: 'presentation',
        4: 'video2',  # screencast?
        5: 'video3',
    }

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield each embed URL found by the parent class, smuggling the
        embedding page's URL along as ``UrlReferrer``."""
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(embed_url, {'UrlReferrer': url})

    def __extract_slides(self, *, stream_id, snum, stream, duration, images):
        """Build an ``mhtml`` format dict describing the slide images of a stream.

        stream_id, snum -- used only to compose the ``format_id``.
        stream          -- stream dict; ``SlideBaseUrl``,
                           ``SlideImageFileNameTemplate`` and ``Slides`` are read.
        duration        -- presentation duration, used as the end time of the
                           last slide when it is numeric.
        images          -- dict searched for ``DefaultSlide`` /
                           ``DefaultStreamImage`` to cover the time before the
                           first slide.

        Every fragment duration is a ``Time`` difference divided by 1000
        (presumably milliseconds -> seconds; confirm against the service).
        """
        slide_base_url = stream['SlideBaseUrl']

        fname_template = stream['SlideImageFileNameTemplate']
        if fname_template != 'slide_{0:D4}.jpg':
            self.report_warning('Unusual slide file name template; report a bug if slide downloading fails')
        # Rewrite the '{0:Dn}' placeholder into '{0:0n}' so that str.format()
        # can zero-pad the slide number.
        fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template)

        fragments = []
        for i, slide in enumerate(stream['Slides']):
            if i == 0:
                if slide['Time'] > 0:
                    # The first slide does not start at 0: show a default
                    # image (if one is configured) until it appears.
                    default_slide = images.get('DefaultSlide')
                    if default_slide is None:
                        default_slide = images.get('DefaultStreamImage')
                    if default_slide is not None:
                        default_slide = default_slide['ImageFilename']
                    if default_slide is not None:
                        fragments.append({
                            'path': default_slide,
                            'duration': slide['Time'] / 1000,
                        })

            # End of this slide: start of the next one, else the overall
            # duration, else its own start time (yielding a zero duration).
            next_time = try_call(
                lambda: stream['Slides'][i + 1]['Time'],
                lambda: duration,
                lambda: slide['Time'],
                expected_type=(int, float))

            fragments.append({
                'path': fname_template.format(slide.get('Number', i + 1)),
                'duration': (next_time - slide['Time']) / 1000,
            })

        return {
            'format_id': f'{stream_id}-{snum}.slides',
            'ext': 'mhtml',
            'url': slide_base_url,
            'protocol': 'mhtml',
            'acodec': 'none',
            'vcodec': 'none',
            'format_note': 'Slides',
            'fragments': fragments,
            'fragment_base_url': slide_base_url,
        }

    def _real_extract(self, url):
        """Download the player page, query ``GetPlayerOptions`` and return the
        info dict (formats, thumbnails and metadata) of the presentation.

        Raises ExtractorError (expected) with the service's status message
        when the response carries no presentation.
        """
        url, data = unsmuggle_url(url, {})
        mobj = self._match_valid_url(url)
        resource_id = mobj.group('id')
        query = mobj.group('query')

        webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
        redirect_url = urlh.url

        # XXX: might have also extracted UrlReferrer and QueryString from the html
        service_path = urllib.parse.urljoin(redirect_url, self._html_search_regex(
            r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
            default='/Mediasite/PlayerService/PlayerService.svc/json'))

        player_options = self._download_json(
            f'{service_path}/GetPlayerOptions', resource_id,
            headers={
                'Content-type': 'application/json; charset=utf-8',
                'X-Requested-With': 'XMLHttpRequest',
            },
            data=json.dumps({
                'getPlayerOptionsRequest': {
                    'ResourceId': resource_id,
                    'QueryString': query,
                    'UrlReferrer': data.get('UrlReferrer', ''),
                    'UseScreenReader': False,
                },
            }).encode())['d']

        presentation = player_options['Presentation']
        # Check for a missing presentation *before* subscripting it, so the
        # service's own status message is reported instead of a TypeError.
        if presentation is None:
            raise ExtractorError(
                'Mediasite says: {}'.format(player_options['PlayerPresentationStatusMessage']),
                expected=True)
        title = presentation['Title']

        thumbnails = []
        formats = []
        for snum, stream in enumerate(presentation['Streams']):
            stream_type = stream.get('StreamType')
            if stream_type is None:
                continue

            video_urls = stream.get('VideoUrls')
            if not isinstance(video_urls, list):
                video_urls = []

            stream_id = self._STREAM_TYPES.get(
                stream_type, 'type%u' % stream_type)

            stream_formats = []
            # Keep the entry dict (`video`) separate from its validated
            # location string (`video_url`): both are needed below.
            for unum, video in enumerate(video_urls):
                video_url = url_or_none(video.get('Location'))
                if not video_url:
                    continue
                # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS

                media_type = video.get('MediaType')
                if media_type == 'SS':
                    stream_formats.extend(self._extract_ism_formats(
                        video_url, resource_id,
                        ism_id=f'{stream_id}-{snum}.{unum}',
                        fatal=False))
                elif media_type == 'Dash':
                    stream_formats.extend(self._extract_mpd_formats(
                        video_url, resource_id,
                        mpd_id=f'{stream_id}-{snum}.{unum}',
                        fatal=False))
                else:
                    stream_formats.append({
                        'format_id': f'{stream_id}-{snum}.{unum}',
                        'url': video_url,
                        'ext': mimetype2ext(video.get('MimeType')),
                    })

            if stream.get('HasSlideContent', False):
                images = player_options['PlayerLayoutOptions']['Images']
                stream_formats.append(self.__extract_slides(
                    stream_id=stream_id,
                    snum=snum,
                    stream=stream,
                    duration=presentation.get('Duration'),
                    images=images,
                ))

            # disprefer 'secondary' streams
            if stream_type != 0:
                for fmt in stream_formats:
                    fmt['quality'] = -10

            thumbnail_url = stream.get('ThumbnailUrl')
            if thumbnail_url:
                thumbnails.append({
                    'id': f'{stream_id}-{snum}',
                    'url': urljoin(redirect_url, thumbnail_url),
                    'preference': -1 if stream_type != 0 else 0,
                })
            formats.extend(stream_formats)

        # XXX: Presentation['Presenters']
        # XXX: Presentation['Transcript']

        return {
            'id': resource_id,
            'title': title,
            'description': presentation.get('Description'),
            'duration': float_or_none(presentation.get('Duration'), 1000),
            'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
            'formats': formats,
            'thumbnails': thumbnails,
        }
2019-03-30 14:17:30 -04:00
class MediasiteCatalogIE(InfoExtractor):
    """Extractor for a Mediasite catalog (``/Catalog/Full/<id>``), returned as a
    playlist of presentations that are delegated to MediasiteIE."""

    _VALID_URL = rf'''(?xi)
                        (?P<url>https?://[^/]+/Mediasite)
                        /Catalog/Full/
                        (?P<catalog_id>{_ID_RE})
                        (?:
                            /(?P<current_folder_id>{_ID_RE})
                            /(?P<root_dynamic_folder_id>{_ID_RE})
                        )?
                    '''
    _TESTS = [{
        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21',
        'info_dict': {
            'id': '631f9e48530d454381549f955d08c75e21',
            'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically',
        },
        'playlist_count': 6,
        'expected_warnings': ['is not a supported codec'],
    }, {
        # with CurrentFolderId and RootDynamicFolderId
        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
        'info_dict': {
            'id': '9518c4a6c5cf4993b21cbd53e828a92521',
            'title': 'IUSM Family and Friends Sessions',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21',
        'only_matching': True,
    }, {
        # no AntiForgeryToken
        'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21',
        'only_matching': True,
    }, {
        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
        'only_matching': True,
    }, {
        # dashed id
        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the catalog page, POST to ``GetPresentationsForFolder`` and
        return a playlist with one MediasiteIE entry per listed presentation."""
        mobj = self._match_valid_url(url)
        mediasite_url = mobj.group('url')
        catalog_id = mobj.group('catalog_id')
        # Without an explicit folder in the URL, the catalog id itself is
        # sent as the current folder.
        current_folder_id = mobj.group('current_folder_id') or catalog_id
        root_dynamic_folder_id = mobj.group('root_dynamic_folder_id')

        webpage = self._download_webpage(url, catalog_id)

        # AntiForgeryToken is optional (e.g. [1])
        # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21
        anti_forgery_token = self._search_regex(
            r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
            webpage, 'anti forgery token', default=None, group='value')
        if anti_forgery_token:
            anti_forgery_header = self._search_regex(
                r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
                webpage, 'anti forgery header name',
                default='X-SOFO-AntiForgeryHeader', group='value')

        data = {
            'IsViewPage': True,
            'IsNewFolder': True,
            'AuthTicket': None,
            'CatalogId': catalog_id,
            'CurrentFolderId': current_folder_id,
            'RootDynamicFolderId': root_dynamic_folder_id,
            'ItemsPerPage': 1000,
            'PageIndex': 0,
            'PermissionMask': 'Execute',
            'CatalogSearchType': 'SearchInFolder',
            'SortBy': 'Date',
            'SortDirection': 'Descending',
            'StartDate': None,
            'EndDate': None,
            'StatusFilterList': None,
            'PreviewKey': None,
            'Tags': [],
        }

        headers = {
            'Content-Type': 'application/json; charset=UTF-8',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        }
        # anti_forgery_header is only bound when a token was found above.
        if anti_forgery_token:
            headers[anti_forgery_header] = anti_forgery_token

        catalog = self._download_json(
            f'{mediasite_url}/Catalog/Data/GetPresentationsForFolder',
            catalog_id, data=json.dumps(data).encode(), headers=headers)

        entries = []
        for video in catalog['PresentationDetailsList']:
            # Skip malformed entries and entries without a usable id.
            if not isinstance(video, dict):
                continue
            video_id = str_or_none(video.get('Id'))
            if not video_id:
                continue
            entries.append(self.url_result(
                f'{mediasite_url}/Play/{video_id}',
                ie=MediasiteIE.ie_key(), video_id=video_id))

        title = try_get(
            catalog, lambda x: x['CurrentFolder']['Name'], str)

        return self.playlist_result(entries, catalog_id, title)
2019-04-01 14:13:52 -04:00
class MediasiteNamedCatalogIE(InfoExtractor):
    """Extractor for named catalog URLs (``/Catalog/catalogs/<name>``); resolves
    the name to a catalog id and hands over to MediasiteCatalogIE."""

    _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Read the ``CatalogId`` from the catalog page and return a URL result
        pointing at the matching ``/Catalog/Full/<id>`` page."""
        mobj = self._match_valid_url(url)
        mediasite_url = mobj.group('url')
        catalog_name = mobj.group('catalog_name')

        webpage = self._download_webpage(url, catalog_name)

        catalog_id = self._search_regex(
            rf'CatalogId\s*:\s*["\']({_ID_RE})', webpage, 'catalog id')

        return self.url_result(
            f'{mediasite_url}/Catalog/Full/{catalog_id}',
            ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id)