From 78466fcab519d1b92fd9846bc8073885308a7e22 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Dec 2017 02:00:13 +0100 Subject: [PATCH] [shahid] add support for show pages(closes #7401) --- youtube_dl/extractor/aws.py | 78 +++++++++++ youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/scrippsnetworks.py | 103 +++------------ youtube_dl/extractor/shahid.py | 164 +++++++++++++++++------- 4 files changed, 219 insertions(+), 131 deletions(-) create mode 100644 youtube_dl/extractor/aws.py diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py new file mode 100644 index 0000000000..670abce0cc --- /dev/null +++ b/youtube_dl/extractor/aws.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import hashlib +import hmac + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlencode + + +class AWSIE(InfoExtractor): + _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' + _AWS_REGION = 'us-east-1' + + def _aws_execute_api(self, aws_dict, video_id, query=None): + query = query or {} + amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + date = amz_date[:8] + headers = { + 'Accept': 'application/json', + 'Host': self._AWS_PROXY_HOST, + 'X-Amz-Date': amz_date, + } + session_token = aws_dict.get('session_token') + if session_token: + headers['X-Amz-Security-Token'] = session_token + headers['X-Api-Key'] = self._AWS_API_KEY + + def aws_hash(s): + return hashlib.sha256(s.encode('utf-8')).hexdigest() + + # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + canonical_querystring = compat_urllib_parse_urlencode(query) + canonical_headers = '' + for header_name, header_value in headers.items(): + canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) + signed_headers = ';'.join([header.lower() for header in headers.keys()]) + canonical_request = '\n'.join([ + 'GET', + aws_dict['uri'], + canonical_querystring, + canonical_headers, 
+ signed_headers, + aws_hash('') + ]) + + # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request'] + credential_scope = '/'.join(credential_scope_list) + string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)]) + + # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + def aws_hmac(key, msg): + return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) + + def aws_hmac_digest(key, msg): + return aws_hmac(key, msg).digest() + + def aws_hmac_hexdigest(key, msg): + return aws_hmac(key, msg).hexdigest() + + k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8') + for value in credential_scope_list: + k_signing = aws_hmac_digest(k_signing, value) + + signature = aws_hmac_hexdigest(k_signing, string_to_sign) + + # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html + headers['Authorization'] = ', '.join([ + '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope), + 'SignedHeaders=%s' % signed_headers, + 'Signature=%s' % signature, + ]) + + return self._download_json( + 'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' 
+ canonical_querystring if canonical_querystring else ''), + video_id, headers=headers) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4072455137..513074801c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -927,7 +927,10 @@ from .servingsys import ServingSysIE from .servus import ServusIE from .sexu import SexuIE -from .shahid import ShahidIE +from .shahid import ( + ShahidIE, + ShahidShowIE, +) from .shared import ( SharedIE, VivoIE, diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index b446a02bac..4023aeef81 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -1,13 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json import hashlib -import hmac import re -from .common import InfoExtractor +from .aws import AWSIE from .anvato import AnvatoIE from ..utils import ( smuggle_url, @@ -16,7 +14,7 @@ ) -class ScrippsNetworksWatchIE(InfoExtractor): +class ScrippsNetworksWatchIE(AWSIE): IE_NAME = 'scrippsnetworks:watch' _VALID_URL = r'''(?x) https?:// @@ -64,44 +62,27 @@ class ScrippsNetworksWatchIE(InfoExtractor): 'travelchannel': 'trav', 'geniuskitchen': 'genius', } - _SNI_HOST = 'web.api.video.snidigital.com' - _AWS_REGION = 'us-east-1' - _AWS_IDENTITY_ID_JSON = json.dumps({ - 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % _AWS_REGION - }) - _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' - _AWS_SERVICE = 'execute-api' - _AWS_REQUEST = 'aws4_request' - _AWS_SIGNED_HEADERS = ';'.join([ - 'host', 'x-amz-date', 'x-amz-security-token', 'x-api-key']) - _AWS_CANONICAL_REQUEST_TEMPLATE = '''GET -%(uri)s + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' -host:%(host)s -x-amz-date:%(date)s -x-amz-security-token:%(token)s -x-api-key:%(key)s - -%(signed_headers)s -%(payload_hash)s''' + 
_AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site', 'id') - def aws_hash(s): - return hashlib.sha256(s.encode('utf-8')).hexdigest() - + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') token = self._download_json( - 'https://cognito-identity.us-east-1.amazonaws.com/', video_id, - data=self._AWS_IDENTITY_ID_JSON.encode('utf-8'), + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, headers={ 'Accept': '*/*', 'Content-Type': 'application/x-amz-json-1.1', 'Referer': url, - 'X-Amz-Content-Sha256': aws_hash(self._AWS_IDENTITY_ID_JSON), + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', 'X-Amz-User-Agent': self._AWS_USER_AGENT, })['Token'] @@ -124,64 +105,12 @@ def get(key): sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, fatal=True) - access_key_id = get('AccessKeyId') - secret_access_key = get('SecretAccessKey') - session_token = get('SessionToken') - - # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html - uri = '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id) - datetime_now = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') - date = datetime_now[:8] - canonical_string = self._AWS_CANONICAL_REQUEST_TEMPLATE % { - 'uri': uri, - 'host': self._SNI_HOST, - 'date': datetime_now, - 'token': session_token, - 'key': self._AWS_API_KEY, - 'signed_headers': self._AWS_SIGNED_HEADERS, - 'payload_hash': aws_hash(''), - } - - # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html - credential_string = '/'.join([date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]) - string_to_sign = '\n'.join([ - 'AWS4-HMAC-SHA256', datetime_now, 
credential_string, - aws_hash(canonical_string)]) - - # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html - def aws_hmac(key, msg): - return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) - - def aws_hmac_digest(key, msg): - return aws_hmac(key, msg).digest() - - def aws_hmac_hexdigest(key, msg): - return aws_hmac(key, msg).hexdigest() - - k_secret = 'AWS4' + secret_access_key - k_date = aws_hmac_digest(k_secret.encode('utf-8'), date) - k_region = aws_hmac_digest(k_date, self._AWS_REGION) - k_service = aws_hmac_digest(k_region, self._AWS_SERVICE) - k_signing = aws_hmac_digest(k_service, self._AWS_REQUEST) - - signature = aws_hmac_hexdigest(k_signing, string_to_sign) - - auth_header = ', '.join([ - 'AWS4-HMAC-SHA256 Credential=%s' % '/'.join( - [access_key_id, date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]), - 'SignedHeaders=%s' % self._AWS_SIGNED_HEADERS, - 'Signature=%s' % signature, - ]) - - mcp_id = self._download_json( - 'https://%s%s' % (self._SNI_HOST, uri), video_id, headers={ - 'Accept': '*/*', - 'Referer': url, - 'Authorization': auth_header, - 'X-Amz-Date': datetime_now, - 'X-Amz-Security-Token': session_token, - 'X-Api-Key': self._AWS_API_KEY, - })['results'][0]['mcpId'] + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] return self.url_result( smuggle_url( diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 374f7faf9d..5c2a6206be 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,22 +1,53 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json +import math +import re -from .common import InfoExtractor +from .aws import AWSIE from ..compat import compat_HTTPError from ..utils import ( + 
clean_html, ExtractorError, + InAdvancePagedList, int_or_none, parse_iso8601, str_or_none, urlencode_postdata, - clean_html, ) -class ShahidIE(InfoExtractor): +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)' _TESTS = [{ @@ -41,34 +72,25 @@ class ShahidIE(InfoExtractor): 'only_matching': True }] - def _api2_request(self, *args, **kwargs): - try: - return self._download_json(*args, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) - if fail_data: - faults = fail_data.get('faults', []) - faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) - if faults_message: - raise ExtractorError(faults_message, expected=True) - raise - def _real_initialize(self): email, password = self._get_login_info() if email is None: return - user_data = self._api2_request( - 
'https://shahid.mbc.net/wd/service/users/login', - None, 'Logging in', data=json.dumps({ - 'email': email, - 'password': password, - 'basic': 'false', - }).encode('utf-8'), headers={ - 'Content-Type': 'application/json; charset=UTF-8', - })['user'] + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise self._download_webpage( 'https://shahid.mbc.net/populateContext', @@ -81,25 +103,13 @@ def _real_initialize(self): 'sessionId': user_data['sessionId'], })) - def _get_api_data(self, response): - data = response.get('data', {}) - - error = data.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), - expected=True) - - return data - def _real_extract(self, url): page_type, video_id = re.match(self._VALID_URL, url).groups() if page_type == 'clip': page_type = 'episode' - playout = self._api2_request( - 'https://api2.shahid.net/proxy/v2/playout/url/' + video_id, - video_id, 'Downloading player JSON')['playout'] + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) @@ -107,13 +117,27 @@ def _real_extract(self, url): formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') self._sort_formats(formats) - video = self._get_api_data(self._download_json( + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), video_id, 'Downloading video 
JSON', query={ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }))[page_type] + }) + data = response.get('data', {}) + error = data.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + video = data[page_type] title = video['title'] categories = [ category['name'] @@ -135,3 +159,57 @@ def _real_extract(self, url): 'episode_id': video_id, 'formats': formats, } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, 
show.get('title'), show.get('description'))