mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-14 20:38:11 -05:00
[ie/CNN] Fix extractor (#10185)
Closes #2640, Closes #9719
Authored by: kylegustavo, seproDev
Co-authored-by: sepro <sepro@sepr0.com>
This commit is contained in:
parent
57212a5f97
commit
9acf79c91a
2 changed files with 195 additions and 117 deletions
|
@ -401,8 +401,6 @@
|
||||||
from .cnbc import CNBCVideoIE
|
from .cnbc import CNBCVideoIE
|
||||||
from .cnn import (
|
from .cnn import (
|
||||||
CNNIE,
|
CNNIE,
|
||||||
CNNArticleIE,
|
|
||||||
CNNBlogsIE,
|
|
||||||
CNNIndonesiaIE,
|
CNNIndonesiaIE,
|
||||||
)
|
)
|
||||||
from .comedycentral import (
|
from .comedycentral import (
|
||||||
|
|
|
@ -1,146 +1,226 @@
|
||||||
|
import functools
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from .turner import TurnerBaseIE
|
from ..utils import (
|
||||||
from ..utils import merge_dicts, try_call, url_basename
|
clean_html,
|
||||||
|
extract_attributes,
|
||||||
|
int_or_none,
|
||||||
|
merge_dicts,
|
||||||
|
parse_duration,
|
||||||
|
parse_iso8601,
|
||||||
|
parse_resolution,
|
||||||
|
try_call,
|
||||||
|
update_url,
|
||||||
|
url_or_none,
|
||||||
|
)
|
||||||
|
from ..utils.traversal import find_elements, traverse_obj
|
||||||
|
|
||||||
|
|
||||||
class CNNIE(TurnerBaseIE):
|
class CNNIE(InfoExtractor):
|
||||||
_VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
|
_VALID_URL = r'https?://(?:(?:edition|www|money|cnnespanol)\.)?cnn\.com/(?!audio/)(?P<display_id>[^?#]+?)(?:[?#]|$|/index\.html)'
|
||||||
(?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
|
|
||||||
|
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
|
'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
|
||||||
'md5': '3e6121ea48df7e2259fe73a0628605c4',
|
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
|
'id': 'med0e97ad0d154f56e29aa96e57192a14226734b6b',
|
||||||
|
'display_id': '2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': 'Nadal wins 8th French Open title',
|
'upload_date': '20240531',
|
||||||
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
|
'description': 'md5:844bcdb0629e1877a7a466c913f4c19c',
|
||||||
'duration': 135,
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/gettyimages-2151936122.jpg?c=original',
|
||||||
'upload_date': '20130609',
|
'duration': 373.0,
|
||||||
|
'timestamp': 1717148586,
|
||||||
|
'title': 'Borussia Dortmund star Jadon Sancho seeks Wembley redemption after 2020 Euros hurt',
|
||||||
|
'modified_date': '20240531',
|
||||||
|
'modified_timestamp': 1717150140,
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Failed to download m3u8 information'],
|
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
|
'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
|
||||||
'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
|
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
|
'id': 'me522945c4709b299e5cb8657900a7a21ad3b559f9',
|
||||||
|
'display_id': '2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': "Student's epic speech stuns new freshmen",
|
'description': 'md5:e0120fe5da9ad8259fd707c1cbb64a60',
|
||||||
'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."',
|
'title': 'Here’s how some inmates in closely divided state are now able to vote from jail',
|
||||||
'upload_date': '20130821',
|
'timestamp': 1718158269,
|
||||||
|
'upload_date': '20240612',
|
||||||
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701554-13565-571-still.jpg?c=original',
|
||||||
|
'duration': 202.0,
|
||||||
|
'modified_date': '20240612',
|
||||||
|
'modified_timestamp': 1718158509,
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Failed to download m3u8 information'],
|
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
|
'url': 'https://edition.cnn.com/2024/06/11/style/king-charles-portrait-vandalized/index.html',
|
||||||
'md5': 'f14d02ebd264df951feb2400e2c25a1b',
|
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
|
'id': 'mef5f52b9e1fe28b1ad192afcbc9206ae984894b68',
|
||||||
|
'display_id': '2024/06/11/style/king-charles-portrait-vandalized',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': 'Nashville Ep. 1: Hand crafted skateboards',
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701257-8846-816-still.jpg?c=original',
|
||||||
'description': 'md5:e7223a503315c9f150acac52e76de086',
|
'description': 'md5:19f78338ccec533db0fa8a4511012dae',
|
||||||
'upload_date': '20141222',
|
'title': 'Video shows King Charles\' portrait being vandalized by activists',
|
||||||
|
'timestamp': 1718113852,
|
||||||
|
'upload_date': '20240611',
|
||||||
|
'duration': 51.0,
|
||||||
|
'modified_timestamp': 1718116193,
|
||||||
|
'modified_date': '20240611',
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Failed to download m3u8 information'],
|
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
|
'url': 'https://edition.cnn.com/videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
|
||||||
'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
|
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
|
'id': 'mefba13799201b084ea3b1d0f7ca820ae94d4bb5b2',
|
||||||
|
'display_id': 'videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': '5 stunning stats about Netflix',
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/221205163510-robin-meade-sign-off.jpg?c=original',
|
||||||
'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
|
'duration': 158.0,
|
||||||
'upload_date': '20160819',
|
'title': 'Robin Meade signs off after HLN\'s last broadcast',
|
||||||
|
'description': 'md5:cff3c62d18d2fbc6c5c75cb029b7353b',
|
||||||
|
'upload_date': '20221205',
|
||||||
|
'timestamp': 1670284296,
|
||||||
|
'modified_timestamp': 1670332404,
|
||||||
|
'modified_date': '20221206',
|
||||||
},
|
},
|
||||||
'params': {
|
'params': {'format': 'direct'},
|
||||||
# m3u8 download
|
}, {
|
||||||
'skip_download': True,
|
'url': 'https://cnnespanol.cnn.com/video/ataque-misil-israel-beirut-libano-octubre-trax',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'me484a43722642aa00627b812fe928f2e99c6e2997',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'display_id': 'video/ataque-misil-israel-beirut-libano-octubre-trax',
|
||||||
|
'timestamp': 1729501452,
|
||||||
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/ataqeubeirut-1.jpg?c=original',
|
||||||
|
'description': 'md5:256ee7137d161f776cda429654135e52',
|
||||||
|
'upload_date': '20241021',
|
||||||
|
'duration': 31.0,
|
||||||
|
'title': 'VIDEO | Israel lanza un nuevo ataque sobre Beirut',
|
||||||
|
'modified_date': '20241021',
|
||||||
|
'modified_timestamp': 1729501530,
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
|
'url': 'https://edition.cnn.com/2024/10/16/politics/kamala-harris-fox-news-interview/index.html',
|
||||||
'only_matching': True,
|
'info_dict': {
|
||||||
}, {
|
'id': '2024/10/16/politics/kamala-harris-fox-news-interview',
|
||||||
'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
|
},
|
||||||
'only_matching': True,
|
'playlist_count': 2,
|
||||||
}, {
|
'playlist': [{
|
||||||
'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
|
'md5': '073ffab87b8bef97c9913e71cc18ef9e',
|
||||||
'only_matching': True,
|
'info_dict': {
|
||||||
|
'id': 'me19d548fdd54df0924087039283128ef473ab397d',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': '\'I\'m not finished\': Harris interview with Fox News gets heated',
|
||||||
|
'display_id': 'kamala-harris-fox-news-interview-ebof-digvid',
|
||||||
|
'description': 'md5:e7dd3d1a04df916062230b60ca419a0a',
|
||||||
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/harris-20241016234916617.jpg?c=original',
|
||||||
|
'duration': 173.0,
|
||||||
|
'timestamp': 1729122182,
|
||||||
|
'upload_date': '20241016',
|
||||||
|
'modified_timestamp': 1729194706,
|
||||||
|
'modified_date': '20241017',
|
||||||
|
},
|
||||||
|
'params': {'format': 'direct'},
|
||||||
|
}, {
|
||||||
|
'md5': '11604ab4af83b650826753f1ccb8ecff',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'med04507d8ca3da827001f63d22af321ec29c7d97b',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': '\'Wise\': Buttigieg on Harris\' handling of interview question about gender transition surgery',
|
||||||
|
'display_id': 'pete-buttigieg-harris-fox-newssrc-digvid',
|
||||||
|
'description': 'md5:602a8a7e853ed5e574acd3159428c98e',
|
||||||
|
'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/buttigieg-20241017040412074.jpg?c=original',
|
||||||
|
'duration': 145.0,
|
||||||
|
'timestamp': 1729137765,
|
||||||
|
'upload_date': '20241017',
|
||||||
|
'modified_timestamp': 1729138184,
|
||||||
|
'modified_date': '20241017',
|
||||||
|
},
|
||||||
|
'params': {'format': 'direct'},
|
||||||
|
}],
|
||||||
}]
|
}]
|
||||||
|
|
||||||
_CONFIG = {
|
|
||||||
# http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
|
|
||||||
'edition': {
|
|
||||||
'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
|
|
||||||
'media_src': 'http://pmd.cdn.turner.com/cnn/big',
|
|
||||||
},
|
|
||||||
# http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
|
|
||||||
'money': {
|
|
||||||
'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
|
|
||||||
'media_src': 'http://ht3.cdn.turner.com/money/big',
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_timestamp(self, video_data):
|
|
||||||
# TODO: fix timestamp extraction
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
sub_domain, path, page_title = self._match_valid_url(url).groups()
|
display_id = self._match_valid_url(url).group('display_id')
|
||||||
if sub_domain not in ('money', 'edition'):
|
webpage = self._download_webpage(url, display_id)
|
||||||
sub_domain = 'edition'
|
app_id = traverse_obj(
|
||||||
config = self._CONFIG[sub_domain]
|
self._search_json(r'window\.env\s*=', webpage, 'window env', display_id, default={}),
|
||||||
return self._extract_cvp_info(
|
('TOP_AUTH_SERVICE_APP_ID', {str}))
|
||||||
config['data_src'] % path, page_title, {
|
|
||||||
'default': {
|
entries = []
|
||||||
'media_src': config['media_src'],
|
for player_data in traverse_obj(webpage, (
|
||||||
},
|
{find_elements(tag='div', attr='data-component-name', value='video-player', html=True)},
|
||||||
'f4m': {
|
..., {extract_attributes}, all, lambda _, v: v['data-media-id'])):
|
||||||
'host': 'cnn-vh.akamaihd.net',
|
media_id = player_data['data-media-id']
|
||||||
},
|
parent_uri = player_data.get('data-video-resource-parent-uri')
|
||||||
|
formats, subtitles = [], {}
|
||||||
|
|
||||||
|
video_data = {}
|
||||||
|
if parent_uri:
|
||||||
|
video_data = self._download_json(
|
||||||
|
'https://fave.api.cnn.io/v1/video', media_id, fatal=False,
|
||||||
|
query={
|
||||||
|
'id': media_id,
|
||||||
|
'stellarUri': parent_uri,
|
||||||
|
})
|
||||||
|
for direct_url in traverse_obj(video_data, ('files', ..., 'fileUri', {url_or_none})):
|
||||||
|
resolution, bitrate = None, None
|
||||||
|
if mobj := re.search(r'-(?P<res>\d+x\d+)_(?P<tbr>\d+)k\.mp4', direct_url):
|
||||||
|
resolution, bitrate = mobj.group('res', 'tbr')
|
||||||
|
formats.append({
|
||||||
|
'url': direct_url,
|
||||||
|
'format_id': 'direct',
|
||||||
|
'quality': 1,
|
||||||
|
'tbr': int_or_none(bitrate),
|
||||||
|
**parse_resolution(resolution),
|
||||||
|
})
|
||||||
|
for sub_data in traverse_obj(video_data, (
|
||||||
|
'closedCaptions', 'types', lambda _, v: url_or_none(v['track']['url']), 'track')):
|
||||||
|
subtitles.setdefault(sub_data.get('lang') or 'en', []).append({
|
||||||
|
'url': sub_data['url'],
|
||||||
|
'name': sub_data.get('label'),
|
||||||
|
})
|
||||||
|
|
||||||
|
if app_id:
|
||||||
|
media_data = self._download_json(
|
||||||
|
f'https://medium.ngtv.io/v2/media/{media_id}/desktop', media_id, fatal=False,
|
||||||
|
query={'appId': app_id})
|
||||||
|
m3u8_url = traverse_obj(media_data, (
|
||||||
|
'media', 'desktop', 'unprotected', 'unencrypted', 'url', {url_or_none}))
|
||||||
|
if m3u8_url:
|
||||||
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
|
m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
|
|
||||||
|
entries.append({
|
||||||
|
**traverse_obj(player_data, {
|
||||||
|
'title': ('data-headline', {clean_html}),
|
||||||
|
'description': ('data-description', {clean_html}),
|
||||||
|
'duration': ('data-duration', {parse_duration}),
|
||||||
|
'timestamp': ('data-publish-date', {parse_iso8601}),
|
||||||
|
'thumbnail': (
|
||||||
|
'data-poster-image-override', {json.loads}, 'big', 'uri', {url_or_none},
|
||||||
|
{functools.partial(update_url, query='c=original')}),
|
||||||
|
'display_id': 'data-video-slug',
|
||||||
|
}),
|
||||||
|
**traverse_obj(video_data, {
|
||||||
|
'timestamp': ('dateCreated', 'uts', {int_or_none(scale=1000)}),
|
||||||
|
'description': ('description', {clean_html}),
|
||||||
|
'title': ('headline', {str}),
|
||||||
|
'modified_timestamp': ('lastModified', 'uts', {int_or_none(scale=1000)}),
|
||||||
|
'duration': ('trt', {int_or_none}),
|
||||||
|
}),
|
||||||
|
'id': media_id,
|
||||||
|
'formats': formats,
|
||||||
|
'subtitles': subtitles,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if len(entries) == 1:
|
||||||
|
return {
|
||||||
|
**entries[0],
|
||||||
|
'display_id': display_id,
|
||||||
|
}
|
||||||
|
|
||||||
class CNNBlogsIE(InfoExtractor):
|
return self.playlist_result(entries, display_id)
|
||||||
_VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
|
|
||||||
_TEST = {
|
|
||||||
'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
|
|
||||||
'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
|
|
||||||
'info_dict': {
|
|
||||||
'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
|
|
||||||
'ext': 'mp4',
|
|
||||||
'title': 'Criminalizing journalism?',
|
|
||||||
'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
|
|
||||||
'upload_date': '20140209',
|
|
||||||
},
|
|
||||||
'expected_warnings': ['Failed to download m3u8 information'],
|
|
||||||
'add_ie': ['CNN'],
|
|
||||||
}
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
|
||||||
webpage = self._download_webpage(url, url_basename(url))
|
|
||||||
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
|
|
||||||
return self.url_result(cnn_url, CNNIE.ie_key())
|
|
||||||
|
|
||||||
|
|
||||||
class CNNArticleIE(InfoExtractor):
|
|
||||||
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
|
|
||||||
_TEST = {
|
|
||||||
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
|
|
||||||
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
|
|
||||||
'info_dict': {
|
|
||||||
'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
|
|
||||||
'ext': 'mp4',
|
|
||||||
'title': 'Obama: Cyberattack not an act of war',
|
|
||||||
'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
|
|
||||||
'upload_date': '20141221',
|
|
||||||
},
|
|
||||||
'expected_warnings': ['Failed to download m3u8 information'],
|
|
||||||
'add_ie': ['CNN'],
|
|
||||||
}
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
|
||||||
webpage = self._download_webpage(url, url_basename(url))
|
|
||||||
cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
|
|
||||||
return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
|
|
||||||
|
|
||||||
|
|
||||||
class CNNIndonesiaIE(InfoExtractor):
|
class CNNIndonesiaIE(InfoExtractor):
|
||||||
|
|
Loading…
Reference in a new issue