From 605b684c2dcfd583312717e90fa3e6ec1044b2ed Mon Sep 17 00:00:00 2001 From: nixxo Date: Mon, 8 Mar 2021 14:40:27 +0100 Subject: [PATCH] [mtv] Add mtv.it and extract series metadata (#156) * New extractors: MTVItalia, MTVItaliaProgramma * Extract fields: series, season_number, episode_number Authored-by: nixxo --- yt_dlp/extractor/extractors.py | 2 + yt_dlp/extractor/mtv.py | 169 +++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 580838d0f..3dc2c10f8 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -732,6 +732,8 @@ MTVServicesEmbeddedIE, MTVDEIE, MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, ) from .muenchentv import MuenchenTVIE from .mwave import MwaveIE, MwaveMeetGreetIE diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index f5e30d22d..f96226e56 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -14,6 +14,7 @@ fix_xml_ampersands, float_or_none, HEADRequest, + int_or_none, RegexNotFoundError, sanitized_Request, strip_or_none, @@ -176,6 +177,22 @@ def _get_video_info(self, itemdoc, use_hls=True): raise ExtractorError('Could not find video title') title = title.strip() + series = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:franchise') + season = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:seasonN') + episode = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:episodeN') + series = series.text if series is not None else None + season = season.text if season is not None else None + episode = episode.text if episode is not None else None + if season and episode: + # episode number includes season, so remove it + episode = re.sub(r'^%s' % season, '', episode) + # This a short id that's used in the webpage urls mtvn_id = None mtvn_id_node = 
find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category', @@ -201,6 +218,9 @@ def _get_video_info(self, itemdoc, use_hls=True): 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), 'timestamp': timestamp, + 'series': series, + 'season_number': int_or_none(season), + 'episode_number': int_or_none(episode), } def _get_feed_query(self, uri): @@ -483,3 +503,152 @@ def _get_feed_query(self, uri): 'arcEp': 'mtv.de', 'mgid': uri, } + + +class MTVItaliaIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.it' + _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:episodi|video|musica)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.mtv.it/episodi/24bqab/mario-una-serie-di-maccio-capatonda-cavoli-amario-episodio-completo-S1-E1', + 'info_dict': { + 'id': '0f0fc78e-45fc-4cce-8f24-971c25477530', + 'ext': 'mp4', + 'title': 'Cavoli amario (episodio completo)', + 'description': 'md5:4962bccea8fed5b7c03b295ae1340660', + 'series': 'Mario - Una Serie Di Maccio Capatonda', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }] + _GEO_COUNTRIES = ['IT'] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + + def _get_feed_query(self, uri): + return { + 'arcEp': 'mtv.it', + 'mgid': uri, + } + + +class MTVItaliaProgrammaIE(MTVItaliaIE): + IE_NAME = 'mtv.it:programma' + _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + # program page: general + 'url': 'http://www.mtv.it/programmi/s2rppv/mario-una-serie-di-maccio-capatonda', + 'info_dict': { + 'id': 'a6f155bc-8220-4640-aa43-9b95f64ffa3d', + 'title': 'Mario - Una Serie Di Maccio Capatonda', + 'description': 'md5:72fbffe1f77ccf4e90757dd4e3216153', + }, + 'playlist_count': 2, + 'params': { + 'skip_download': True, + }, + }, { + # program page: specific season + 'url': 'http://www.mtv.it/programmi/d9ncjf/mario-una-serie-di-maccio-capatonda-S2', + 'info_dict': { + 'id': 
'4deeb5d8-f272-490c-bde2-ff8d261c6dd1', + 'title': 'Mario - Una Serie Di Maccio Capatonda - Stagione 2', + }, + 'playlist_count': 34, + 'params': { + 'skip_download': True, + }, + }, { + # playlist page + redirect + 'url': 'http://www.mtv.it/playlist/sexy-videos/ilctal', + 'info_dict': { + 'id': 'dee8f9ee-756d-493b-bf37-16d1d2783359', + 'title': 'Sexy Videos', + }, + 'playlist_mincount': 145, + 'params': { + 'skip_download': True, + }, + }] + _GEO_COUNTRIES = ['IT'] + _FEED_URL = 'http://www.mtv.it/feeds/triforce/manifest/v8' + + def _get_entries(self, title, url): + while True: + pg = self._search_regex(r'/(\d+)$', url, 'entries', '1') + entries = self._download_json(url, title, 'page %s' % pg) + url = try_get( + entries, lambda x: x['result']['nextPageURL'], compat_str) + entries = try_get( + entries, ( + lambda x: x['result']['data']['items'], + lambda x: x['result']['data']['seasons']), + list) + for entry in entries or []: + if entry.get('canonicalURL'): + yield self.url_result(entry['canonicalURL']) + if not url: + break + + def _real_extract(self, url): + query = {'url': url} + info_url = update_url_query(self._FEED_URL, query) + video_id = self._match_id(url) + info = self._download_json(info_url, video_id).get('manifest') + + redirect = try_get( + info, lambda x: x['newLocation']['url'], compat_str) + if redirect: + return self.url_result(redirect) + + title = info.get('title') + video_id = try_get( + info, lambda x: x['reporting']['itemId'], compat_str) + parent_id = try_get( + info, lambda x: x['reporting']['parentId'], compat_str) + + playlist_url = current_url = None + for z in (info.get('zones') or {}).values(): + if z.get('moduleName') in ('INTL_M304', 'INTL_M209'): + info_url = z.get('feed') + if z.get('moduleName') in ('INTL_M308', 'INTL_M317'): + playlist_url = playlist_url or z.get('feed') + if z.get('moduleName') in ('INTL_M300',): + current_url = current_url or z.get('feed') + + if not info_url: + raise ExtractorError('No info found') + + if 
video_id == parent_id: + video_id = self._search_regex( + r'([^\/]+)/[^\/]+$', info_url, 'video_id') + + info = self._download_json(info_url, video_id, 'Show infos') + info = try_get(info, lambda x: x['result']['data'], dict) + title = title or try_get( + info, ( + lambda x: x['title'], + lambda x: x['headline']), + compat_str) + description = try_get(info, lambda x: x['content'], compat_str) + + if current_url: + season = try_get( + self._download_json(playlist_url, video_id, 'Seasons info'), + lambda x: x['result']['data'], dict) + current = try_get( + season, lambda x: x['currentSeason'], compat_str) + seasons = try_get( + season, lambda x: x['seasons'], list) or [] + + if current in [s.get('eTitle') for s in seasons]: + playlist_url = current_url + + title = re.sub( + r'[-|]\s*(?:mtv\s*italia|programma|playlist)', + '', title, flags=re.IGNORECASE).strip() + + return self.playlist_result( + self._get_entries(title, playlist_url), + video_id, title, description)