From 0e832c2c97c62f67593ad356ea6d507778c56759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Mar 2016 15:54:56 +0600 Subject: [PATCH] [bbc] Improve title and description extraction (Closes #8826, closes #8822) --- youtube_dl/extractor/bbc.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index f4d8b4a2f..497ebfd72 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -563,6 +563,14 @@ class BBCIE(BBCCoUkIE): 'title': 'BBC Blogs - Adam Curtis - BUGGER', }, 'playlist_count': 18, + }, { + # school report playlist with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, }, { # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', @@ -734,8 +742,17 @@ def _real_extract(self, url): json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') - playlist_description = json_ld_info.get('description') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) if not timestamp: timestamp = parse_iso8601(self._search_regex( @@ -795,14 +812,6 @@ def _real_extract(self, url): entries.append(self._extract_from_playlist_sxml( playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) - playlist_title = self._og_search_title(webpage, default=None) - playlist_title = playlist_title or 
self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'playlist title') - - playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title) - - playlist_description = self._og_search_description(webpage, default=None) - if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)