[bbc] Improve title and description extraction (Closes #8826, closes #8822)

This commit is contained in:
Sergey M․ 2016-03-13 15:54:56 +06:00
parent 8e4aa7bf18
commit 0e832c2c97

View file

@ -563,6 +563,14 @@ class BBCIE(BBCCoUkIE):
'title': 'BBC Blogs - Adam Curtis - BUGGER',
},
'playlist_count': 18,
}, {
# school report playlist with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
'info_dict': {
'id': '35744779',
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
}, {
# single video embedded with data-playable containing vpid
'url': 'http://www.bbc.com/news/world-europe-32041533',
@ -734,8 +742,17 @@ def _real_extract(self, url):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp')
playlist_title = json_ld_info.get('title')
playlist_description = json_ld_info.get('description')
if not playlist_title:
playlist_title = self._og_search_title(
webpage, default=None) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
playlist_description = json_ld_info.get(
'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@ -795,14 +812,6 @@ def _real_extract(self, url):
entries.append(self._extract_from_playlist_sxml(
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
playlist_title = self._og_search_title(webpage, default=None)
playlist_title = playlist_title or self._html_search_regex(
r'<title>(.*?)</title>', webpage, 'playlist title')
playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title)
playlist_description = self._og_search_description(webpage, default=None)
if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)