Move the new BBCNewsIE extractor into the existing bbccouk module and drop the separate bbcnews module

This commit is contained in:
fnord 2015-06-20 08:22:13 -05:00
parent d5552a3477
commit 10273d6e08
3 changed files with 102 additions and 111 deletions

View file

@ -35,8 +35,7 @@
from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .bbcnews import BBCNewsIE
from .bbccouk import BBCCoUkIE, BBCNewsIE
from .beeg import BeegIE
from .behindkink import BehindKinkIE
from .beatportpro import BeatportProIE

View file

@ -5,9 +5,11 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_duration,
int_or_none,
)
from ..compat import compat_HTTPError
import re
class BBCCoUkIE(InfoExtractor):
@ -394,3 +396,102 @@ def _real_extract(self, url):
'formats': formats,
'subtitles': subtitles,
}
class BBCNewsIE(BBCCoUkIE):
    """Extract every video embedded in a BBC News page as one playlist.

    The page is scanned for JSON media descriptors; each descriptor is
    resolved to formats/subtitles through ``self._download_media_selector``
    (not defined in this class -- presumably inherited from BBCCoUkIE).
    """
    IE_NAME = 'bbc.com'
    IE_DESC = 'BBC news'
    _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P<id>[^/]+)'

    # NOTE(review): not read anywhere in this class; presumably consumed by
    # the media selector code of the parent class -- confirm against BBCCoUkIE.
    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'

    _TESTS = [{
        'url': 'http://www.bbc.com/news/world-europe-32668511',
        'info_dict': {
            'id': 'world-europe-32668511',
            'title': 'Russia stages massive WW2 parade despite Western boycott',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.bbc.com/news/business-28299555',
        'info_dict': {
            'id': 'business-28299555',
            'title': 'Farnborough Airshow: Video highlights',
        },
        'playlist_count': 9,
    }, {
        'url': 'http://www.bbc.com/news/world-europe-32041533',
        'note': 'Video',
        'info_dict': {
            'id': 'p02mprgb',
            'ext': 'mp4',
            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'duration': 47,
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Return a playlist result holding one entry per media descriptor.

        Raises ExtractorError when the page contains no media descriptor, or
        when a descriptor carries neither an ``externalId`` nor an ``href``.
        """
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        list_title = self._html_search_regex(
            r'<title>(.*?)(?:\s*-\s*BBC News)?</title>', webpage, 'list title')

        # The page gives the date as digits separated by dashes; upload_date
        # is emitted with the dashes stripped.
        pubdate = self._html_search_regex(
            r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
        if pubdate:
            pubdate = pubdate.replace('-', '')

        # works with bbc.com/news/something-something-123456 articles
        matches = re.findall(r"data-media-meta='({[^']+})'", webpage)
        if not matches:
            # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
            # in http://www.bbc.com/news/video_and_audio/international
            matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
        if not matches:
            raise ExtractorError('No video found', expected=True)

        entries = []
        for raw_meta in matches:
            meta = self._parse_json(raw_meta, list_id)

            programme_id = meta.get('externalId')
            xml_url = meta.get('href')

            title = meta['caption']
            duration = parse_duration(meta.get('duration'))
            description = list_title + ' - ' + title

            # dict.has_key() does not exist on Python 3; look the key up
            # with .get() so the same code runs on Python 2 and 3.
            image = meta.get('image')
            thumbnail = image.get('href') if isinstance(image, dict) else None

            if programme_id:
                formats, subtitles = self._download_media_selector(programme_id)
            elif xml_url:
                # Cheap fallback
                # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
                # programme_id is empty in this branch, so the page id is
                # used as the display id for the download.
                xml = self._download_webpage(
                    xml_url, list_id,
                    'Downloading playlist.sxml for externalId (fallback)')
                programme_id = self._search_regex(
                    r'<mediator [^>]*identifier="(.+?)"', xml,
                    'playlist.sxml (externalId fallback)')
                formats, subtitles = self._download_media_selector(programme_id)
            else:
                raise ExtractorError('data-media-meta entry has no externalId or href value.')

            self._sort_formats(formats)

            entries.append({
                'id': programme_id,
                'uploader': 'BBC News',
                'upload_date': pubdate,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })

        # matches is non-empty here and every iteration either appended an
        # entry or raised, so there is always at least one entry to return.
        return self.playlist_result(entries, list_id, list_title)

View file

@ -1,109 +0,0 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_duration,
int_or_none,
)
from ..compat import compat_HTTPError
import re
from .bbccouk import BBCCoUkIE
class BBCNewsIE(BBCCoUkIE):
    """Extract every video embedded in a BBC News page as one playlist.

    The page is scanned for JSON media descriptors; each descriptor is
    resolved to formats/subtitles through ``self._download_media_selector``
    (not defined in this class -- presumably inherited from BBCCoUkIE).
    """
    IE_NAME = 'bbc.com'
    IE_DESC = 'BBC news'
    _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P<id>[^/]+)'

    # NOTE(review): not read anywhere in this class; presumably consumed by
    # the media selector code of the parent class -- confirm against BBCCoUkIE.
    mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'

    _TESTS = [{
        'url': 'http://www.bbc.com/news/world-europe-32668511',
        'info_dict': {
            'id': 'world-europe-32668511',
            'title': 'Russia stages massive WW2 parade despite Western boycott',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.bbc.com/news/business-28299555',
        'info_dict': {
            'id': 'business-28299555',
            'title': 'Farnborough Airshow: Video highlights',
        },
        'playlist_count': 9,
    }, {
        'url': 'http://www.bbc.com/news/world-europe-32041533',
        'note': 'Video',
        'info_dict': {
            'id': 'p02mprgb',
            'ext': 'mp4',
            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'duration': 47,
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Return a playlist result holding one entry per media descriptor.

        Raises ExtractorError when the page contains no media descriptor, or
        when a descriptor carries neither an ``externalId`` nor an ``href``.
        """
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        list_title = self._html_search_regex(
            r'<title>(.*?)(?:\s*-\s*BBC News)?</title>', webpage, 'list title')

        # The page gives the date as digits separated by dashes; upload_date
        # is emitted with the dashes stripped.
        pubdate = self._html_search_regex(
            r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
        if pubdate:
            pubdate = pubdate.replace('-', '')

        # works with bbc.com/news/something-something-123456 articles
        matches = re.findall(r"data-media-meta='({[^']+})'", webpage)
        if not matches:
            # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
            # in http://www.bbc.com/news/video_and_audio/international
            matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
        if not matches:
            raise ExtractorError('No video found', expected=True)

        entries = []
        for raw_meta in matches:
            meta = self._parse_json(raw_meta, list_id)

            programme_id = meta.get('externalId')
            xml_url = meta.get('href')

            title = meta['caption']
            duration = parse_duration(meta.get('duration'))
            description = list_title + ' - ' + title

            # dict.has_key() does not exist on Python 3; look the key up
            # with .get() so the same code runs on Python 2 and 3.
            image = meta.get('image')
            thumbnail = image.get('href') if isinstance(image, dict) else None

            if programme_id:
                formats, subtitles = self._download_media_selector(programme_id)
            elif xml_url:
                # Cheap fallback
                # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
                # programme_id is empty in this branch, so the page id is
                # used as the display id for the download.
                xml = self._download_webpage(
                    xml_url, list_id,
                    'Downloading playlist.sxml for externalId (fallback)')
                programme_id = self._search_regex(
                    r'<mediator [^>]*identifier="(.+?)"', xml,
                    'playlist.sxml (externalId fallback)')
                formats, subtitles = self._download_media_selector(programme_id)
            else:
                raise ExtractorError('data-media-meta entry has no externalId or href value.')

            self._sort_formats(formats)

            entries.append({
                'id': programme_id,
                'uploader': 'BBC News',
                'upload_date': pubdate,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
                'subtitles': subtitles,
            })

        # matches is non-empty here and every iteration either appended an
        # entry or raised, so there is always at least one entry to return.
        return self.playlist_result(entries, list_id, list_title)