2013-06-23 13:58:33 -04:00
# coding: utf-8
2014-09-13 01:51:06 -04:00
from __future__ import unicode_literals
2013-09-22 04:37:23 -04:00
import itertools
2013-06-23 13:58:33 -04:00
import json
2013-09-21 18:35:03 -04:00
import os . path
2016-02-29 14:01:33 -05:00
import random
2013-06-23 13:58:33 -04:00
import re
2014-11-29 18:03:59 -05:00
import time
2013-09-21 08:19:30 -04:00
import traceback
2013-06-23 13:58:33 -04:00
2013-06-23 14:28:15 -04:00
from . common import InfoExtractor , SearchInfoExtractor
2014-03-30 01:02:58 -04:00
from . . jsinterp import JSInterpreter
2014-07-18 04:24:28 -04:00
from . . swfinterp import SWFInterpreter
2014-12-11 04:08:17 -05:00
from . . compat import (
2013-09-22 04:30:02 -04:00
compat_chr ,
2019-05-05 14:12:32 -04:00
compat_HTTPError ,
2017-09-01 13:57:14 -04:00
compat_kwargs ,
2013-06-23 13:58:33 -04:00
compat_parse_qs ,
2015-07-17 13:51:57 -04:00
compat_urllib_parse_unquote ,
compat_urllib_parse_unquote_plus ,
2016-03-25 15:46:57 -04:00
compat_urllib_parse_urlencode ,
2015-07-20 15:10:28 -04:00
compat_urllib_parse_urlparse ,
2013-10-01 11:58:13 -04:00
compat_urlparse ,
2013-06-23 13:58:33 -04:00
compat_str ,
2014-12-11 04:08:17 -05:00
)
from . . utils import (
2019-07-11 16:45:58 -04:00
bool_or_none ,
2013-06-23 13:58:33 -04:00
clean_html ,
2015-12-19 20:00:39 -05:00
error_to_compat_str ,
2019-08-12 18:02:52 -04:00
extract_attributes ,
2013-06-23 13:58:33 -04:00
ExtractorError ,
2015-02-11 12:39:31 -05:00
float_or_none ,
2014-12-11 04:08:17 -05:00
get_element_by_attribute ,
get_element_by_id ,
2014-01-18 23:47:20 -05:00
int_or_none ,
2016-01-24 12:02:19 -05:00
mimetype2ext ,
2014-12-11 04:08:17 -05:00
orderedSet ,
2017-02-12 06:09:53 -05:00
parse_codecs ,
2020-09-15 11:16:58 -04:00
parse_count ,
2015-07-20 15:10:28 -04:00
parse_duration ,
2015-12-14 10:31:53 -05:00
remove_quotes ,
2017-05-06 17:19:11 -04:00
remove_start ,
2015-07-25 11:30:34 -04:00
smuggle_url ,
2018-11-02 19:26:16 -04:00
str_or_none ,
2015-06-28 14:48:06 -04:00
str_to_int ,
2017-01-26 09:43:14 -05:00
try_get ,
2013-06-23 13:58:33 -04:00
unescapeHTML ,
unified_strdate ,
2015-07-25 11:30:34 -04:00
unsmuggle_url ,
2014-02-09 11:56:10 -05:00
uppercase_escape ,
2018-12-16 07:35:48 -05:00
url_or_none ,
2016-03-25 16:19:24 -04:00
urlencode_postdata ,
2013-06-23 13:58:33 -04:00
)
2014-11-23 14:41:03 -05:00
2013-09-11 09:48:23 -04:00
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Holds the Google sign-in endpoints, the login routine and a few small
    helpers shared by the YouTube extractors defined in this module.
    """

    # Google account sign-in endpoints
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the TL token taken from the challenge response
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Sent along with "load more" JSON requests (see _entries in subclasses)
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (includes hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided
        (neither credentials nor a cookie file), an ExtractorError is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            cookiefile = self._downloader.params.get('cookiefile')
            if self._LOGIN_REQUIRED and cookiefile is None:
                raise ExtractorError(
                    'No login info available, needed for using %s.' % self.IE_NAME,
                    expected=True)
            # TODO remove 'and False' later - too many people using outdated
            # cookies and open issues, remind them.
            if cookiefile and False:
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST f_req (JSON-encoded) on top of the hidden login form fields.

            Returns the decoded JSON, or False on failure (fatal=False).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Discard anything the server sends before the first '['
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same continue URL is embedded in both the lookup and the
        # challenge payloads.
        service_login_url = (
            'https://accounts.google.com/ServiceLogin?passive=true'
            '&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F'
            '%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop'
            '%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3'
            '&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn')

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised on purpose: '%' binds tighter than the conditional
            # expression, so without the parentheses any message other than
            # INCORRECT_ANSWER_ENTERED would be reported without the prefix.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password warning above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Call the parent implementation with disable_polymer=true in the query.

        The caller's query dict is copied, never mutated. An explicit
        query=None is treated like an absent query.
        """
        query = (kwargs.get('query') or {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached. A failed login is only
        reported through warnings emitted by _login(); it is not fatal here.
        """
        if self._downloader is None:
            return
        self._set_language()
        self._login()
2013-06-23 13:58:33 -04:00
2013-08-08 02:54:10 -04:00
2016-01-31 06:49:59 -05:00
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for extractors whose pages continue through a "Load more" button."""

    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from page and from every following "load more" page.

        Each chunk of HTML is handed to self._process_page(). The next chunk
        is located through the data-uix-load-more-href attribute and fetched
        as JSON; iteration stops when that attribute is missing or the
        returned content_html is blank.
        """
        max_retries = 3
        widget_html = items_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(items_html):
                yield entry

            next_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if next_link is None:
                return
            more_url = 'https://www.youtube.com/%s' % next_link.group('more')

            # One initial attempt plus up to max_retries retries
            for attempt in range(max_retries + 1):
                retry_note = ' (retry #%d)' % attempt if attempt else ''
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        more_url, playlist_id,
                        'Downloading page #%s%s' % (page_num, retry_note),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                except ExtractorError as err:
                    is_transient = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code in (500, 503))
                    if is_transient and attempt < max_retries:
                        continue
                    raise
                break

            items_html = more['content_html']
            if not items_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
2015-11-21 18:01:01 -05:00
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are single videos.

    Reads self._VIDEO_RE, which is not defined in this class.
    """

    def _process_page(self, content):
        """Yield a 'Youtube' url_result for each (id, title) pair found in content."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video ids and titles matched by video_re in page.

        ids_in_page and titles_in_page are parallel lists that are updated in
        place: a new id is appended together with its title (possibly None);
        for an id already present, a missing title is filled in when this
        match provides one.
        """
        for match in re.finditer(video_re, page):
            named = match.groupdict()
            video_id = match.group('id')
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not 'index', with '0' - confirm intent
            if 'index' in named and video_id == '0':
                continue

            if 'title' in named:
                video_title = unescapeHTML(match.group('title'))
            else:
                video_title = None
            if video_title:
                video_title = video_title.strip()
            # The "play all" link text is not a real title
            if video_title == '► Play all':
                video_title = None

            if video_id in ids_in_page:
                pos = ids_in_page.index(video_id)
                if video_title and not titles_in_page[pos]:
                    titles_in_page[pos] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return zipped (video_id, title) pairs found in page via self._VIDEO_RE."""
        found_ids, found_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, found_ids, found_titles)
        return zip(found_ids, found_titles)
2015-11-21 18:01:01 -05:00
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are playlists."""

    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result per distinct playlist id in content."""
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet() is applied so each playlist id is emitted only once
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download url and return a playlist result built from its entries.

        The title comes from the page's Open Graph title and is non-fatal
        when it cannot be found.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
2015-11-21 17:17:07 -05:00
2015-02-16 15:44:17 -05:00
class YoutubeIE ( YoutubeBaseInfoExtractor ) :
2014-09-13 01:51:06 -04:00
IE_DESC = ' YouTube.com '
2013-11-18 10:42:35 -05:00
_VALID_URL = r """ (?x)^
2013-06-23 13:58:33 -04:00
(
2014-09-11 15:47:25 -04:00
( ? : https ? : / / | / / ) # http(s):// or protocol-independent URL
2019-11-30 11:51:34 -05:00
( ? : ( ? : ( ? : ( ? : \w + \. ) ? [ yY ] [ oO ] [ uU ] [ tT ] [ uU ] [ bB ] [ eE ] ( ? : - nocookie | kids ) ? \. com / |
2014-01-16 20:53:34 -05:00
( ? : www \. ) ? deturl \. com / www \. youtube \. com / |
2014-02-09 19:30:47 -05:00
( ? : www \. ) ? pwnyoutube \. com / |
2017-10-07 10:59:04 -04:00
( ? : www \. ) ? hooktube \. com / |
2014-02-18 14:00:54 -05:00
( ? : www \. ) ? yourepeat \. com / |
2013-09-15 06:14:59 -04:00
tube \. majestyc \. net / |
2019-07-13 14:23:22 -04:00
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
2019-03-16 20:15:15 -04:00
( ? : ( ? : www | dev ) \. ) ? invidio \. us / |
2019-07-13 14:23:22 -04:00
( ? : ( ? : www | no ) \. ) ? invidiou \. sh / |
( ? : ( ? : www | fi | de ) \. ) ? invidious \. snopyta \. org / |
2019-03-02 20:19:36 -05:00
( ? : www \. ) ? invidious \. kabi \. tk / |
2019-07-13 14:23:22 -04:00
( ? : www \. ) ? invidious \.13 ad \. de / |
2019-07-15 11:54:22 -04:00
( ? : www \. ) ? invidious \. mastodon \. host / |
2019-08-27 14:39:59 -04:00
( ? : www \. ) ? invidious \. nixnet \. xyz / |
2019-09-24 12:16:46 -04:00
( ? : www \. ) ? invidious \. drycat \. fr / |
2019-07-13 14:23:22 -04:00
( ? : www \. ) ? tube \. poal \. co / |
2019-03-02 20:19:36 -05:00
( ? : www \. ) ? vid \. wxzm \. sx / |
2020-05-13 18:54:42 -04:00
( ? : www \. ) ? yewtu \. be / |
2019-08-27 14:39:59 -04:00
( ? : www \. ) ? yt \. elukerio \. org / |
2019-10-04 07:52:15 -04:00
( ? : www \. ) ? yt \. lelux \. fi / |
2020-05-26 14:26:45 -04:00
( ? : www \. ) ? invidious \. ggc - project \. de / |
( ? : www \. ) ? yt \. maisputain \. ovh / |
( ? : www \. ) ? invidious \.13 ad \. de / |
( ? : www \. ) ? invidious \. toot \. koeln / |
( ? : www \. ) ? invidious \. fdn \. fr / |
( ? : www \. ) ? watch \. nettohikari \. com / |
2019-09-02 14:35:32 -04:00
( ? : www \. ) ? kgg2m7yk5aybusll \. onion / |
( ? : www \. ) ? qklhadlycap4cnod \. onion / |
( ? : www \. ) ? axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid \. onion / |
( ? : www \. ) ? c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid \. onion / |
( ? : www \. ) ? fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad \. onion / |
( ? : www \. ) ? invidious \. l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd \. onion / |
2019-09-24 15:43:34 -04:00
( ? : www \. ) ? owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya \. b32 \. i2p / |
2020-05-26 14:26:45 -04:00
( ? : www \. ) ? 4 l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd \. onion / |
2013-09-15 06:14:59 -04:00
youtube \. googleapis \. com / ) # the various hostnames, with wildcard subdomains
2013-06-23 13:58:33 -04:00
( ? : . * ? \#/)? # handle anchor (#/) redirect urls
( ? : # the various things that can precede the ID:
2014-09-24 04:34:29 -04:00
( ? : ( ? : v | embed | e ) / ( ? ! videoseries ) ) # v/ or embed/ or e/
2013-06-23 13:58:33 -04:00
| ( ? : # or the v= param in all its forms
2014-02-18 14:00:54 -05:00
( ? : ( ? : watch | movie ) ( ? : _popup ) ? ( ? : \. php ) ? / ? ) ? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
2013-06-23 13:58:33 -04:00
( ? : \? | \#!?) # the params delimiter ? or # or #!
2015-11-29 10:01:59 -05:00
( ? : . * ? [ & ; ] ) ? ? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
2013-06-23 13:58:33 -04:00
v =
)
2013-09-05 16:38:23 -04:00
) )
2015-08-16 16:04:13 -04:00
| ( ? :
youtu \. be | # just youtu.be/xxxx
2016-04-03 16:26:20 -04:00
vid \. plus | # or vid.plus/xxxx
zwearz \. com / watch | # or zwearz.com/watch/xxxx
2015-08-16 16:04:13 -04:00
) /
2014-09-11 15:47:25 -04:00
| ( ? : www \. ) ? cleanvideosearch \. com / media / action / yt / watch \? videoId =
2013-09-05 16:38:23 -04:00
)
2013-06-23 13:58:33 -04:00
) ? # all until now is optional -> you can pass the naked ID
2013-09-09 04:33:12 -04:00
( [ 0 - 9 A - Za - z_ - ] { 11 } ) # here is it! the YouTube video ID
2017-03-24 14:17:17 -04:00
( ? ! . * ? \blist =
( ? :
% ( playlist_id ) s | # combined list/video URLs are handled by the playlist IE
WL # WL are handled by the watch later IE
)
)
2013-06-23 13:58:33 -04:00
( ? ( 1 ) . + ) ? # if we found the ID, everything can follow
2017-03-24 14:17:17 -04:00
$ """ % { ' playlist_id ' : YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
2013-06-23 13:58:33 -04:00
_NEXT_URL_RE = r ' [ \ ?&]next_url=([^&]+) '
2020-05-01 20:18:08 -04:00
_PLAYER_INFO_RE = (
r ' /(?P<id>[a-zA-Z0-9_-] { 8,})/player_ias \ .vflset(?:/[a-zA-Z] { 2,3}_[a-zA-Z] { 2,3})?/base \ .(?P<ext>[a-z]+)$ ' ,
r ' \ b(?P<id>vfl[a-zA-Z0-9_-]+) \ b.*? \ .(?P<ext>[a-z]+)$ ' ,
)
2013-12-24 06:34:09 -05:00
_formats = {
2016-03-02 11:35:04 -05:00
' 5 ' : { ' ext ' : ' flv ' , ' width ' : 400 , ' height ' : 240 , ' acodec ' : ' mp3 ' , ' abr ' : 64 , ' vcodec ' : ' h263 ' } ,
' 6 ' : { ' ext ' : ' flv ' , ' width ' : 450 , ' height ' : 270 , ' acodec ' : ' mp3 ' , ' abr ' : 64 , ' vcodec ' : ' h263 ' } ,
' 13 ' : { ' ext ' : ' 3gp ' , ' acodec ' : ' aac ' , ' vcodec ' : ' mp4v ' } ,
' 17 ' : { ' ext ' : ' 3gp ' , ' width ' : 176 , ' height ' : 144 , ' acodec ' : ' aac ' , ' abr ' : 24 , ' vcodec ' : ' mp4v ' } ,
' 18 ' : { ' ext ' : ' mp4 ' , ' width ' : 640 , ' height ' : 360 , ' acodec ' : ' aac ' , ' abr ' : 96 , ' vcodec ' : ' h264 ' } ,
' 22 ' : { ' ext ' : ' mp4 ' , ' width ' : 1280 , ' height ' : 720 , ' acodec ' : ' aac ' , ' abr ' : 192 , ' vcodec ' : ' h264 ' } ,
' 34 ' : { ' ext ' : ' flv ' , ' width ' : 640 , ' height ' : 360 , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' } ,
' 35 ' : { ' ext ' : ' flv ' , ' width ' : 854 , ' height ' : 480 , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' } ,
2016-02-07 14:30:57 -05:00
# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
2016-03-02 11:35:04 -05:00
' 36 ' : { ' ext ' : ' 3gp ' , ' width ' : 320 , ' acodec ' : ' aac ' , ' vcodec ' : ' mp4v ' } ,
' 37 ' : { ' ext ' : ' mp4 ' , ' width ' : 1920 , ' height ' : 1080 , ' acodec ' : ' aac ' , ' abr ' : 192 , ' vcodec ' : ' h264 ' } ,
' 38 ' : { ' ext ' : ' mp4 ' , ' width ' : 4096 , ' height ' : 3072 , ' acodec ' : ' aac ' , ' abr ' : 192 , ' vcodec ' : ' h264 ' } ,
' 43 ' : { ' ext ' : ' webm ' , ' width ' : 640 , ' height ' : 360 , ' acodec ' : ' vorbis ' , ' abr ' : 128 , ' vcodec ' : ' vp8 ' } ,
' 44 ' : { ' ext ' : ' webm ' , ' width ' : 854 , ' height ' : 480 , ' acodec ' : ' vorbis ' , ' abr ' : 128 , ' vcodec ' : ' vp8 ' } ,
' 45 ' : { ' ext ' : ' webm ' , ' width ' : 1280 , ' height ' : 720 , ' acodec ' : ' vorbis ' , ' abr ' : 192 , ' vcodec ' : ' vp8 ' } ,
2016-01-02 22:11:19 -05:00
' 46 ' : { ' ext ' : ' webm ' , ' width ' : 1920 , ' height ' : 1080 , ' acodec ' : ' vorbis ' , ' abr ' : 192 , ' vcodec ' : ' vp8 ' } ,
2016-03-02 11:35:04 -05:00
' 59 ' : { ' ext ' : ' mp4 ' , ' width ' : 854 , ' height ' : 480 , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' } ,
' 78 ' : { ' ext ' : ' mp4 ' , ' width ' : 854 , ' height ' : 480 , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' } ,
2016-01-02 22:11:19 -05:00
# 3D videos
2016-03-02 11:35:04 -05:00
' 82 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' 3D ' , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' , ' preference ' : - 20 } ,
' 83 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' 3D ' , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' , ' preference ' : - 20 } ,
' 84 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' 3D ' , ' acodec ' : ' aac ' , ' abr ' : 192 , ' vcodec ' : ' h264 ' , ' preference ' : - 20 } ,
' 85 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' 3D ' , ' acodec ' : ' aac ' , ' abr ' : 192 , ' vcodec ' : ' h264 ' , ' preference ' : - 20 } ,
2016-01-02 22:11:19 -05:00
' 100 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' format_note ' : ' 3D ' , ' acodec ' : ' vorbis ' , ' abr ' : 128 , ' vcodec ' : ' vp8 ' , ' preference ' : - 20 } ,
' 101 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' 3D ' , ' acodec ' : ' vorbis ' , ' abr ' : 192 , ' vcodec ' : ' vp8 ' , ' preference ' : - 20 } ,
' 102 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' 3D ' , ' acodec ' : ' vorbis ' , ' abr ' : 192 , ' vcodec ' : ' vp8 ' , ' preference ' : - 20 } ,
2013-08-19 21:22:25 -04:00
2013-09-03 21:49:35 -04:00
# Apple HTTP Live Streaming
2016-03-17 07:25:37 -04:00
' 91 ' : { ' ext ' : ' mp4 ' , ' height ' : 144 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 48 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
2016-03-02 11:35:04 -05:00
' 92 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 48 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
' 93 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
' 94 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 128 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
' 95 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 256 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
' 96 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 256 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
2016-01-02 22:11:19 -05:00
' 132 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 48 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
' 151 ' : { ' ext ' : ' mp4 ' , ' height ' : 72 , ' format_note ' : ' HLS ' , ' acodec ' : ' aac ' , ' abr ' : 24 , ' vcodec ' : ' h264 ' , ' preference ' : - 10 } ,
2013-12-24 06:34:09 -05:00
# DASH mp4 video
2017-04-11 11:41:48 -04:00
' 133 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 134 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 135 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 136 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 137 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
2019-03-09 07:14:41 -05:00
' 138 ' : { ' ext ' : ' mp4 ' , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } , # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
2017-04-11 11:41:48 -04:00
' 160 ' : { ' ext ' : ' mp4 ' , ' height ' : 144 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 212 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 264 ' : { ' ext ' : ' mp4 ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
' 298 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' , ' fps ' : 60 } ,
' 299 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' , ' fps ' : 60 } ,
' 266 ' : { ' ext ' : ' mp4 ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' h264 ' } ,
2013-08-19 21:22:25 -04:00
2013-10-18 17:53:00 -04:00
# Dash mp4 audio
2017-04-11 11:41:48 -04:00
' 139 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' abr ' : 48 , ' container ' : ' m4a_dash ' } ,
' 140 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' abr ' : 128 , ' container ' : ' m4a_dash ' } ,
' 141 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' abr ' : 256 , ' container ' : ' m4a_dash ' } ,
' 256 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' container ' : ' m4a_dash ' } ,
' 258 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' container ' : ' m4a_dash ' } ,
' 325 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' dtse ' , ' container ' : ' m4a_dash ' } ,
' 328 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' ec-3 ' , ' container ' : ' m4a_dash ' } ,
2013-08-19 21:22:25 -04:00
# Dash webm
2017-04-11 11:41:48 -04:00
' 167 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' width ' : 640 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 168 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 169 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' width ' : 1280 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 170 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' width ' : 1920 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 218 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 219 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp8 ' } ,
' 278 ' : { ' ext ' : ' webm ' , ' height ' : 144 , ' format_note ' : ' DASH video ' , ' container ' : ' webm ' , ' vcodec ' : ' vp9 ' } ,
' 242 ' : { ' ext ' : ' webm ' , ' height ' : 240 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 243 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 244 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 245 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 246 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 247 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 248 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 271 ' : { ' ext ' : ' webm ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
2015-11-30 09:42:05 -05:00
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
2017-04-11 11:41:48 -04:00
' 272 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 302 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' , ' fps ' : 60 } ,
' 303 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' , ' fps ' : 60 } ,
' 308 ' : { ' ext ' : ' webm ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' , ' fps ' : 60 } ,
' 313 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' } ,
' 315 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' vcodec ' : ' vp9 ' , ' fps ' : 60 } ,
2013-12-24 06:34:09 -05:00
# Dash webm audio
2017-04-11 11:41:48 -04:00
' 171 ' : { ' ext ' : ' webm ' , ' acodec ' : ' vorbis ' , ' format_note ' : ' DASH audio ' , ' abr ' : 128 } ,
' 172 ' : { ' ext ' : ' webm ' , ' acodec ' : ' vorbis ' , ' format_note ' : ' DASH audio ' , ' abr ' : 256 } ,
2014-01-08 20:38:50 -05:00
2014-11-18 05:06:09 -05:00
# Dash webm audio with opus inside
2017-04-11 11:41:48 -04:00
' 249 ' : { ' ext ' : ' webm ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 50 } ,
' 250 ' : { ' ext ' : ' webm ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 70 } ,
' 251 ' : { ' ext ' : ' webm ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 160 } ,
2014-11-18 05:06:09 -05:00
2014-01-08 20:38:50 -05:00
# RTMP (unnamed)
' _rtmp ' : { ' protocol ' : ' rtmp ' } ,
2019-06-13 14:59:05 -04:00
# av01 video only formats sometimes served with "unknown" codecs
' 394 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 395 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 396 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
' 397 ' : { ' acodec ' : ' none ' , ' vcodec ' : ' av01.0.05M.08 ' } ,
2013-06-23 13:58:33 -04:00
}
2020-09-13 05:02:07 -04:00
# Tuple of subtitle format names.
# NOTE(review): presumably the formats offered for each subtitle track - confirm against the subtitle extraction code.
_SUBTITLE_FORMATS = ( ' json3 ' , ' srv1 ' , ' srv2 ' , ' srv3 ' , ' ttml ' , ' vtt ' )
2013-08-19 21:22:25 -04:00
2017-02-26 04:51:21 -05:00
# Geo-bypass flag, set to False for this extractor.
# NOTE(review): presumably read by the base extractor - confirm.
_GEO_BYPASS = False
2014-09-13 01:51:06 -04:00
# Name string of this extractor.
IE_NAME = ' youtube '
2013-06-27 13:13:11 -04:00
_TESTS = [
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9 ' ,
2014-09-24 03:49:53 -04:00
' info_dict ' : {
' id ' : ' BaW_jenozKc ' ,
' ext ' : ' mp4 ' ,
2020-09-02 16:37:35 -04:00
' title ' : ' youtube-dl test video " \' / \\ ä↭𝕐 ' ,
2014-09-24 03:49:53 -04:00
' uploader ' : ' Philipp Hagemeister ' ,
' uploader_id ' : ' phihag ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/phihag ' ,
2018-09-14 14:24:26 -04:00
' channel_id ' : ' UCLqxVugv74EIW3VWh2NOa3Q ' ,
' channel_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/channel/UCLqxVugv74EIW3VWh2NOa3Q ' ,
2014-09-24 03:49:53 -04:00
' upload_date ' : ' 20121002 ' ,
2020-09-02 16:37:35 -04:00
' description ' : ' test chars: " \' / \\ ä↭𝕐 \n test URL: https://github.com/rg3/youtube-dl/issues/1892 \n \n This is a test video for youtube-dl. \n \n For more information, contact phihag@phihag.de . ' ,
2014-09-24 03:49:53 -04:00
' categories ' : [ ' Science & Technology ' ] ,
2020-09-02 16:37:35 -04:00
' tags ' : [ ' youtube-dl ' ] ,
2017-01-26 09:43:14 -05:00
' duration ' : 10 ,
2018-11-02 19:26:16 -04:00
' view_count ' : int ,
2014-08-31 12:10:05 -04:00
' like_count ' : int ,
' dislike_count ' : int ,
2015-07-20 15:10:28 -04:00
' start_time ' : 1 ,
2015-07-23 07:20:21 -04:00
' end_time ' : 9 ,
2013-06-27 13:13:11 -04:00
}
2013-06-27 13:55:39 -04:00
} ,
2013-11-18 07:05:18 -05:00
{
2014-09-24 03:49:53 -04:00
' url ' : ' //www.YouTube.com/watch?v=yZIXLfi8CZQ ' ,
' note ' : ' Embed-only video (#1746) ' ,
' info_dict ' : {
' id ' : ' yZIXLfi8CZQ ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20120608 ' ,
' title ' : ' Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012 ' ,
' description ' : ' md5:09b78bd971f1e3e289601dfba15ca4f7 ' ,
' uploader ' : ' SET India ' ,
2015-11-23 10:35:23 -05:00
' uploader_id ' : ' setindia ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/setindia ' ,
2015-11-23 10:35:23 -05:00
' age_limit ' : 18 ,
2013-11-18 07:05:18 -05:00
}
} ,
2015-08-10 14:52:38 -04:00
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY ' ,
2015-08-10 14:52:38 -04:00
' note ' : ' Use the first video ID in the URL ' ,
' info_dict ' : {
' id ' : ' BaW_jenozKc ' ,
' ext ' : ' mp4 ' ,
2020-09-02 16:37:35 -04:00
' title ' : ' youtube-dl test video " \' / \\ ä↭𝕐 ' ,
2015-08-10 14:52:38 -04:00
' uploader ' : ' Philipp Hagemeister ' ,
' uploader_id ' : ' phihag ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/phihag ' ,
2015-08-10 14:52:38 -04:00
' upload_date ' : ' 20121002 ' ,
2020-09-02 16:37:35 -04:00
' description ' : ' test chars: " \' / \\ ä↭𝕐 \n test URL: https://github.com/rg3/youtube-dl/issues/1892 \n \n This is a test video for youtube-dl. \n \n For more information, contact phihag@phihag.de . ' ,
2015-08-10 14:52:38 -04:00
' categories ' : [ ' Science & Technology ' ] ,
2020-09-02 16:37:35 -04:00
' tags ' : [ ' youtube-dl ' ] ,
2017-01-26 09:43:14 -05:00
' duration ' : 10 ,
2018-11-02 19:26:16 -04:00
' view_count ' : int ,
2015-08-10 14:52:38 -04:00
' like_count ' : int ,
' dislike_count ' : int ,
2015-08-10 15:22:06 -04:00
} ,
' params ' : {
' skip_download ' : True ,
} ,
2015-08-10 14:52:38 -04:00
} ,
2014-01-18 23:47:20 -05:00
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://www.youtube.com/watch?v=a9LDPn-MO4I ' ,
2014-09-24 03:49:53 -04:00
' note ' : ' 256k DASH audio (format 141) via DASH manifest ' ,
' info_dict ' : {
' id ' : ' a9LDPn-MO4I ' ,
' ext ' : ' m4a ' ,
' upload_date ' : ' 20121002 ' ,
' uploader_id ' : ' 8KVIDEO ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/8KVIDEO ' ,
2014-09-24 03:49:53 -04:00
' description ' : ' ' ,
' uploader ' : ' 8KVIDEO ' ,
' title ' : ' UHDTV TEST 8K VIDEO.mp4 '
2014-01-22 15:56:37 -05:00
} ,
2014-09-24 03:49:53 -04:00
' params ' : {
' youtube_include_dash_manifest ' : True ,
' format ' : ' 141 ' ,
2014-01-22 15:56:37 -05:00
} ,
2016-06-24 11:27:55 -04:00
' skip ' : ' format 141 not served anymore ' ,
2014-01-18 23:47:20 -05:00
} ,
2014-11-23 03:59:02 -05:00
# Controversy video
{
' url ' : ' https://www.youtube.com/watch?v=T4XJQO3qol8 ' ,
' info_dict ' : {
' id ' : ' T4XJQO3qol8 ' ,
' ext ' : ' mp4 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 219 ,
2014-11-23 03:59:02 -05:00
' upload_date ' : ' 20100909 ' ,
2019-01-15 14:18:27 -05:00
' uploader ' : ' Amazing Atheist ' ,
2014-11-23 03:59:02 -05:00
' uploader_id ' : ' TheAmazingAtheist ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/TheAmazingAtheist ' ,
2014-11-23 03:59:02 -05:00
' title ' : ' Burning Everyone \' s Koran ' ,
' description ' : ' SUBSCRIBE: http://www.youtube.com/saturninefilms \n \n Even Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html ' ,
}
2014-11-30 15:45:49 -05:00
} ,
2020-10-23 23:09:22 -04:00
# Normal age-gate video (embed allowed)
2014-11-30 15:45:49 -05:00
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://youtube.com/watch?v=HtVdAasjOgU ' ,
2014-11-30 15:45:49 -05:00
' info_dict ' : {
' id ' : ' HtVdAasjOgU ' ,
' ext ' : ' mp4 ' ,
' title ' : ' The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer ' ,
2017-01-02 07:08:07 -05:00
' description ' : r ' re:(?s). { 100,}About the Game \ n.*?The Witcher 3: Wild Hunt. { 100,} ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 142 ,
2014-11-30 15:45:49 -05:00
' uploader ' : ' The Witcher ' ,
' uploader_id ' : ' WitcherGame ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/WitcherGame ' ,
2014-11-30 15:45:49 -05:00
' upload_date ' : ' 20140605 ' ,
2015-08-10 15:24:53 -04:00
' age_limit ' : 18 ,
2014-11-30 15:45:49 -05:00
} ,
} ,
2019-03-09 07:14:41 -05:00
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
2014-12-11 10:28:07 -05:00
{
' url ' : ' lqQg6PlCWgI ' ,
' info_dict ' : {
' id ' : ' lqQg6PlCWgI ' ,
' ext ' : ' mp4 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 6085 ,
2015-11-23 10:37:21 -05:00
' upload_date ' : ' 20150827 ' ,
2014-12-11 10:34:37 -05:00
' uploader_id ' : ' olympic ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/olympic ' ,
2014-12-11 10:34:37 -05:00
' description ' : ' HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games ' ,
2016-06-24 11:47:19 -04:00
' uploader ' : ' Olympic ' ,
2014-12-11 10:34:37 -05:00
' title ' : ' Hockey - Women - GER-AUS - London 2012 Olympic Games ' ,
} ,
' params ' : {
' skip_download ' : ' requires avconv ' ,
2014-12-11 10:28:07 -05:00
}
2014-12-11 10:34:37 -05:00
} ,
2015-01-09 23:45:51 -05:00
# Non-square pixels
{
' url ' : ' https://www.youtube.com/watch?v=_b-2C3KPAM0 ' ,
' info_dict ' : {
' id ' : ' _b-2C3KPAM0 ' ,
' ext ' : ' mp4 ' ,
' stretched_ratio ' : 16 / 9. ,
2017-01-26 09:43:14 -05:00
' duration ' : 85 ,
2015-01-09 23:45:51 -05:00
' upload_date ' : ' 20110310 ' ,
' uploader_id ' : ' AllenMeow ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/AllenMeow ' ,
2015-01-09 23:45:51 -05:00
' description ' : ' made by Wacom from Korea | 字幕&加油添醋 by TY \' s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯 ' ,
2018-06-02 15:23:45 -04:00
' uploader ' : ' 孫ᄋᄅ ' ,
2015-01-09 23:45:51 -05:00
' title ' : ' [A-made] 變態妍字幕版 太妍 我就是這樣的人 ' ,
} ,
2015-04-05 14:35:55 -04:00
} ,
# url_encoded_fmt_stream_map is empty string
{
' url ' : ' qEJwOuvDf7I ' ,
' info_dict ' : {
' id ' : ' qEJwOuvDf7I ' ,
2015-08-12 11:27:58 -04:00
' ext ' : ' webm ' ,
2015-04-05 14:35:55 -04:00
' title ' : ' Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге ' ,
' description ' : ' ' ,
' upload_date ' : ' 20150404 ' ,
' uploader_id ' : ' spbelect ' ,
' uploader ' : ' Наблюдатели Петербурга ' ,
} ,
' params ' : {
' skip_download ' : ' requires avconv ' ,
2016-01-19 09:56:04 -05:00
} ,
' skip ' : ' This live event has ended. ' ,
2015-04-05 14:35:55 -04:00
} ,
2019-03-09 07:14:41 -05:00
# Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
2015-06-27 04:55:46 -04:00
{
' url ' : ' https://www.youtube.com/watch?v=FIl7x6_3R5Y ' ,
' info_dict ' : {
' id ' : ' FIl7x6_3R5Y ' ,
2018-06-02 15:23:45 -04:00
' ext ' : ' webm ' ,
2015-06-27 04:55:46 -04:00
' title ' : ' md5:7b81415841e02ecd4313668cde88737a ' ,
' description ' : ' md5:116377fd2963b81ec4ce64b542173306 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 220 ,
2015-06-27 04:55:46 -04:00
' upload_date ' : ' 20150625 ' ,
' uploader_id ' : ' dorappi2000 ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/dorappi2000 ' ,
2015-06-27 04:55:46 -04:00
' uploader ' : ' dorappi2000 ' ,
2018-06-02 15:23:45 -04:00
' formats ' : ' mincount:31 ' ,
2015-06-27 04:55:46 -04:00
} ,
2018-06-02 15:23:45 -04:00
' skip ' : ' not actual anymore ' ,
2015-07-20 13:34:24 -04:00
} ,
2015-06-10 02:47:02 -04:00
# DASH manifest with segment_list
{
' url ' : ' https://www.youtube.com/embed/CsmdDsKjzN8 ' ,
' md5 ' : ' 8ce563a1d667b599d21064e982ab9e31 ' ,
' info_dict ' : {
' id ' : ' CsmdDsKjzN8 ' ,
' ext ' : ' mp4 ' ,
2015-07-20 13:48:50 -04:00
' upload_date ' : ' 20150501 ' , # According to '<meta itemprop="datePublished"', but in other places it's 20150510
2015-06-10 02:47:02 -04:00
' uploader ' : ' Airtek ' ,
' description ' : ' Retransmisión en directo de la XVIII media maratón de Zaragoza. ' ,
' uploader_id ' : ' UCzTzUmjXxxacNnL8I3m4LnQ ' ,
' title ' : ' Retransmisión XVIII Media maratón Zaragoza 2015 ' ,
} ,
' params ' : {
' youtube_include_dash_manifest ' : True ,
' format ' : ' 135 ' , # bestvideo
2016-06-24 11:47:19 -04:00
} ,
' skip ' : ' This live event has ended. ' ,
2015-07-20 13:34:24 -04:00
} ,
2015-07-25 11:30:34 -04:00
{
# Multifeed videos (multiple cameras), URL is for Main Camera
' url ' : ' https://www.youtube.com/watch?v=jqWvoWXjCVs ' ,
' info_dict ' : {
' id ' : ' jqWvoWXjCVs ' ,
' title ' : ' teamPGP: Rocket League Noob Stream ' ,
' description ' : ' md5:dc7872fb300e143831327f1bae3af010 ' ,
} ,
' playlist ' : [ {
' info_dict ' : {
' id ' : ' jqWvoWXjCVs ' ,
' ext ' : ' mp4 ' ,
' title ' : ' teamPGP: Rocket League Noob Stream (Main Camera) ' ,
' description ' : ' md5:dc7872fb300e143831327f1bae3af010 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 7335 ,
2015-07-25 11:30:34 -04:00
' upload_date ' : ' 20150721 ' ,
' uploader ' : ' Beer Games Beer ' ,
' uploader_id ' : ' beergamesbeer ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/beergamesbeer ' ,
2016-03-02 12:07:25 -05:00
' license ' : ' Standard YouTube License ' ,
2015-07-25 11:30:34 -04:00
} ,
} , {
' info_dict ' : {
' id ' : ' 6h8e8xoXJzg ' ,
' ext ' : ' mp4 ' ,
' title ' : ' teamPGP: Rocket League Noob Stream (kreestuh) ' ,
' description ' : ' md5:dc7872fb300e143831327f1bae3af010 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 7337 ,
2015-07-25 11:30:34 -04:00
' upload_date ' : ' 20150721 ' ,
' uploader ' : ' Beer Games Beer ' ,
' uploader_id ' : ' beergamesbeer ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/beergamesbeer ' ,
2016-03-02 12:07:25 -05:00
' license ' : ' Standard YouTube License ' ,
2015-07-25 11:30:34 -04:00
} ,
} , {
' info_dict ' : {
' id ' : ' PUOgX5z9xZw ' ,
' ext ' : ' mp4 ' ,
' title ' : ' teamPGP: Rocket League Noob Stream (grizzle) ' ,
' description ' : ' md5:dc7872fb300e143831327f1bae3af010 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 7337 ,
2015-07-25 11:30:34 -04:00
' upload_date ' : ' 20150721 ' ,
' uploader ' : ' Beer Games Beer ' ,
' uploader_id ' : ' beergamesbeer ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/beergamesbeer ' ,
2016-03-02 12:07:25 -05:00
' license ' : ' Standard YouTube License ' ,
2015-07-25 11:30:34 -04:00
} ,
} , {
' info_dict ' : {
' id ' : ' teuwxikvS5k ' ,
' ext ' : ' mp4 ' ,
' title ' : ' teamPGP: Rocket League Noob Stream (zim) ' ,
' description ' : ' md5:dc7872fb300e143831327f1bae3af010 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 7334 ,
2015-07-25 11:30:34 -04:00
' upload_date ' : ' 20150721 ' ,
' uploader ' : ' Beer Games Beer ' ,
' uploader_id ' : ' beergamesbeer ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/beergamesbeer ' ,
2016-03-02 12:07:25 -05:00
' license ' : ' Standard YouTube License ' ,
2015-07-25 11:30:34 -04:00
} ,
} ] ,
' params ' : {
' skip_download ' : True ,
} ,
2019-01-15 14:18:27 -05:00
' skip ' : ' This video is not available. ' ,
2015-08-16 16:04:13 -04:00
} ,
2016-02-12 18:18:58 -05:00
{
2019-03-09 07:14:41 -05:00
# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
2016-02-12 18:18:58 -05:00
' url ' : ' https://www.youtube.com/watch?v=gVfLd0zydlo ' ,
' info_dict ' : {
' id ' : ' gVfLd0zydlo ' ,
' title ' : ' DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30 ' ,
} ,
' playlist_count ' : 2 ,
2016-06-24 11:47:19 -04:00
' skip ' : ' Not multifeed anymore ' ,
2016-02-12 18:18:58 -05:00
} ,
2015-08-16 16:04:13 -04:00
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://vid.plus/FlRa-iH7PGw ' ,
2015-08-16 16:04:13 -04:00
' only_matching ' : True ,
2015-11-22 07:49:33 -05:00
} ,
2016-04-03 16:26:20 -04:00
{
2016-09-17 10:48:20 -04:00
' url ' : ' https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html ' ,
2016-04-03 16:26:20 -04:00
' only_matching ' : True ,
} ,
2015-11-22 07:49:33 -05:00
{
2019-03-09 07:14:41 -05:00
# Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
2016-01-18 12:19:38 -05:00
# Also tests cut-off URL expansion in video description (see
2019-03-09 07:14:41 -05:00
# https://github.com/ytdl-org/youtube-dl/issues/1892,
# https://github.com/ytdl-org/youtube-dl/issues/8164)
2015-11-22 07:49:33 -05:00
' url ' : ' https://www.youtube.com/watch?v=lsguqyKfVQg ' ,
' info_dict ' : {
' id ' : ' lsguqyKfVQg ' ,
' ext ' : ' mp4 ' ,
' title ' : ' { dark walk}; Loki/AC/Dishonored; collab w/Elflover21 ' ,
2018-06-02 15:23:45 -04:00
' alt_title ' : ' Dark Walk - Position Music ' ,
2015-11-22 07:49:33 -05:00
' description ' : ' md5:8085699c11dc3f597ce0410b0dcbb34a ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 133 ,
2015-11-22 07:49:33 -05:00
' upload_date ' : ' 20151119 ' ,
' uploader_id ' : ' IronSoulElf ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/IronSoulElf ' ,
2015-11-22 07:49:33 -05:00
' uploader ' : ' IronSoulElf ' ,
2018-06-02 15:23:45 -04:00
' creator ' : ' Todd Haberman, Daniel Law Heath and Aaron Kaplan ' ,
' track ' : ' Dark Walk - Position Music ' ,
' artist ' : ' Todd Haberman, Daniel Law Heath and Aaron Kaplan ' ,
2019-04-28 12:37:46 -04:00
' album ' : ' Position Music - Production Music Vol. 143 - Dark Walk ' ,
2015-11-22 07:49:33 -05:00
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2015-11-23 10:02:37 -05:00
{
2019-03-09 07:14:41 -05:00
# Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
2015-11-23 10:02:37 -05:00
' url ' : ' https://www.youtube.com/watch?v=Ms7iBXnlUO8 ' ,
' only_matching ' : True ,
} ,
2015-11-27 19:07:07 -05:00
{
# Video with yt:stretch=17:0
' url ' : ' https://www.youtube.com/watch?v=Q39EVAstoRM ' ,
' info_dict ' : {
' id ' : ' Q39EVAstoRM ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Clash Of Clans#14 Dicas De Ataque Para CV 4 ' ,
' description ' : ' md5:ee18a25c350637c8faff806845bddee9 ' ,
' upload_date ' : ' 20151107 ' ,
' uploader_id ' : ' UCCr7TALkRbo3EtFzETQF1LA ' ,
' uploader ' : ' CH GAMER DROID ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
2016-06-24 11:47:19 -04:00
' skip ' : ' This video does not exist. ' ,
2015-11-27 19:07:07 -05:00
} ,
2016-03-02 12:07:25 -05:00
{
# Video licensed under Creative Commons
' url ' : ' https://www.youtube.com/watch?v=M4gD1WSo5mA ' ,
' info_dict ' : {
' id ' : ' M4gD1WSo5mA ' ,
' ext ' : ' mp4 ' ,
' title ' : ' md5:e41008789470fc2533a3252216f1c1d1 ' ,
' description ' : ' md5:a677553cf0840649b731a3024aeff4cc ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 721 ,
2016-03-02 12:07:25 -05:00
' upload_date ' : ' 20150127 ' ,
' uploader_id ' : ' BerkmanCenter ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/BerkmanCenter ' ,
2017-01-26 09:43:14 -05:00
' uploader ' : ' The Berkman Klein Center for Internet & Society ' ,
2016-03-02 12:07:25 -05:00
' license ' : ' Creative Commons Attribution license (reuse allowed) ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2016-03-02 12:49:10 -05:00
{
# Channel-like uploader_url
' url ' : ' https://www.youtube.com/watch?v=eQcmzGIKrzg ' ,
' info_dict ' : {
' id ' : ' eQcmzGIKrzg ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Democratic Socialism and Foreign Policy | Bernie Sanders ' ,
' description ' : ' md5:dda0d780d5a6e120758d1711d062a867 ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 4060 ,
2016-03-02 12:49:10 -05:00
' upload_date ' : ' 20151119 ' ,
2018-06-02 15:23:45 -04:00
' uploader ' : ' Bernie Sanders ' ,
2016-03-02 12:49:10 -05:00
' uploader_id ' : ' UCH1dpzjCEiGAt8CXkryhkZg ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/channel/UCH1dpzjCEiGAt8CXkryhkZg ' ,
2016-03-02 12:49:10 -05:00
' license ' : ' Creative Commons Attribution license (reuse allowed) ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2015-11-29 10:01:59 -05:00
{
' url ' : ' https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY ' ,
' only_matching ' : True ,
2016-07-11 15:10:35 -04:00
} ,
{
2019-03-09 07:14:41 -05:00
# YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
2016-07-11 15:10:35 -04:00
' url ' : ' https://www.youtube.com/watch?v=i1Ko8UG-Tdo ' ,
' only_matching ' : True ,
2016-09-02 14:17:15 -04:00
} ,
{
# Rental video preview
' url ' : ' https://www.youtube.com/watch?v=yYr8q0y5Jfg ' ,
' info_dict ' : {
' id ' : ' uGpuVWrhIzE ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Piku - Trailer ' ,
' description ' : ' md5:c36bd60c3fd6f1954086c083c72092eb ' ,
' upload_date ' : ' 20150811 ' ,
' uploader ' : ' FlixMatrix ' ,
' uploader_id ' : ' FlixMatrixKaravan ' ,
2017-01-02 07:08:07 -05:00
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/FlixMatrixKaravan ' ,
2016-09-02 14:17:15 -04:00
' license ' : ' Standard YouTube License ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
2018-06-02 15:23:45 -04:00
' skip ' : ' This video is not available. ' ,
2017-01-09 10:30:46 -05:00
} ,
2017-01-21 06:10:32 -05:00
{
# YouTube Red video with episode data
' url ' : ' https://www.youtube.com/watch?v=iqKdEhx-dD4 ' ,
' info_dict ' : {
' id ' : ' iqKdEhx-dD4 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Isolation - Mind Field (Ep 1) ' ,
2019-01-15 14:18:27 -05:00
' description ' : ' md5:46a29be4ceffa65b92d277b93f463c0f ' ,
2017-01-26 09:43:14 -05:00
' duration ' : 2085 ,
2017-01-21 06:10:32 -05:00
' upload_date ' : ' 20170118 ' ,
' uploader ' : ' Vsauce ' ,
' uploader_id ' : ' Vsauce ' ,
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/Vsauce ' ,
' series ' : ' Mind Field ' ,
' season_number ' : 1 ,
' episode_number ' : 1 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
' expected_warnings ' : [
' Skipping DASH manifest ' ,
] ,
} ,
2017-08-26 04:38:38 -04:00
{
# The following content has been identified by the YouTube community
# as inappropriate or offensive to some audiences.
' url ' : ' https://www.youtube.com/watch?v=6SJNVb0GnPI ' ,
' info_dict ' : {
' id ' : ' 6SJNVb0GnPI ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Race Differences in Intelligence ' ,
' description ' : ' md5:5d161533167390427a1f8ee89a1fc6f1 ' ,
' duration ' : 965 ,
' upload_date ' : ' 20140124 ' ,
' uploader ' : ' New Century Foundation ' ,
' uploader_id ' : ' UCEJYpZGqgUob0zVVEaLhvVg ' ,
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/channel/UCEJYpZGqgUob0zVVEaLhvVg ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2017-01-09 10:30:46 -05:00
{
# itag 212
' url ' : ' 1t24XAntNCY ' ,
' only_matching ' : True ,
2017-02-26 04:51:21 -05:00
} ,
{
# geo restricted to JP
' url ' : ' sJL6WA-aGkQ ' ,
' only_matching ' : True ,
} ,
2017-03-24 14:17:17 -04:00
{
' url ' : ' https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM ' ,
' only_matching ' : True ,
} ,
2018-09-23 13:14:49 -04:00
{
' url ' : ' https://invidio.us/watch?v=BaW_jenozKc ' ,
' only_matching ' : True ,
} ,
2018-12-26 03:30:48 -05:00
{
# DRM protected
' url ' : ' https://www.youtube.com/watch?v=s7_qI6_mIXc ' ,
' only_matching ' : True ,
2019-01-15 14:18:27 -05:00
} ,
{
# Video with unsupported adaptive stream type formats
' url ' : ' https://www.youtube.com/watch?v=Z4Vy8R84T1U ' ,
' info_dict ' : {
' id ' : ' Z4Vy8R84T1U ' ,
' ext ' : ' mp4 ' ,
' title ' : ' saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta ' ,
' description ' : ' md5:d41d8cd98f00b204e9800998ecf8427e ' ,
' duration ' : 433 ,
' upload_date ' : ' 20130923 ' ,
' uploader ' : ' Amelia Putri Harwita ' ,
' uploader_id ' : ' UCpOxM49HJxmC1qCalXyB3_Q ' ,
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/channel/UCpOxM49HJxmC1qCalXyB3_Q ' ,
' formats ' : ' maxcount:10 ' ,
} ,
' params ' : {
' skip_download ' : True ,
' youtube_include_dash_manifest ' : False ,
} ,
2020-03-05 12:05:50 -05:00
' skip ' : ' not actual anymore ' ,
2019-04-22 00:26:48 -04:00
} ,
{
2019-04-27 04:16:17 -04:00
# Youtube Music Auto-generated description
2019-04-22 00:26:48 -04:00
' url ' : ' https://music.youtube.com/watch?v=MgNrAu2pzNs ' ,
' info_dict ' : {
' id ' : ' MgNrAu2pzNs ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Voyeur Girl ' ,
' description ' : ' md5:7ae382a65843d6df2685993e90a8628f ' ,
' upload_date ' : ' 20190312 ' ,
2020-03-05 12:05:50 -05:00
' uploader ' : ' Stephen - Topic ' ,
' uploader_id ' : ' UC-pWHpBjdGG69N9mM2auIAA ' ,
2019-04-22 00:26:48 -04:00
' artist ' : ' Stephen ' ,
' track ' : ' Voyeur Girl ' ,
' album ' : ' it \' s too much love to know my dear ' ,
' release_date ' : ' 20190313 ' ,
' release_year ' : 2019 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
{
2019-04-27 04:16:17 -04:00
# Youtube Music Auto-generated description
2019-04-22 00:26:48 -04:00
# Retrieve 'artist' field from 'Artist:' in video description
# when it is present on youtube music video
' url ' : ' https://www.youtube.com/watch?v=k0jLE7tTwjY ' ,
' info_dict ' : {
' id ' : ' k0jLE7tTwjY ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Latch Feat. Sam Smith ' ,
' description ' : ' md5:3cb1e8101a7c85fcba9b4fb41b951335 ' ,
' upload_date ' : ' 20150110 ' ,
' uploader ' : ' Various Artists - Topic ' ,
' uploader_id ' : ' UCNkEcmYdjrH4RqtNgh7BZ9w ' ,
' artist ' : ' Disclosure ' ,
' track ' : ' Latch Feat. Sam Smith ' ,
' album ' : ' Latch Featuring Sam Smith ' ,
' release_date ' : ' 20121008 ' ,
' release_year ' : 2012 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
{
2019-04-27 04:16:17 -04:00
# Youtube Music Auto-generated description
2019-04-22 00:26:48 -04:00
# handle multiple artists on youtube music video
' url ' : ' https://www.youtube.com/watch?v=74qn0eJSjpA ' ,
' info_dict ' : {
' id ' : ' 74qn0eJSjpA ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Eastside ' ,
' description ' : ' md5:290516bb73dcbfab0dcc4efe6c3de5f2 ' ,
' upload_date ' : ' 20180710 ' ,
' uploader ' : ' Benny Blanco - Topic ' ,
' uploader_id ' : ' UCzqz_ksRu_WkIzmivMdIS7A ' ,
' artist ' : ' benny blanco, Halsey, Khalid ' ,
' track ' : ' Eastside ' ,
' album ' : ' Eastside ' ,
' release_date ' : ' 20180713 ' ,
' release_year ' : 2018 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
{
2019-04-27 04:16:17 -04:00
# Youtube Music Auto-generated description
2019-04-22 00:26:48 -04:00
# handle youtube music video with release_year and no release_date
' url ' : ' https://www.youtube.com/watch?v=-hcAI0g-f5M ' ,
' info_dict ' : {
' id ' : ' -hcAI0g-f5M ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Put It On Me ' ,
2020-03-05 12:05:50 -05:00
' description ' : ' md5:f6422397c07c4c907c6638e1fee380a5 ' ,
2019-04-22 00:26:48 -04:00
' upload_date ' : ' 20180426 ' ,
' uploader ' : ' Matt Maeson - Topic ' ,
' uploader_id ' : ' UCnEkIGqtGcQMLk73Kp-Q5LQ ' ,
' artist ' : ' Matt Maeson ' ,
' track ' : ' Put It On Me ' ,
' album ' : ' The Hearse ' ,
' release_date ' : None ,
' release_year ' : 2018 ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2019-11-30 11:51:34 -05:00
{
' url ' : ' https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q ' ,
' only_matching ' : True ,
} ,
2020-04-30 13:40:38 -04:00
{
# invalid -> valid video id redirection
' url ' : ' DJztXj2GPfl ' ,
' info_dict ' : {
' id ' : ' DJztXj2GPfk ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack) ' ,
' description ' : ' md5:bf577a41da97918e94fa9798d9228825 ' ,
' upload_date ' : ' 20090125 ' ,
' uploader ' : ' Prochorowka ' ,
' uploader_id ' : ' Prochorowka ' ,
' uploader_url ' : r ' re:https?://(?:www \ .)?youtube \ .com/user/Prochorowka ' ,
' artist ' : ' Panjabi MC ' ,
' track ' : ' Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix ' ,
' album ' : ' Beware of the Boys (Mundian To Bach Ke) ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
2020-09-13 10:23:21 -04:00
} ,
{
# empty description results in an empty string
' url ' : ' https://www.youtube.com/watch?v=x41yOUIvK2k ' ,
' info_dict ' : {
' id ' : ' x41yOUIvK2k ' ,
' ext ' : ' mp4 ' ,
' title ' : ' IMG 3456 ' ,
' description ' : ' ' ,
' upload_date ' : ' 20170613 ' ,
' uploader_id ' : ' ElevageOrVert ' ,
' uploader ' : ' ElevageOrVert ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
2013-06-27 13:13:11 -04:00
]
2013-09-21 08:19:30 -04:00
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty player cache.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Per-instance dict, initially empty.
    self._player_cache = dict()
2013-09-21 08:19:30 -04:00
2013-06-23 13:58:33 -04:00
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage for *video_id* is being fetched."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
2013-06-23 13:58:33 -04:00
def report_information_extraction(self, video_id):
    """Announce that information for *video_id* is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
2013-06-23 13:58:33 -04:00
def report_unavailable_format(self, video_id, format):
    """Announce that the requested *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
2013-06-23 13:58:33 -04:00
def report_rtmp_download(self):
    """Announce that an RTMP download has been detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
2013-06-23 13:58:33 -04:00
2014-08-02 06:21:53 -04:00
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of *example_sig*.

    The lengths are rendered as text and joined with dots again, so a
    signature shaped like 'abc.de' yields '3.2'.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
2014-08-02 06:21:53 -04:00
2020-05-01 20:18:08 -04:00
@classmethod
def _extract_player_info(cls, player_url):
    """Return the ``(ext, id)`` pair identified in *player_url*.

    Each pattern in ``cls._PLAYER_INFO_RE`` is tried in order and the
    named groups 'ext' and 'id' of the first one that matches are returned.

    Raises ExtractorError when no pattern matches.
    """
    for pattern in cls._PLAYER_INFO_RE:
        match = re.search(pattern, player_url)
        if match:
            return match.group('ext'), match.group('id')
    raise ExtractorError('Cannot identify player %r' % player_url)
def _extract_signature_function ( self , video_id , player_url , example_sig ) :
player_type , player_id = self . _extract_player_info ( player_url )
2013-09-21 08:19:30 -04:00
2013-09-21 18:35:03 -04:00
# Read from filesystem cache
2014-08-02 06:21:53 -04:00
func_id = ' %s _ %s _ %s ' % (
player_type , player_id , self . _signature_cache_id ( example_sig ) )
2013-09-21 18:35:03 -04:00
assert os . path . basename ( func_id ) == func_id
2014-09-03 06:41:05 -04:00
2014-09-24 03:51:45 -04:00
cache_spec = self . _downloader . cache . load ( ' youtube-sigfuncs ' , func_id )
2014-09-03 06:41:05 -04:00
if cache_spec is not None :
2014-09-13 01:51:06 -04:00
return lambda s : ' ' . join ( s [ i ] for i in cache_spec )
2013-09-21 09:19:48 -04:00
2015-02-18 04:39:14 -05:00
download_note = (
' Downloading player %s ' % player_url
if self . _downloader . params . get ( ' verbose ' ) else
' Downloading %s player %s ' % ( player_type , player_id )
)
2013-09-21 08:19:30 -04:00
if player_type == ' js ' :
code = self . _download_webpage (
player_url , video_id ,
2015-02-18 04:39:14 -05:00
note = download_note ,
2014-09-24 03:51:45 -04:00
errnote = ' Download of %s failed ' % player_url )
2013-09-21 09:19:48 -04:00
res = self . _parse_sig_js ( code )
2013-09-21 18:35:03 -04:00
elif player_type == ' swf ' :
2013-09-21 08:19:30 -04:00
urlh = self . _request_webpage (
player_url , video_id ,
2015-02-18 04:39:14 -05:00
note = download_note ,
2014-09-24 03:51:45 -04:00
errnote = ' Download of %s failed ' % player_url )
2013-09-21 08:19:30 -04:00
code = urlh . read ( )
2013-09-21 09:19:48 -04:00
res = self . _parse_sig_swf ( code )
2013-09-21 08:19:30 -04:00
else :
assert False , ' Invalid player type %r ' % player_type
2015-02-18 04:42:23 -05:00
test_string = ' ' . join ( map ( compat_chr , range ( len ( example_sig ) ) ) )
cache_res = res ( test_string )
cache_spec = [ ord ( c ) for c in cache_res ]
2013-09-21 09:19:48 -04:00
2014-09-24 03:51:45 -04:00
self . _downloader . cache . store ( ' youtube-sigfuncs ' , func_id , cache_spec )
2013-09-21 09:19:48 -04:00
return res
2014-08-02 06:21:53 -04:00
def _print_sig_code(self, func, example_sig):
    """Print Python source that reproduces the signature function *func*.

    *func* is applied to a probe string whose characters have the code
    points 0..len(example_sig)-1, so the code points of the result are the
    source indices it picks. Those indices are rendered as a sum of
    ``s[i]`` items and ``s[a:b:c]`` slices, guarded by a check on the
    lengths of the dot-separated parts of the signature, and the text is
    sent to ``self.to_screen``.
    """
    def gen_sig_code(idxs):
        # Yield index/slice expressions on `s` that together select idxs.
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # The pairwise loop below never runs for fewer than two indices,
        # which used to leave `i` unbound; handle those inputs up front.
        if not idxs:
            yield "''"
            return
        if len(idxs) == 1:
            yield 's[%d]' % idxs[0]
            return

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The current run ended at prev; emit it as one slice.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # Start of an ascending or descending run.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
2013-09-22 04:30:02 -04:00
2013-09-21 08:19:30 -04:00
def _parse_sig_js(self, jscode):
    """Return a callable that runs the signature function found in *jscode*.

    The function name is located with the first matching pattern below
    (named group ``sig``); the function itself is then extracted with
    ``JSInterpreter`` and wrapped so that it takes the signature string as
    its single argument.
    """
    name_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        # NOTE: this last entry repeats the previous pattern verbatim.
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        name_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    interpreter = JSInterpreter(jscode)
    sig_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return sig_function([s])

    return decipher
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping ``SignatureDecipher.decipher`` from a SWF.

    *file_contents* is handed to ``SWFInterpreter``; the returned callable
    takes the signature string as its single argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_decipher = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return swf_decipher([s])

    return decipher
2013-09-21 09:19:48 -04:00
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted signature *s* into a working signature.

    *player_url* is normalised to an absolute URL, the matching signature
    function is fetched from (or added to) ``self._player_cache`` and then
    applied to *s*.  *age_gate* is accepted but not used here.

    Raises ExtractorError when *player_url* is None, or wrapping any
    exception raised while obtaining or running the signature function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        # Protocol-relative URL.
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        # Path-only URL: resolve against the YouTube origin.
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        # Cached per player URL and signature cache id.
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s
            )
        sig_func = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as exc:
        trace = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + trace, cause=exc)
2013-09-21 08:19:30 -04:00
2020-08-04 21:04:36 -04:00
def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext listing endpoint; for
    each language one entry per ``self._SUBTITLE_FORMATS`` extension is
    produced.  When *has_live_chat_replay* is true a ``live_chat``
    pseudo-language is added.  *webpage* is not used.  Returns ``{}`` (after
    a warning) when the listing cannot be downloaded or nothing was found.
    """
    listing_url = (
        'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    try:
        listing = self._download_xml(listing_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in listing.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Only the first track of each language is kept.
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if has_live_chat_replay:
        live_chat_entry = {
            'video_id': video_id,
            'ext': 'json',
            'protocol': 'youtube_live_chat_replay',
        }
        subtitles['live_chat'] = [live_chat_entry]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
2015-11-23 10:00:06 -05:00
def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` object from *webpage*, or None.

    Parsing is non-fatal, so None is also returned when the JSON cannot be
    decoded.
    """
    # User data may contain arbitrary character sequences that can confuse
    # regex-based JSON extraction, e.g. when '};' is contained the second
    # regex won't capture the whole JSON.  As a workaround the more concrete
    # regex is tried first; proper quoted-string handling is meant to
    # replace this eventually (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)
2015-11-22 07:49:33 -05:00
2020-08-04 20:30:10 -04:00
def _get_yt_initial_data(self, video_id, webpage):
    """Return the parsed ``ytInitialData`` object from *webpage*, or None.

    Both the ``window["ytInitialData"] = ...`` and the
    ``var ytInitialData = ...`` forms are recognised.  Parsing is
    non-fatal.
    """
    initial_data_patterns = (
        r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
        r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
    )
    raw_data = self._search_regex(
        initial_data_patterns, webpage, 'ytInitialData', default=None)
    if not raw_data:
        return None
    return self._parse_json(
        uppercase_escape(raw_data), video_id, fatal=False)
2020-10-29 17:37:06 -04:00
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music metadata rows from a parsed ``ytInitialData`` object.

    Returns a list of dicts with any of the keys ``album``, ``artist`` and
    ``track``.  Rows are read from every ``videoSecondaryInfoRenderer``
    found in the watch-page results; a key that repeats within one renderer
    starts a new entry.  An empty list is returned when nothing matches.
    """
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    music_metadata = []
    contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
        list) or []
    for content in contents:
        secondary_info = try_get(
            content, lambda x: x['videoSecondaryInfoRenderer'], dict)
        if secondary_info is None:
            continue
        rows = try_get(
            secondary_info,
            lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
            list)
        if rows is None:
            continue
        music_track = {}
        for row in rows:
            row_renderer = try_get(
                row, lambda x: x['metadataRowRenderer'], dict)
            if row_renderer is None:
                continue
            # compat_str rather than str: with the previous
            # ``type(...) is str`` test every (unicode) JSON string was
            # rejected on Python 2, so no metadata was ever collected there.
            key = try_get(
                row_renderer, lambda x: x['title']['simpleText'], compat_str)
            value = try_get(
                row_renderer,
                lambda x: x['contents'][0]['simpleText'], compat_str) or try_get(
                row_renderer,
                lambda x: x['contents'][0]['runs'][0]['text'], compat_str)
            if key is None or value is None:
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
2015-02-16 15:44:17 -05:00
def _get_automatic_captions(self, video_id, webpage):
    """Return automatic captions as a dict of language code -> formats.

    The webpage is needed for getting the captions url; it is passed in as
    an argument to speed up the process.

    Three sources inside the player config ``args`` are tried in order: the
    ``ttsurl`` listing, the ``player_response`` caption tracks, and the
    older ``caption_tracks`` / ``caption_translation_languages`` pair.
    ``{}`` is returned, after a warning, when nothing usable is found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> node supplies the source language and kind.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # One entry per <target> language, each with every subtitle format.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    params = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    sub_formats.append({
                        'url': caption_url + '&' + params,
                        'ext': ext,
                    })
                sub_lang_list[sub_lang] = sub_formats
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Build the captions dict by rewriting the 'tlang' and 'fmt'
            # query parameters of sub_url for every language/format pair.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                        query=compat_urllib_parse_urlencode(caption_qs, True)))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                caption_tracks = renderer['captionTracks']
                for caption_track in caption_tracks:
                    if 'kind' not in caption_track:
                        # not an automatic transcription
                        continue
                    # The first track carrying a 'kind' key is used; the
                    # function returns from inside the loop.
                    base_url = caption_track['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

                # No caption track carried a 'kind' key.
                self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                return {}
        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
2018-12-16 07:35:48 -05:00
def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-stats URL so the video is marked as watched.

    The URL is taken from *player_response* or, failing that, from
    *video_info*; nothing happens when neither provides one.  The download
    is non-fatal.
    """
    base_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not base_url:
        base_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(base_url)
    if not playback_url:
        return

    url_parts = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(url_parts.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]
    cpn = ''.join(cpn_chars)

    query['ver'] = ['2']
    query['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        url_parts._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)
2017-09-05 13:48:37 -04:00
@staticmethod
def _extract_urls(webpage):
    """Return every YouTube embed found in *webpage*, in discovery order.

    Three kinds of embed are recognised: generic player embeds (iframe,
    embed, object, SWFObject, data-video-url), lazyYT placeholders and the
    Wordpress "YouTube Video Importer" plugin.  The latter two yield the
    attribute value as found, not a full URL.
    """
    entries = []

    # Embedded YouTube player
    # NOTE(review): the 'embedSWF\(?:\s*' alternative reads like a typo for
    # an 'embedSWF(' prefix - kept unchanged here, please confirm.
    embed_re = r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1'''
    for embed in re.finditer(embed_re, webpage):
        entries.append(unescapeHTML(embed.group('url')))

    # lazyYT YouTube embed
    lazy_ids = re.findall(
        r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
    entries.extend(unescapeHTML(lazy_id) for lazy_id in lazy_ids)

    # Wordpress "YouTube Video Importer" plugin
    importer_re = r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)'''
    for groups in re.findall(importer_re, webpage):
        # findall yields (q1, q2, id) tuples; the id is the last group.
        entries.append(groups[-1])

    return entries
@staticmethod
def _extract_url(webpage):
    """Return the first embed reported by ``_extract_urls``, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]
2014-02-08 13:20:11 -05:00
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of ``cls._VALID_URL`` matched on *url*.

    Raises ExtractorError when *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
2020-06-05 17:16:31 -04:00
def _extract_chapters_from_json ( self , webpage , video_id , duration ) :
if not webpage :
return
2020-08-11 05:20:47 -04:00
initial_data = self . _parse_json (
2020-06-05 17:16:31 -04:00
self . _search_regex (
2020-08-11 05:20:47 -04:00
r ' window \ [ " ytInitialData " \ ] = (.+); \ n ' , webpage ,
2020-06-05 17:16:31 -04:00
' player args ' , default = ' {} ' ) ,
video_id , fatal = False )
2020-08-11 05:20:47 -04:00
if not initial_data or not isinstance ( initial_data , dict ) :
2020-06-05 17:16:31 -04:00
return
chapters_list = try_get (
2020-08-11 05:20:47 -04:00
initial_data ,
2020-06-05 17:16:31 -04:00
lambda x : x [ ' playerOverlays ' ]
[ ' playerOverlayRenderer ' ]
[ ' decoratedPlayerBarRenderer ' ]
[ ' decoratedPlayerBarRenderer ' ]
[ ' playerBar ' ]
[ ' chapteredPlayerBarRenderer ' ]
[ ' chapters ' ] ,
list )
if not chapters_list :
return
def chapter_time ( chapter ) :
return float_or_none (
try_get (
chapter ,
lambda x : x [ ' chapterRenderer ' ] [ ' timeRangeStartMillis ' ] ,
int ) ,
scale = 1000 )
chapters = [ ]
for next_num , chapter in enumerate ( chapters_list , start = 1 ) :
start_time = chapter_time ( chapter )
if start_time is None :
continue
end_time = ( chapter_time ( chapters_list [ next_num ] )
if next_num < len ( chapters_list ) else duration )
if end_time is None :
continue
title = try_get (
chapter , lambda x : x [ ' chapterRenderer ' ] [ ' title ' ] [ ' simpleText ' ] ,
compat_str )
chapters . append ( {
' start_time ' : start_time ,
' end_time ' : end_time ,
' title ' : title ,
} )
return chapters
2017-05-05 15:27:06 -04:00
@staticmethod
def _extract_chapters_from_description(description, duration):
    """Build a chapter list from seek links in an HTML *description*.

    Each description line containing a ``yt.www.watch.player.seekTo`` link
    with a timestamp becomes one chapter; a chapter ends where the next one
    starts and the last one ends at *duration*.

    Returns a list of dicts with ``start_time``, ``end_time`` and ``title``,
    or None when there is no description or no chapter line.  *duration*
    may be None: the bound checks are then skipped and the last chapter,
    having no known end, is left out.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    # Comparing a number with None raises TypeError on Python 3, so the
    # duration bounds are only applied when the duration is known.
    has_duration = duration is not None
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if has_duration and start_time > duration:
            break
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if has_duration and end_time > duration:
            end_time = duration
        if start_time > end_time:
            # Timestamps are not ascending; stop here.
            break
        # Drop the timestamp link, then tidy separators and whitespace.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters
2020-06-05 17:16:31 -04:00
def _extract_chapters(self, webpage, description, video_id, duration):
    """Return chapters from the page JSON, else from the description."""
    json_chapters = self._extract_chapters_from_json(
        webpage, video_id, duration)
    if json_chapters:
        return json_chapters
    return self._extract_chapters_from_description(description, duration)
2013-06-23 13:58:33 -04:00
def _real_extract ( self , url ) :
2015-07-25 11:30:34 -04:00
url , smuggled_data = unsmuggle_url ( url , { } )
2014-03-20 19:33:53 -04:00
proto = (
2014-09-13 01:51:06 -04:00
' http ' if self . _downloader . params . get ( ' prefer_insecure ' , False )
else ' https ' )
2014-03-20 19:33:53 -04:00
2015-07-20 15:10:28 -04:00
start_time = None
2015-07-23 07:20:21 -04:00
end_time = None
2015-07-20 15:10:28 -04:00
parsed_url = compat_urllib_parse_urlparse ( url )
for component in [ parsed_url . fragment , parsed_url . query ] :
query = compat_parse_qs ( component )
2015-07-23 07:20:21 -04:00
if start_time is None and ' t ' in query :
2015-07-20 15:10:28 -04:00
start_time = parse_duration ( query [ ' t ' ] [ 0 ] )
2015-07-23 07:21:18 -04:00
if start_time is None and ' start ' in query :
start_time = parse_duration ( query [ ' start ' ] [ 0 ] )
2015-07-23 07:20:21 -04:00
if end_time is None and ' end ' in query :
end_time = parse_duration ( query [ ' end ' ] [ 0 ] )
2015-07-20 15:10:28 -04:00
2013-06-23 13:58:33 -04:00
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re . search ( self . _NEXT_URL_RE , url )
if mobj :
2015-07-17 13:51:57 -04:00
url = proto + ' ://www.youtube.com/ ' + compat_urllib_parse_unquote ( mobj . group ( 1 ) ) . lstrip ( ' / ' )
2014-02-08 13:20:11 -05:00
video_id = self . extract_id ( url )
2013-06-23 13:58:33 -04:00
# Get video webpage
2014-11-23 03:59:02 -05:00
url = proto + ' ://www.youtube.com/watch?v= %s &gl=US&hl=en&has_verified=1&bpctr=9999999999 ' % video_id
2020-04-30 13:40:38 -04:00
video_webpage , urlh = self . _download_webpage_handle ( url , video_id )
qs = compat_parse_qs ( compat_urllib_parse_urlparse ( urlh . geturl ( ) ) . query )
video_id = qs . get ( ' v ' , [ None ] ) [ 0 ] or video_id
2013-06-23 13:58:33 -04:00
# Attempt to extract SWF player URL
2013-09-21 08:19:30 -04:00
mobj = re . search ( r ' swfConfig.*? " (https?: \\ / \\ /.*?watch.*?-.*? \ .swf) " ' , video_webpage )
2013-06-23 13:58:33 -04:00
if mobj is not None :
player_url = re . sub ( r ' \\ (.) ' , r ' \ 1 ' , mobj . group ( 1 ) )
else :
player_url = None
2015-06-26 14:36:23 -04:00
dash_mpds = [ ]
def add_dash_mpd ( video_info ) :
dash_mpd = video_info . get ( ' dashmpd ' )
if dash_mpd and dash_mpd [ 0 ] not in dash_mpds :
dash_mpds . append ( dash_mpd [ 0 ] )
2019-01-15 13:12:58 -05:00
def add_dash_mpd_pr ( pl_response ) :
dash_mpd = url_or_none ( try_get (
pl_response , lambda x : x [ ' streamingData ' ] [ ' dashManifestUrl ' ] ,
compat_str ) )
if dash_mpd and dash_mpd not in dash_mpds :
dash_mpds . append ( dash_mpd )
2017-08-26 04:38:38 -04:00
is_live = None
view_count = None
def extract_view_count ( v_info ) :
return int_or_none ( try_get ( v_info , lambda x : x [ ' view_count ' ] [ 0 ] ) )
2019-07-30 13:14:08 -04:00
def extract_player_response ( player_response , video_id ) :
pl_response = str_or_none ( player_response )
if not pl_response :
return
pl_response = self . _parse_json ( pl_response , video_id , fatal = False )
if isinstance ( pl_response , dict ) :
add_dash_mpd_pr ( pl_response )
return pl_response
2020-10-25 23:32:37 -04:00
def extract_embedded_config ( embed_webpage , video_id ) :
embedded_config = self . _search_regex (
r ' setConfig \ (( { .*}) \ ); ' ,
embed_webpage , ' ytInitialData ' , default = None )
if embedded_config :
return embedded_config
2018-11-02 19:26:16 -04:00
player_response = { }
2013-06-23 13:58:33 -04:00
# Get video info
2020-03-07 20:34:17 -05:00
video_info = { }
2015-01-29 22:43:50 -05:00
embed_webpage = None
2020-09-11 23:08:57 -04:00
if ( self . _og_search_property ( ' restrictions:age ' , video_webpage , default = None ) == ' 18+ '
or re . search ( r ' player-age-gate-content " > ' , video_webpage ) is not None ) :
2020-10-06 22:19:08 -04:00
cookie_keys = self . _get_cookies ( ' https://www.youtube.com ' ) . keys ( )
2013-07-09 08:38:24 -04:00
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
2014-12-29 16:58:14 -05:00
url = proto + ' ://www.youtube.com/embed/ %s ' % video_id
embed_webpage = self . _download_webpage ( url , video_id , ' Downloading embed webpage ' )
2020-10-25 23:32:37 -04:00
ext = extract_embedded_config ( embed_webpage , video_id )
# playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
playable_in_embed = re . search ( r ' { \\ \ " playableInEmbed \\ \ " :(?P<playableinEmbed>[^ \ ,]+) ' , ext )
if not playable_in_embed :
self . to_screen ( ' Could not determine whether playabale in embed for video %s ' % video_id )
playable_in_embed = ' '
else :
playable_in_embed = playable_in_embed . group ( ' playableinEmbed ' )
# check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
# if re.search(r'player-unavailable">', embed_webpage) is not None:
if playable_in_embed == ' false ' :
2020-10-06 22:54:38 -04:00
'''
# TODO apply this patch when Support for Python 2.6(!) and above drops
2020-10-06 22:19:08 -04:00
if ( { ' VISITOR_INFO1_LIVE ' , ' HSID ' , ' SSID ' , ' SID ' } < = cookie_keys
2020-10-06 22:31:23 -04:00
or { ' VISITOR_INFO1_LIVE ' , ' __Secure-3PSID ' , ' LOGIN_INFO ' } < = cookie_keys ) :
2020-10-06 22:54:38 -04:00
'''
if ( set ( ( ' VISITOR_INFO1_LIVE ' , ' HSID ' , ' SSID ' , ' SID ' ) ) < = set ( cookie_keys )
or set ( ( ' VISITOR_INFO1_LIVE ' , ' __Secure-3PSID ' , ' LOGIN_INFO ' ) ) < = set ( cookie_keys ) ) :
2020-10-06 22:31:23 -04:00
age_gate = False
# Try looking directly into the video webpage
ytplayer_config = self . _get_ytplayer_config ( video_id , video_webpage )
if ytplayer_config :
args = ytplayer_config [ ' args ' ]
if args . get ( ' url_encoded_fmt_stream_map ' ) or args . get ( ' hlsvp ' ) :
# Convert to the same format returned by compat_parse_qs
video_info = dict ( ( k , [ v ] ) for k , v in args . items ( ) )
add_dash_mpd ( video_info )
# Rental video is not rented but preview is available (e.g.
# https://www.youtube.com/watch?v=yYr8q0y5Jfg,
# https://github.com/ytdl-org/youtube-dl/issues/10532)
if not video_info and args . get ( ' ypc_vid ' ) :
return self . url_result (
args [ ' ypc_vid ' ] , YoutubeIE . ie_key ( ) , video_id = args [ ' ypc_vid ' ] )
if args . get ( ' livestream ' ) == ' 1 ' or args . get ( ' live_playback ' ) == 1 :
is_live = True
if not player_response :
player_response = extract_player_response ( args . get ( ' player_response ' ) , video_id )
if not video_info or self . _downloader . params . get ( ' youtube_include_dash_manifest ' , True ) :
add_dash_mpd_pr ( player_response )
2020-10-06 22:19:08 -04:00
else :
raise ExtractorError ( ' Video is age restricted and only playable on Youtube. Requires cookies! ' , expected = True )
else :
data = compat_urllib_parse_urlencode ( {
' video_id ' : video_id ,
' eurl ' : ' https://youtube.googleapis.com/v/ ' + video_id ,
' sts ' : self . _search_regex (
r ' " sts " \ s*: \ s*( \ d+) ' , embed_webpage , ' sts ' , default = ' ' ) ,
} )
video_info_url = proto + ' ://www.youtube.com/get_video_info? ' + data
try :
video_info_webpage = self . _download_webpage (
video_info_url , video_id ,
note = ' Refetching age-gated info webpage ' ,
errnote = ' unable to download video info webpage ' )
except ExtractorError :
video_info_webpage = None
if video_info_webpage :
video_info = compat_parse_qs ( video_info_webpage )
pl_response = video_info . get ( ' player_response ' , [ None ] ) [ 0 ]
player_response = extract_player_response ( pl_response , video_id )
add_dash_mpd ( video_info )
view_count = extract_view_count ( video_info )
2013-07-09 08:38:24 -04:00
else :
age_gate = False
2015-06-26 14:36:23 -04:00
# Try looking directly into the video webpage
2015-11-23 10:00:06 -05:00
ytplayer_config = self . _get_ytplayer_config ( video_id , video_webpage )
if ytplayer_config :
2014-11-30 14:56:32 -05:00
args = ytplayer_config [ ' args ' ]
2018-05-13 08:20:16 -04:00
if args . get ( ' url_encoded_fmt_stream_map ' ) or args . get ( ' hlsvp ' ) :
2015-06-26 14:36:23 -04:00
# Convert to the same format returned by compat_parse_qs
video_info = dict ( ( k , [ v ] ) for k , v in args . items ( ) )
add_dash_mpd ( video_info )
2016-09-02 14:17:15 -04:00
# Rental video is not rented but preview is available (e.g.
# https://www.youtube.com/watch?v=yYr8q0y5Jfg,
2019-03-09 07:14:41 -05:00
# https://github.com/ytdl-org/youtube-dl/issues/10532)
2016-09-02 14:17:15 -04:00
if not video_info and args . get ( ' ypc_vid ' ) :
return self . url_result (
args [ ' ypc_vid ' ] , YoutubeIE . ie_key ( ) , video_id = args [ ' ypc_vid ' ] )
2015-07-20 14:14:20 -04:00
if args . get ( ' livestream ' ) == ' 1 ' or args . get ( ' live_playback ' ) == 1 :
is_live = True
2018-11-02 19:26:16 -04:00
if not player_response :
2019-07-30 13:14:08 -04:00
player_response = extract_player_response ( args . get ( ' player_response ' ) , video_id )
2015-06-27 04:31:18 -04:00
if not video_info or self . _downloader . params . get ( ' youtube_include_dash_manifest ' , True ) :
2019-01-15 13:12:58 -05:00
add_dash_mpd_pr ( player_response )
2018-01-25 09:30:33 -05:00
def extract_unavailable_message ( ) :
2019-08-16 12:36:23 -04:00
messages = [ ]
for tag , kind in ( ( ' h1 ' , ' message ' ) , ( ' div ' , ' submessage ' ) ) :
msg = self . _html_search_regex (
r ' (?s)< {tag} [^>]+id=[ " \' ]unavailable- {kind} [ " \' ][^>]*>(.+?)</ {tag} > ' . format ( tag = tag , kind = kind ) ,
video_webpage , ' unavailable %s ' % kind , default = None )
if msg :
messages . append ( msg )
if messages :
return ' \n ' . join ( messages )
2018-01-25 09:30:33 -05:00
2020-03-07 17:09:02 -05:00
if not video_info and not player_response :
2019-04-22 15:31:09 -04:00
unavailable_message = extract_unavailable_message ( )
if not unavailable_message :
unavailable_message = ' Unable to extract video data '
raise ExtractorError (
' YouTube said: %s ' % unavailable_message , expected = True , video_id = video_id )
2020-03-07 17:09:02 -05:00
if not isinstance ( video_info , dict ) :
video_info = { }
2018-11-02 19:26:16 -04:00
video_details = try_get (
player_response , lambda x : x [ ' videoDetails ' ] , dict ) or { }
2020-06-15 15:37:19 -04:00
microformat = try_get (
player_response , lambda x : x [ ' microformat ' ] [ ' playerMicroformatRenderer ' ] , dict ) or { }
2019-07-29 19:13:33 -04:00
video_title = video_info . get ( ' title ' , [ None ] ) [ 0 ] or video_details . get ( ' title ' )
if not video_title :
2015-07-25 11:30:34 -04:00
self . _downloader . report_warning ( ' Unable to extract video title ' )
video_title = ' _ '
2017-05-05 15:27:06 -04:00
description_original = video_description = get_element_by_id ( " eow-description " , video_webpage )
2015-07-25 11:30:34 -04:00
if video_description :
2017-10-17 06:07:37 -04:00
def replace_url ( m ) :
# Replacement callback for the re.sub call over the description markup
# that follows: m is the match for one <a ...> element and group 1 is its
# captured title/href value.
# The captured value is resolved against the enclosing-scope `url`.
redir_url = compat_urlparse . urljoin ( url , m . group ( 1 ) )
parsed_redir_url = compat_urllib_parse_urlparse ( redir_url )
# Only links pointing at the /redirect path on a youtube.com,
# youtube-nocookie.com or youtu.be host (optionally prefixed with www.)
# are unwrapped.
if re . search ( r ' ^(?:www \ .)?(?:youtube(?:-nocookie)? \ .com|youtu \ .be)$ ' , parsed_redir_url . netloc ) and parsed_redir_url . path == ' /redirect ' :
qs = compat_parse_qs ( parsed_redir_url . query )
q = qs . get ( ' q ' )
# Return the first non-empty `q` query value (the redirect target).
if q and q [ 0 ] :
return q [ 0 ]
# Not a redirect link, or no usable `q` value: keep the resolved URL.
return redir_url
2017-05-05 15:27:06 -04:00
description_original = video_description = re . sub ( r ''' (?x)
2015-07-25 11:30:34 -04:00
< a \s +
2016-05-06 12:11:18 -04:00
( ? : [ a - zA - Z - ] + = " [^ " ] * " \ s+)*?
2016-01-07 13:52:55 -05:00
( ? : title | href ) = " ([^ " ] + ) " \ s+
2016-05-06 12:11:18 -04:00
( ? : [ a - zA - Z - ] + = " [^ " ] * " \ s+)*?
2016-06-24 11:37:13 -04:00
class = " [^ " ] * " [^>]*>
2016-01-07 13:52:55 -05:00
[ ^ < ] + \. { 3 } \s *
2015-07-25 11:30:34 -04:00
< / a >
2017-10-17 06:07:37 -04:00
''' , replace_url, video_description)
2015-07-25 11:30:34 -04:00
video_description = clean_html ( video_description )
else :
2020-09-13 10:23:21 -04:00
video_description = video_details . get ( ' shortDescription ' )
if video_description is None :
video_description = self . _html_search_meta ( ' description ' , video_webpage )
2015-07-25 11:30:34 -04:00
2018-12-15 10:25:12 -05:00
if not smuggled_data . get ( ' force_singlefeed ' , False ) :
2015-07-29 11:18:16 -04:00
if not self . _downloader . params . get ( ' noplaylist ' ) :
2018-12-15 10:25:12 -05:00
multifeed_metadata_list = try_get (
player_response ,
lambda x : x [ ' multicamera ' ] [ ' playerLegacyMulticameraRenderer ' ] [ ' metadataList ' ] ,
compat_str ) or try_get (
video_info , lambda x : x [ ' multifeed_metadata_list ' ] [ 0 ] , compat_str )
if multifeed_metadata_list :
entries = [ ]
feed_ids = [ ]
for feed in multifeed_metadata_list . split ( ' , ' ) :
# Unquote should take place before split on comma (,) since textual
# fields may contain comma as well (see
2019-03-09 07:14:41 -05:00
# https://github.com/ytdl-org/youtube-dl/issues/8536)
2018-12-15 10:25:12 -05:00
feed_data = compat_parse_qs ( compat_urllib_parse_unquote_plus ( feed ) )
2020-04-09 11:42:43 -04:00
def feed_entry ( name ) :
# Return the first value stored under `name` in feed_data (the parsed
# query-string dict of the feed currently being iterated), filtered
# through try_get with compat_str as the expected type.
# NOTE(review): presumably yields None when the key is missing or the
# value is not a compat_str - try_get is defined elsewhere; confirm.
return try_get ( feed_data , lambda x : x [ name ] [ 0 ] , compat_str )
feed_id = feed_entry ( ' id ' )
if not feed_id :
continue
feed_title = feed_entry ( ' title ' )
title = video_title
if feed_title :
title + = ' ( %s ) ' % feed_title
2018-12-15 10:25:12 -05:00
entries . append ( {
' _type ' : ' url_transparent ' ,
' ie_key ' : ' Youtube ' ,
' url ' : smuggle_url (
' %s ://www.youtube.com/watch?v= %s ' % ( proto , feed_data [ ' id ' ] [ 0 ] ) ,
{ ' force_singlefeed ' : True } ) ,
2020-04-09 11:42:43 -04:00
' title ' : title ,
2018-12-15 10:25:12 -05:00
} )
2020-04-09 11:42:43 -04:00
feed_ids . append ( feed_id )
2018-12-15 10:25:12 -05:00
self . to_screen (
' Downloading multifeed video ( %s ) - add --no-playlist to just download video %s '
% ( ' , ' . join ( feed_ids ) , video_id ) )
return self . playlist_result ( entries , video_id , video_title , video_description )
else :
self . to_screen ( ' Downloading just video %s because of --no-playlist ' % video_id )
2015-07-25 11:30:34 -04:00
2017-08-26 04:38:38 -04:00
if view_count is None :
2017-08-26 18:59:08 -04:00
view_count = extract_view_count ( video_info )
2018-11-02 19:26:16 -04:00
if view_count is None and video_details :
view_count = int_or_none ( video_details . get ( ' viewCount ' ) )
2020-06-15 15:38:45 -04:00
if view_count is None and microformat :
view_count = int_or_none ( microformat . get ( ' viewCount ' ) )
2013-11-17 05:06:16 -05:00
2019-07-11 16:45:58 -04:00
if is_live is None :
2019-07-14 09:30:05 -04:00
is_live = bool_or_none ( video_details . get ( ' isLive ' ) )
2019-07-11 16:45:58 -04:00
2020-08-04 20:30:10 -04:00
has_live_chat_replay = False
2020-08-05 16:29:41 -04:00
if not is_live :
2020-08-04 20:30:10 -04:00
yt_initial_data = self . _get_yt_initial_data ( video_id , video_webpage )
try :
yt_initial_data [ ' contents ' ] [ ' twoColumnWatchNextResults ' ] [ ' conversationBar ' ] [ ' liveChatRenderer ' ] [ ' continuations ' ] [ 0 ] [ ' reloadContinuationData ' ] [ ' continuation ' ]
has_live_chat_replay = True
2020-08-05 16:29:41 -04:00
except ( KeyError , IndexError , TypeError ) :
2020-08-04 20:30:10 -04:00
pass
2013-06-23 13:58:33 -04:00
# Check for "rental" videos
if ' ypc_video_rental_bar_text ' in video_info and ' author ' not in video_info :
2019-03-09 07:14:41 -05:00
raise ExtractorError ( ' " rental " videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information. ' , expected = True )
2013-06-23 13:58:33 -04:00
2018-05-14 12:27:56 -04:00
def _extract_filesize ( media_url ) :
# Pull the digits following a `clen` marker (written either as clen=N or
# clen/N) out of media_url and pass them through int_or_none.
# default = None is supplied so a URL without `clen` is not fatal
# (presumably the result is then None - helpers are defined elsewhere).
return int_or_none ( self . _search_regex (
r ' \ bclen[=/]( \ d+) ' , media_url , ' filesize ' , default = None ) )
2019-09-11 11:44:47 -04:00
streaming_formats = try_get ( player_response , lambda x : x [ ' streamingData ' ] [ ' formats ' ] , list ) or [ ]
streaming_formats . extend ( try_get ( player_response , lambda x : x [ ' streamingData ' ] [ ' adaptiveFormats ' ] , list ) or [ ] )
2013-06-23 13:58:33 -04:00
if ' conn ' in video_info and video_info [ ' conn ' ] [ 0 ] . startswith ( ' rtmp ' ) :
self . report_rtmp_download ( )
2014-01-18 23:47:20 -05:00
formats = [ {
' format_id ' : ' _rtmp ' ,
' protocol ' : ' rtmp ' ,
' url ' : video_info [ ' conn ' ] [ 0 ] ,
' player_url ' : player_url ,
} ]
2019-09-11 11:44:47 -04:00
elif not is_live and ( streaming_formats or len ( video_info . get ( ' url_encoded_fmt_stream_map ' , [ ' ' ] ) [ 0 ] ) > = 1 or len ( video_info . get ( ' adaptive_fmts ' , [ ' ' ] ) [ 0 ] ) > = 1 ) :
2014-11-23 14:41:03 -05:00
encoded_url_map = video_info . get ( ' url_encoded_fmt_stream_map ' , [ ' ' ] ) [ 0 ] + ' , ' + video_info . get ( ' adaptive_fmts ' , [ ' ' ] ) [ 0 ]
2013-10-25 10:52:58 -04:00
if ' rtmpe % 3Dyes ' in encoded_url_map :
2019-03-09 07:14:41 -05:00
raise ExtractorError ( ' rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information. ' , expected = True )
2019-09-11 11:44:47 -04:00
formats = [ ]
2016-03-02 11:52:13 -05:00
formats_spec = { }
2016-03-02 08:23:17 -05:00
fmt_list = video_info . get ( ' fmt_list ' , [ ' ' ] ) [ 0 ]
if fmt_list :
for fmt in fmt_list . split ( ' , ' ) :
spec = fmt . split ( ' / ' )
2016-03-02 11:52:13 -05:00
if len ( spec ) > 1 :
width_height = spec [ 1 ] . split ( ' x ' )
if len ( width_height ) == 2 :
formats_spec [ spec [ 0 ] ] = {
' resolution ' : spec [ 1 ] ,
' width ' : int_or_none ( width_height [ 0 ] ) ,
' height ' : int_or_none ( width_height [ 1 ] ) ,
}
2019-09-11 11:44:47 -04:00
for fmt in streaming_formats :
itag = str_or_none ( fmt . get ( ' itag ' ) )
if not itag :
2014-08-02 00:35:18 -04:00
continue
2019-09-11 11:44:47 -04:00
quality = fmt . get ( ' quality ' )
quality_label = fmt . get ( ' qualityLabel ' ) or quality
formats_spec [ itag ] = {
' asr ' : int_or_none ( fmt . get ( ' audioSampleRate ' ) ) ,
' filesize ' : int_or_none ( fmt . get ( ' contentLength ' ) ) ,
' format_note ' : quality_label ,
' fps ' : int_or_none ( fmt . get ( ' fps ' ) ) ,
' height ' : int_or_none ( fmt . get ( ' height ' ) ) ,
# bitrate for itag 43 is always 2147483647
' tbr ' : float_or_none ( fmt . get ( ' averageBitrate ' ) or fmt . get ( ' bitrate ' ) , 1000 ) if itag != ' 43 ' else None ,
' width ' : int_or_none ( fmt . get ( ' width ' ) ) ,
}
for fmt in streaming_formats :
2020-04-11 12:05:08 -04:00
if fmt . get ( ' drmFamilies ' ) or fmt . get ( ' drm_families ' ) :
2019-09-11 11:44:47 -04:00
continue
url = url_or_none ( fmt . get ( ' url ' ) )
if not url :
2020-05-08 06:42:30 -04:00
cipher = fmt . get ( ' cipher ' ) or fmt . get ( ' signatureCipher ' )
2019-09-11 11:44:47 -04:00
if not cipher :
continue
url_data = compat_parse_qs ( cipher )
url = url_or_none ( try_get ( url_data , lambda x : x [ ' url ' ] [ 0 ] , compat_str ) )
if not url :
continue
else :
cipher = None
url_data = compat_parse_qs ( compat_urllib_parse_urlparse ( url ) . query )
2019-01-15 13:28:50 -05:00
stream_type = int_or_none ( try_get ( url_data , lambda x : x [ ' stream_type ' ] [ 0 ] ) )
# Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3 :
continue
2015-01-29 22:43:50 -05:00
2019-09-11 11:44:47 -04:00
format_id = fmt . get ( ' itag ' ) or url_data [ ' itag ' ] [ 0 ]
if not format_id :
continue
format_id = compat_str ( format_id )
2017-05-27 12:47:41 -04:00
2019-09-11 11:44:47 -04:00
if cipher :
if ' s ' in url_data or self . _downloader . params . get ( ' youtube_include_dash_manifest ' , True ) :
2020-10-27 15:39:49 -04:00
ASSETS_RE = r ' (?: " assets " :.+? " js " : \ s*( " [^ " ]+ " ))|(?: " jsUrl " : \ s*( " [^ " ]+ " )) '
2019-09-11 11:44:47 -04:00
jsplayer_url_json = self . _search_regex (
ASSETS_RE ,
embed_webpage if age_gate else video_webpage ,
' JS player URL (1) ' , default = None )
if not jsplayer_url_json and not age_gate :
# We need the embed website after all
if embed_webpage is None :
embed_url = proto + ' ://www.youtube.com/embed/ %s ' % video_id
embed_webpage = self . _download_webpage (
embed_url , video_id , ' Downloading embed webpage ' )
jsplayer_url_json = self . _search_regex (
ASSETS_RE , embed_webpage , ' JS player URL ' )
player_url = json . loads ( jsplayer_url_json )
2014-07-17 10:28:30 -04:00
if player_url is None :
2019-09-11 11:44:47 -04:00
player_url_json = self . _search_regex (
r ' ytplayer \ .config.*? " url " \ s*: \ s*( " [^ " ]+ " ) ' ,
video_webpage , ' age gate player URL ' )
player_url = json . loads ( player_url_json )
if ' sig ' in url_data :
url + = ' &signature= ' + url_data [ ' sig ' ] [ 0 ]
elif ' s ' in url_data :
encrypted_sig = url_data [ ' s ' ] [ 0 ]
if self . _downloader . params . get ( ' verbose ' ) :
if player_url is None :
player_desc = ' unknown '
2014-07-17 10:28:30 -04:00
else :
2020-05-01 20:18:08 -04:00
player_type , player_version = self . _extract_player_info ( player_url )
player_desc = ' %s player %s ' % ( ' flash ' if player_type == ' swf ' else ' html5 ' , player_version )
2019-09-11 11:44:47 -04:00
parts_sizes = self . _signature_cache_id ( encrypted_sig )
self . to_screen ( ' { %s } signature length %s , %s ' %
( format_id , parts_sizes , player_desc ) )
signature = self . _decrypt_signature (
encrypted_sig , video_id , player_url , age_gate )
sp = try_get ( url_data , lambda x : x [ ' sp ' ] [ 0 ] , compat_str ) or ' signature '
url + = ' & %s = %s ' % ( sp , signature )
2014-08-02 00:35:18 -04:00
if ' ratebypass ' not in url :
url + = ' &ratebypass=yes '
2015-06-15 13:06:43 -04:00
2016-01-24 12:02:19 -05:00
dct = {
' format_id ' : format_id ,
' url ' : url ,
' player_url ' : player_url ,
}
if format_id in self . _formats :
dct . update ( self . _formats [ format_id ] )
2016-03-02 11:52:13 -05:00
if format_id in formats_spec :
dct . update ( formats_spec [ format_id ] )
2016-01-24 12:02:19 -05:00
2015-08-29 22:07:07 -04:00
# Some itags are not included in DASH manifest thus corresponding formats will
2019-03-09 07:14:41 -05:00
# lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2015-08-29 22:07:07 -04:00
# Trying to extract metadata from url_encoded_fmt_stream_map entry.
mobj = re . search ( r ' ^(?P<width> \ d+)[xX](?P<height> \ d+)$ ' , url_data . get ( ' size ' , [ ' ' ] ) [ 0 ] )
width , height = ( int ( mobj . group ( ' width ' ) ) , int ( mobj . group ( ' height ' ) ) ) if mobj else ( None , None )
2016-01-24 12:02:19 -05:00
2019-09-11 11:44:47 -04:00
if width is None :
width = int_or_none ( fmt . get ( ' width ' ) )
if height is None :
height = int_or_none ( fmt . get ( ' height ' ) )
2018-05-14 12:27:56 -04:00
filesize = int_or_none ( url_data . get (
' clen ' , [ None ] ) [ 0 ] ) or _extract_filesize ( url )
2019-09-11 11:44:47 -04:00
quality = url_data . get ( ' quality ' , [ None ] ) [ 0 ] or fmt . get ( ' quality ' )
quality_label = url_data . get ( ' quality_label ' , [ None ] ) [ 0 ] or fmt . get ( ' qualityLabel ' )
2019-09-11 15:45:30 -04:00
tbr = ( float_or_none ( url_data . get ( ' bitrate ' , [ None ] ) [ 0 ] , 1000 )
or float_or_none ( fmt . get ( ' bitrate ' ) , 1000 ) ) if format_id != ' 43 ' else None
2019-09-11 11:44:47 -04:00
fps = int_or_none ( url_data . get ( ' fps ' , [ None ] ) [ 0 ] ) or int_or_none ( fmt . get ( ' fps ' ) )
2018-05-16 11:24:44 -04:00
2016-01-24 12:02:19 -05:00
more_fields = {
2018-05-14 12:27:56 -04:00
' filesize ' : filesize ,
2019-09-11 11:44:47 -04:00
' tbr ' : tbr ,
2015-06-15 13:06:43 -04:00
' width ' : width ,
' height ' : height ,
2019-09-11 11:44:47 -04:00
' fps ' : fps ,
' format_note ' : quality_label or quality ,
2015-06-15 13:06:43 -04:00
}
2016-01-24 12:02:19 -05:00
for key , value in more_fields . items ( ) :
if value :
dct [ key ] = value
2019-09-11 11:44:47 -04:00
type_ = url_data . get ( ' type ' , [ None ] ) [ 0 ] or fmt . get ( ' mimeType ' )
2015-08-29 22:07:07 -04:00
if type_ :
type_split = type_ . split ( ' ; ' )
kind_ext = type_split [ 0 ] . split ( ' / ' )
if len ( kind_ext ) == 2 :
2016-01-24 12:02:19 -05:00
kind , _ = kind_ext
dct [ ' ext ' ] = mimetype2ext ( type_split [ 0 ] )
2015-08-29 22:07:07 -04:00
if kind in ( ' audio ' , ' video ' ) :
codecs = None
for mobj in re . finditer (
r ' (?P<key>[a-zA-Z_-]+)=(?P<quote>[ " \' ]?)(?P<val>.+?)(?P=quote)(?:;|$) ' , type_ ) :
if mobj . group ( ' key ' ) == ' codecs ' :
codecs = mobj . group ( ' val ' )
break
if codecs :
2017-02-12 06:09:53 -05:00
dct . update ( parse_codecs ( codecs ) )
2018-02-03 19:17:26 -05:00
if dct . get ( ' acodec ' ) == ' none ' or dct . get ( ' vcodec ' ) == ' none ' :
dct [ ' downloader_options ' ] = {
# Youtube throttles chunks >~10M
' http_chunk_size ' : 10485760 ,
}
2015-08-29 22:07:07 -04:00
formats . append ( dct )
2013-06-23 13:58:33 -04:00
else :
2019-01-10 10:46:53 -05:00
manifest_url = (
url_or_none ( try_get (
player_response ,
lambda x : x [ ' streamingData ' ] [ ' hlsManifestUrl ' ] ,
2019-05-10 16:56:22 -04:00
compat_str ) )
or url_or_none ( try_get (
2019-01-10 10:46:53 -05:00
video_info , lambda x : x [ ' hlsvp ' ] [ 0 ] , compat_str ) ) )
if manifest_url :
formats = [ ]
m3u8_formats = self . _extract_m3u8_formats (
manifest_url , video_id , ' mp4 ' , fatal = False )
for a_format in m3u8_formats :
itag = self . _search_regex (
r ' /itag/( \ d+)/ ' , a_format [ ' url ' ] , ' itag ' , default = None )
if itag :
a_format [ ' format_id ' ] = itag
if itag in self . _formats :
dct = self . _formats [ itag ] . copy ( )
dct . update ( a_format )
a_format = dct
a_format [ ' player_url ' ] = player_url
# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
a_format . setdefault ( ' http_headers ' , { } ) [ ' Youtubedl-no-compression ' ] = ' True '
2020-09-16 07:00:41 -04:00
if self . _downloader . params . get ( ' youtube_include_hls_manifest ' , True ) :
formats . append ( a_format )
2019-01-10 10:46:53 -05:00
else :
2019-08-05 15:32:44 -04:00
error_message = extract_unavailable_message ( )
2019-01-10 10:46:53 -05:00
if not error_message :
2019-08-05 15:32:44 -04:00
error_message = clean_html ( try_get (
player_response , lambda x : x [ ' playabilityStatus ' ] [ ' reason ' ] ,
compat_str ) )
if not error_message :
error_message = clean_html (
try_get ( video_info , lambda x : x [ ' reason ' ] [ 0 ] , compat_str ) )
2019-01-10 10:46:53 -05:00
if error_message :
raise ExtractorError ( error_message , expected = True )
raise ExtractorError ( ' no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info ' )
2013-06-23 13:58:33 -04:00
2018-06-02 15:08:38 -04:00
# uploader
2018-11-02 19:26:16 -04:00
video_uploader = try_get (
video_info , lambda x : x [ ' author ' ] [ 0 ] ,
compat_str ) or str_or_none ( video_details . get ( ' author ' ) )
2018-06-02 15:08:38 -04:00
if video_uploader :
video_uploader = compat_urllib_parse_unquote_plus ( video_uploader )
else :
self . _downloader . report_warning ( ' unable to extract uploader name ' )
# uploader_id
video_uploader_id = None
video_uploader_url = None
mobj = re . search (
r ' <link itemprop= " url " href= " (?P<uploader_url>https?://www \ .youtube \ .com/(?:user|channel)/(?P<uploader_id>[^ " ]+)) " > ' ,
video_webpage )
if mobj is not None :
video_uploader_id = mobj . group ( ' uploader_id ' )
video_uploader_url = mobj . group ( ' uploader_url ' )
2020-06-15 15:43:09 -04:00
else :
owner_profile_url = url_or_none ( microformat . get ( ' ownerProfileUrl ' ) )
if owner_profile_url :
video_uploader_id = self . _search_regex (
r ' (?:user|channel)/([^/]+) ' , owner_profile_url , ' uploader id ' ,
default = None )
video_uploader_url = owner_profile_url
2018-06-02 15:08:38 -04:00
2019-05-08 13:58:47 -04:00
channel_id = (
2019-05-10 16:56:22 -04:00
str_or_none ( video_details . get ( ' channelId ' ) )
or self . _html_search_meta (
' channelId ' , video_webpage , ' channel id ' , default = None )
or self . _search_regex (
2019-05-08 13:58:47 -04:00
r ' data-channel-external-id=([ " \' ])(?P<id>(?:(?! \ 1).)+) \ 1 ' ,
video_webpage , ' channel id ' , default = None , group = ' id ' ) )
2018-09-14 14:24:26 -04:00
channel_url = ' http://www.youtube.com/channel/ %s ' % channel_id if channel_id else None
2020-06-15 15:29:04 -04:00
thumbnails = [ ]
thumbnails_list = try_get (
video_details , lambda x : x [ ' thumbnail ' ] [ ' thumbnails ' ] , list ) or [ ]
for t in thumbnails_list :
if not isinstance ( t , dict ) :
continue
thumbnail_url = url_or_none ( t . get ( ' url ' ) )
if not thumbnail_url :
continue
thumbnails . append ( {
' url ' : thumbnail_url ,
' width ' : int_or_none ( t . get ( ' width ' ) ) ,
' height ' : int_or_none ( t . get ( ' height ' ) ) ,
} )
if not thumbnails :
2018-06-02 15:08:38 -04:00
video_thumbnail = None
2020-06-15 15:29:04 -04:00
# We try first to get a high quality image:
m_thumb = re . search ( r ' <span itemprop= " thumbnail " .*?href= " (.*?) " > ' ,
video_webpage , re . DOTALL )
if m_thumb is not None :
video_thumbnail = m_thumb . group ( 1 )
thumbnail_url = try_get ( video_info , lambda x : x [ ' thumbnail_url ' ] [ 0 ] , compat_str )
if thumbnail_url :
video_thumbnail = compat_urllib_parse_unquote_plus ( thumbnail_url )
if video_thumbnail :
thumbnails . append ( { ' url ' : video_thumbnail } )
2018-06-02 15:08:38 -04:00
# upload date
upload_date = self . _html_search_meta (
' datePublished ' , video_webpage , ' upload date ' , default = None )
if not upload_date :
upload_date = self . _search_regex (
[ r ' (?s)id= " eow-date.*?>(.*?)</span> ' ,
r ' (?:id= " watch-uploader-info " .*?>.*?|[ " \' ]simpleText[ " \' ] \ s*: \ s*[ " \' ])(?:Published|Uploaded|Streamed live|Started) on (.+?)[< " \' ] ' ] ,
video_webpage , ' upload date ' , default = None )
2020-06-15 15:37:19 -04:00
if not upload_date :
upload_date = microformat . get ( ' publishDate ' ) or microformat . get ( ' uploadDate ' )
2018-06-02 15:08:38 -04:00
upload_date = unified_strdate ( upload_date )
video_license = self . _html_search_regex (
r ' <h4[^>]+class= " title " [^>]*> \ s*License \ s*</h4> \ s*<ul[^>]*> \ s*<li>(.+?)</li ' ,
video_webpage , ' license ' , default = None )
m_music = re . search (
r ''' (?x)
< h4 [ ^ > ] + class = " title " [ ^ > ] * > \s * Music \s * < / h4 > \s *
< ul [ ^ > ] * > \s *
< li > ( ? P < title > . + ? )
by ( ? P < creator > . + ? )
( ? :
\( . + ? \) |
< a [ ^ > ] *
( ? :
\bhref = [ " \' ]/red[^>]*>| # drop possible
> \s * Listen ad - free with YouTube Red # YouTube Red ad
)
. * ?
) ? < / li
''' ,
video_webpage )
if m_music :
video_alt_title = remove_quotes ( unescapeHTML ( m_music . group ( ' title ' ) ) )
video_creator = clean_html ( m_music . group ( ' creator ' ) )
else :
video_alt_title = video_creator = None
def extract_meta ( field ) :
# Look up the first <li> entry of the <ul> that follows an <h4> with
# class "title" whose text is `field`, searching video_webpage from the
# enclosing scope; default = None makes a missing section non-fatal.
# `field` is interpolated into the pattern without escaping and is also
# used as the name passed to _html_search_regex; the calls below pass the
# fixed literals for Song, Artist and Album.
return self . _html_search_regex (
r ' <h4[^>]+class= " title " [^>]*> \ s* %s \ s*</h4> \ s*<ul[^>]*> \ s*<li>(.+?)</li> \ s* ' % field ,
video_webpage , field , default = None )
track = extract_meta ( ' Song ' )
artist = extract_meta ( ' Artist ' )
2019-04-28 12:37:46 -04:00
album = extract_meta ( ' Album ' )
2019-04-27 04:16:17 -04:00
# Youtube Music Auto-generated description
2019-04-28 12:37:46 -04:00
release_date = release_year = None
2019-04-27 04:16:17 -04:00
if video_description :
mobj = re . search ( r ' (?s)Provided to YouTube by [^ \ n]+ \ n+(?P<track>[^·]+)·(?P<artist>[^ \ n]+) \ n+(?P<album>[^ \ n]+)(?:.+?℗ \ s*(?P<release_year> \ d {4} )(?! \ d))?(?:.+?Released on \ s*: \ s*(?P<release_date> \ d {4} - \ d {2} - \ d {2} ))?(.+? \ nArtist \ s*: \ s*(?P<clean_artist>[^ \ n]+))? ' , video_description )
if mobj :
if not track :
track = mobj . group ( ' track ' ) . strip ( )
if not artist :
artist = mobj . group ( ' clean_artist ' ) or ' , ' . join ( a . strip ( ) for a in mobj . group ( ' artist ' ) . split ( ' · ' ) )
2019-04-28 12:37:46 -04:00
if not album :
album = mobj . group ( ' album ' . strip ( ) )
2019-04-27 04:16:17 -04:00
release_year = mobj . group ( ' release_year ' )
release_date = mobj . group ( ' release_date ' )
if release_date :
release_date = release_date . replace ( ' - ' , ' ' )
if not release_year :
release_year = int ( release_date [ : 4 ] )
if release_year :
release_year = int ( release_year )
2018-06-02 15:08:38 -04:00
2020-10-29 17:37:06 -04:00
yt_initial = self . _get_yt_initial_data ( video_id , video_webpage )
if yt_initial :
music_metadata = self . _get_music_metadata_from_yt_initial ( yt_initial )
if len ( music_metadata ) :
album = music_metadata [ 0 ] . get ( ' album ' )
artist = music_metadata [ 0 ] . get ( ' artist ' )
track = music_metadata [ 0 ] . get ( ' track ' )
2018-06-02 15:08:38 -04:00
m_episode = re . search (
r ' <div[^>]+id= " watch7-headline " [^>]*> \ s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b> \ s*S(?P<season> \ d+) \ s*• \ s*E(?P<episode> \ d+)</span> ' ,
video_webpage )
if m_episode :
2018-12-31 07:57:01 -05:00
series = unescapeHTML ( m_episode . group ( ' series ' ) )
2018-06-02 15:08:38 -04:00
season_number = int ( m_episode . group ( ' season ' ) )
episode_number = int ( m_episode . group ( ' episode ' ) )
else :
series = season_number = episode_number = None
m_cat_container = self . _search_regex (
r ' (?s)<h4[^>]*> \ s*Category \ s*</h4> \ s*<ul[^>]*>(.*?)</ul> ' ,
video_webpage , ' categories ' , default = None )
2020-06-15 16:13:39 -04:00
category = None
2018-06-02 15:08:38 -04:00
if m_cat_container :
category = self . _html_search_regex (
r ' (?s)<a[^<]+>(.*?)</a> ' , m_cat_container , ' category ' ,
default = None )
2020-06-15 16:13:39 -04:00
if not category :
category = try_get (
microformat , lambda x : x [ ' category ' ] , compat_str )
video_categories = None if category is None else [ category ]
2018-06-02 15:08:38 -04:00
video_tags = [
unescapeHTML ( m . group ( ' content ' ) )
for m in re . finditer ( self . _meta_regex ( ' og:video:tag ' ) , video_webpage ) ]
2020-06-15 16:13:39 -04:00
if not video_tags :
video_tags = try_get ( video_details , lambda x : x [ ' keywords ' ] , list )
2018-06-02 15:08:38 -04:00
def _extract_count ( count_name ) :
# Find an "accessibilityData" label in video_webpage whose text is a
# count (digits, commas, word characters) followed by count_name plus a
# trailing "s", and convert the captured count with str_to_int.
# count_name is passed through re.escape before being interpolated, and
# default = None keeps a missing label from being fatal.
# NOTE(review): the capture group also admits word characters, so the
# result depends on how str_to_int treats such input - confirm.
return str_to_int ( self . _search_regex (
2020-09-15 11:33:44 -04:00
r ' " accessibilityData " : \ { " label " : " ([ \ d, \ w]+) %s s " \ } '
2018-06-02 15:08:38 -04:00
% re . escape ( count_name ) ,
video_webpage , count_name , default = None ) )
like_count = _extract_count ( ' like ' )
dislike_count = _extract_count ( ' dislike ' )
2018-11-02 19:26:16 -04:00
if view_count is None :
view_count = str_to_int ( self . _search_regex (
r ' <[^>]+class=[ " \' ]watch-view-count[^>]+> \ s*([ \ d, \ s]+) ' , video_webpage ,
' view count ' , default = None ) )
2019-06-07 12:13:33 -04:00
average_rating = (
float_or_none ( video_details . get ( ' averageRating ' ) )
or try_get ( video_info , lambda x : float_or_none ( x [ ' avg_rating ' ] [ 0 ] ) ) )
2018-06-02 15:08:38 -04:00
# subtitles
2020-08-04 20:30:10 -04:00
video_subtitles = self . extract_subtitles (
video_id , video_webpage , has_live_chat_replay )
2018-06-02 15:08:38 -04:00
automatic_captions = self . extract_automatic_captions ( video_id , video_webpage )
video_duration = try_get (
video_info , lambda x : int_or_none ( x [ ' length_seconds ' ] [ 0 ] ) )
2018-11-02 19:26:16 -04:00
if not video_duration :
video_duration = int_or_none ( video_details . get ( ' lengthSeconds ' ) )
2018-06-02 15:08:38 -04:00
if not video_duration :
video_duration = parse_duration ( self . _html_search_meta (
' duration ' , video_webpage , ' video duration ' ) )
2020-09-15 11:16:58 -04:00
# Get Subscriber Count of channel
subscriber_count = parse_count ( self . _search_regex (
r ' " text " : " ([ \ d \ .]+ \ w?) subscribers " ' ,
video_webpage ,
' subscriber count ' ,
default = None
) )
2018-06-02 15:08:38 -04:00
# annotations
video_annotations = None
if self . _downloader . params . get ( ' writeannotations ' , False ) :
2019-08-09 03:16:53 -04:00
xsrf_token = self . _search_regex (
r ' ([ \' " ])XSRF_TOKEN \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_token>[A-Za-z0-9+/=]+) \ 2 ' ,
video_webpage , ' xsrf token ' , group = ' xsrf_token ' , fatal = False )
invideo_url = try_get (
player_response , lambda x : x [ ' annotations ' ] [ 0 ] [ ' playerAnnotationsUrlsRenderer ' ] [ ' invideoUrl ' ] , compat_str )
if xsrf_token and invideo_url :
xsrf_field_name = self . _search_regex (
r ' ([ \' " ])XSRF_FIELD_NAME \ 1 \ s*: \ s*([ \' " ])(?P<xsrf_field_name> \ w+) \ 2 ' ,
video_webpage , ' xsrf field name ' ,
group = ' xsrf_field_name ' , default = ' session_token ' )
video_annotations = self . _download_webpage (
self . _proto_relative_url ( invideo_url ) ,
video_id , note = ' Downloading annotations ' ,
errnote = ' Unable to download video annotations ' , fatal = False ,
data = urlencode_postdata ( { xsrf_field_name : xsrf_token } ) )
2018-06-02 15:08:38 -04:00
2020-06-05 17:16:31 -04:00
chapters = self . _extract_chapters ( video_webpage , description_original , video_id , video_duration )
2018-06-02 15:08:38 -04:00
2014-01-18 23:47:20 -05:00
# Look for the DASH manifest
2014-10-12 18:03:08 -04:00
if self . _downloader . params . get ( ' youtube_include_dash_manifest ' , True ) :
2015-07-09 10:48:38 -04:00
dash_mpd_fatal = True
2016-02-02 12:10:23 -05:00
for mpd_url in dash_mpds :
2015-06-26 14:36:23 -04:00
dash_formats = { }
2014-12-10 07:21:24 -05:00
try :
2016-01-30 07:05:56 -05:00
def decrypt_sig ( mobj ) :
# Replacement callback for the re.sub over mpd_url that follows: group 1
# is the value captured after "/s/" in the manifest URL.
s = mobj . group ( 1 )
# video_id, player_url and age_gate come from the enclosing scope.
dec_s = self . _decrypt_signature ( s , video_id , player_url , age_gate )
# The matched "/s/<value>" segment is replaced by a "/signature/" segment
# carrying the value returned by _decrypt_signature.
return ' /signature/ %s ' % dec_s
2016-02-02 12:10:23 -05:00
mpd_url = re . sub ( r ' /s/([a-fA-F0-9 \ .]+) ' , decrypt_sig , mpd_url )
2016-01-30 09:52:23 -05:00
2016-02-02 12:10:23 -05:00
for df in self . _extract_mpd_formats (
mpd_url , video_id , fatal = dash_mpd_fatal ,
formats_dict = self . _formats ) :
2018-05-14 12:27:56 -04:00
if not df . get ( ' filesize ' ) :
df [ ' filesize ' ] = _extract_filesize ( df [ ' url ' ] )
2015-06-26 14:36:23 -04:00
# Do not overwrite DASH format found in some previous DASH manifest
if df [ ' format_id ' ] not in dash_formats :
dash_formats [ df [ ' format_id ' ] ] = df
2015-07-09 10:48:38 -04:00
# Additional DASH manifests may end up in HTTP Error 403, therefore
# allow them to fail without a bug report message once at least one
# DASH manifest has succeeded. This is a temporary workaround to reduce
# the burst of bug reports until we figure out the reason and whether it
# can be fixed at all.
dash_mpd_fatal = False
2014-12-10 07:21:24 -05:00
except ( ExtractorError , KeyError ) as e :
self . report_warning (
' Skipping DASH manifest: %r ' % e , video_id )
2015-06-26 14:36:23 -04:00
if dash_formats :
2015-05-22 05:58:52 -04:00
# Remove the formats we found through non-DASH, they
# contain less info and it can be wrong, because we use
# fixed values (for example the resolution). See
2019-03-09 07:14:41 -05:00
# https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2015-05-22 05:58:52 -04:00
# example.
2015-06-26 16:48:50 -04:00
formats = [ f for f in formats if f [ ' format_id ' ] not in dash_formats . keys ( ) ]
2015-06-26 14:36:23 -04:00
formats . extend ( dash_formats . values ( ) )
2013-12-22 22:51:42 -05:00
2015-01-09 23:45:51 -05:00
# Check for malformed aspect ratio
stretched_m = re . search (
r ' <meta \ s+property= " og:video:tag " .*?content= " yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+) " > ' ,
video_webpage )
if stretched_m :
2015-11-27 19:07:07 -05:00
w = float ( stretched_m . group ( ' w ' ) )
h = float ( stretched_m . group ( ' h ' ) )
2015-11-28 07:50:21 -05:00
# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
# We will only process correct ratios.
2015-11-27 19:07:07 -05:00
if w > 0 and h > 0 :
2015-11-28 02:16:46 -05:00
ratio = w / h
2015-11-27 19:07:07 -05:00
for f in formats :
if f . get ( ' vcodec ' ) != ' none ' :
f [ ' stretched_ratio ' ] = ratio
2015-01-09 23:45:51 -05:00
2019-04-29 17:32:55 -04:00
if not formats :
2020-03-07 20:34:17 -05:00
if ' reason ' in video_info :
if ' The uploader has not made this video available in your country. ' in video_info [ ' reason ' ] :
regions_allowed = self . _html_search_meta (
' regionsAllowed ' , video_webpage , default = None )
countries = regions_allowed . split ( ' , ' ) if regions_allowed else None
self . raise_geo_restricted (
msg = video_info [ ' reason ' ] [ 0 ] , countries = countries )
reason = video_info [ ' reason ' ] [ 0 ]
if ' Invalid parameters ' in reason :
unavailable_message = extract_unavailable_message ( )
if unavailable_message :
reason = unavailable_message
raise ExtractorError (
' YouTube said: %s ' % reason ,
expected = True , video_id = video_id )
if video_info . get ( ' license_info ' ) or try_get ( player_response , lambda x : x [ ' streamingData ' ] [ ' licenseInfos ' ] ) :
raise ExtractorError ( ' This video is DRM protected. ' , expected = True )
2019-05-25 18:14:47 -04:00
2013-12-24 06:25:22 -05:00
self . _sort_formats ( formats )
2013-12-17 21:30:55 -05:00
2018-12-16 07:35:48 -05:00
self . mark_watched ( video_id , video_info , player_response )
2016-02-29 14:01:33 -05:00
2013-12-17 21:30:55 -05:00
return {
2014-11-23 15:20:46 -05:00
' id ' : video_id ,
' uploader ' : video_uploader ,
' uploader_id ' : video_uploader_id ,
2016-03-02 12:49:10 -05:00
' uploader_url ' : video_uploader_url ,
2018-09-14 14:24:26 -04:00
' channel_id ' : channel_id ,
' channel_url ' : channel_url ,
2014-11-23 15:20:46 -05:00
' upload_date ' : upload_date ,
2016-03-02 12:07:25 -05:00
' license ' : video_license ,
2018-06-02 15:05:14 -04:00
' creator ' : video_creator or artist ,
2014-11-23 15:20:46 -05:00
' title ' : video_title ,
2018-06-02 15:05:14 -04:00
' alt_title ' : video_alt_title or track ,
2020-06-15 15:29:04 -04:00
' thumbnails ' : thumbnails ,
2014-11-23 15:20:46 -05:00
' description ' : video_description ,
' categories ' : video_categories ,
2015-07-28 17:43:32 -04:00
' tags ' : video_tags ,
2014-11-23 15:20:46 -05:00
' subtitles ' : video_subtitles ,
2015-02-16 15:44:17 -05:00
' automatic_captions ' : automatic_captions ,
2014-11-23 15:20:46 -05:00
' duration ' : video_duration ,
' age_limit ' : 18 if age_gate else 0 ,
' annotations ' : video_annotations ,
2017-05-05 15:27:06 -04:00
' chapters ' : chapters ,
2014-03-20 19:33:53 -04:00
' webpage_url ' : proto + ' ://www.youtube.com/watch?v= %s ' % video_id ,
2014-11-23 15:20:46 -05:00
' view_count ' : view_count ,
2013-12-17 21:30:55 -05:00
' like_count ' : like_count ,
' dislike_count ' : dislike_count ,
2019-06-07 12:13:33 -04:00
' average_rating ' : average_rating ,
2014-11-23 15:20:46 -05:00
' formats ' : formats ,
2015-07-20 14:14:20 -04:00
' is_live ' : is_live ,
2015-07-20 15:10:28 -04:00
' start_time ' : start_time ,
2015-07-23 07:20:21 -04:00
' end_time ' : end_time ,
2017-01-21 06:10:32 -05:00
' series ' : series ,
' season_number ' : season_number ,
' episode_number ' : episode_number ,
2018-06-02 15:05:14 -04:00
' track ' : track ,
' artist ' : artist ,
2019-04-22 00:26:48 -04:00
' album ' : album ,
' release_date ' : release_date ,
' release_year ' : release_year ,
2020-09-15 11:16:58 -04:00
' subscriber_count ' : subscriber_count ,
2013-12-17 21:30:55 -05:00
}
2013-06-23 13:58:33 -04:00
2014-11-23 14:41:03 -05:00
2016-01-31 06:49:59 -05:00
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists and mixes, given a URL or a bare playlist id."""

    IE_DESC = 'YouTube.com playlists'
    IE_NAME = 'youtube:playlist'
    # Group 1 captures the playlist id found inside a URL, group 2 a bare
    # playlist id passed instead of a URL.
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # NOTE(review): the query separator after the video id is a bare '&' here;
    # if the scraped markup HTML-escapes it this will not match - confirm.
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def extract_videos_from_page(self, page):
        """Return paired (video id, title) values found in a playlist page.

        Tags carrying a ``data-video-id`` attribute are read first; the
        regex-based fallbacks then add to the same two lists.
        """
        found_ids, found_titles = [], []

        tagged_items = re.findall(
            r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page)
        for tag in tagged_items:
            tag_attrs = extract_attributes(tag)
            found_ids.append(tag_attrs['data-video-id'])
            title = unescapeHTML(tag_attrs.get('data-title'))
            # A missing title stays as is; only real titles are trimmed
            found_titles.append(title.strip() if title else title)

        fallback_patterns = (
            # Fallback with old _VIDEO_RE
            self._VIDEO_RE,
            # Relaxed fallbacks
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
        )
        for pattern in fallback_patterns:
            self.extract_videos_from_page_impl(
                pattern, page, found_ids, found_titles)

        return zip(found_ids, found_titles)

    def _extract_mix(self, playlist_id):
        """Collect the videos of a mix by walking its watch pages.

        The mixes are generated from a single video, the id of the playlist
        is just 'RD' + video_id. Every request asks for the watch page of
        the last id collected so far (initially the trailing 11 characters
        of the playlist id) and the walk stops once a page yields no id that
        has not been seen already.
        """
        # NOTE(review): the separator after the video id is a bare '&' here;
        # if the scraped markup HTML-escapes it this will not match - confirm.
        mix_link_re = r'''(?xs)data-video-username=".*?".*?
                          href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)

        collected_ids = []
        seed_id = playlist_id[-11:]
        for page_num in itertools.count(1):
            watch_url = 'https://www.youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
            webpage = self._download_webpage(
                watch_url, playlist_id,
                'Downloading page {0} of Youtube mix'.format(page_num))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            fresh_ids = [
                candidate
                for candidate in orderedSet(re.findall(mix_link_re, webpage))
                if candidate not in collected_ids]
            if not fresh_ids:
                break
            collected_ids.extend(fresh_ids)
            seed_id = collected_ids[-1]

        entries = self._ids_to_results(collected_ids)

        # The title is looked up on the last page downloaded, trying the
        # known class names in order and keeping the first non-empty hit
        title_html = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_html = get_element_by_attribute('class', class_name, webpage)
            if title_html:
                break

        return self.playlist_result(entries, playlist_id, clean_html(title_html))

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and build the playlist result.

        Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
        only when the page has no playlist title and yields no entries.
        Raises ExtractorError for the alerts that say the playlist does not
        exist, is private, or that the parameters are invalid.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            alert = alert.strip()
            # Check if the playlist exists or is private
            unavailable = re.match(
                r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
            if unavailable:
                reason = unavailable.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                raise ExtractorError(message + '.', expected=True)
            if re.match(r'[^<]*Invalid parameters[^<]*', alert):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            # The language chooser is not an error; anything else is reported
            if not re.match(r'[^<]*Choose your language[^<]*', alert):
                self.report_warning('Youtube gives an alert message: ' + alert)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        uploader_link = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        uploader_id = uploader_url = None
        if uploader_link:
            uploader_id = uploader_link.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

        has_videos = True
        if not playlist_title:
            # Some playlist URLs don't actually serve a playlist (e.g.
            # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
            try:
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist['uploader'] = uploader
        playlist['uploader_id'] = uploader_id
        playlist['uploader_url'] = uploader_url

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Decide whether the URL should be handled as a single video.

        Returns ``(video_id, result)``: ``video_id`` is the video id carried
        by the URL (or None) and ``result`` is a url_result for that video
        when the 'noplaylist' parameter is set, otherwise None.
        """
        # Check if it's a video-specific URL
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if not video_id:
            return None, None

        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        """Extract a playlist or mix, falling back to the single video if needed."""
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        video_id, single_video = self._check_download_just_video(url, playlist_id)
        if single_video:
            return single_video

        # Mixes require a custom extraction process
        if playlist_id.startswith(('RD', 'UL', 'PU')):
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if not has_videos and video_id:
            # Some playlist URLs don't actually serve a playlist (see
            # https://github.com/ytdl-org/youtube-dl/issues/10537).
            # Fallback to plain video extraction if there is a video id
            # along with playlist id.
            return self.url_result(video_id, 'Youtube', video_id=video_id)
        return playlist
2015-03-26 11:41:09 -04:00
2013-06-23 13:58:33 -04:00
2015-10-17 14:11:34 -04:00
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos of a YouTube channel page."""

    IE_DESC = 'YouTube.com channels'
    IE_NAME = 'youtube:channel'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    # NOTE(review): the trailing separator is a bare optional '&' here;
    # confirm against the markup actually served.
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs claimed by the playlists or live extractors."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the URL of the channel's videos page."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract the channel's videos, preferring its uploads playlist."""
        channel_id = self._match_id(url)
        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        probe_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = False
        if probe_page is not False:
            channel_playlist_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not channel_playlist_id:
                # No channelId meta tag: try the app deep-link meta tags
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    probe_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' and hand off to the playlist extractor
            uploads_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for a first entry; if there is none, surface any alert
        # message shown on the page instead of returning silently
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
2013-06-23 13:58:33 -04:00
2015-04-21 12:36:41 -04:00
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a user's videos, given a user/custom URL or the ytuser: keyword."""

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    IE_NAME = 'youtube:user'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept the URL only when no other Youtube*IE class in this module does."""
        # This extractor's regex is very permissive and would also match
        # URLs that belong to the other youtube extractors, so defer to any
        # of them that claims the URL.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos page URL, keeping the 'user' or 'c' path kind."""
        url_match = re.match(self._VALID_URL, url)
        # URLs without an explicit kind (and ytuser: keywords) use 'user'
        path_kind = url_match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, url_match.group('id'))
2013-06-23 14:28:15 -04:00
2016-03-24 15:18:14 -04:00
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for the /live page of a user, channel or custom URL."""

    IE_DESC = 'YouTube.com live streams'
    IE_NAME = 'youtube:live'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to the video it shows, else to the base URL.

        The video is returned only when the page's og:type starts with
        'video' and the 'videoId' meta value is an 11-character id; in every
        other case (including a failed download) the URL without the
        trailing '/live' is delegated instead.
        """
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        base_url = url_match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())

        return self.url_result(base_url)
2015-12-19 20:48:16 -05:00
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the /playlists page of a user, channel or custom URL.

    Only configuration lives here; the extraction logic is inherited from
    the base class.
    """

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
2015-11-21 17:17:07 -05:00
2018-02-22 15:34:55 -05:00
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; overrides only the video link regex."""

    # Captures the 11-character video id of a /watch link and, when present
    # later in the same tag, its title attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for keyword searches (the 'ytsearch' search key)."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional value sent as the 'params' field of the search request;
    # subclasses set it to change the request
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent results for query.

        Pages are requested from the JSON search endpoint one after another,
        following the continuation token of each response. Iteration stops
        when n results have been yielded, when a page cannot be downloaded
        or has no result section, or when no continuation token is left.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first path is tried first, the second one (continuation
            # items) is the fallback
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Counts may be grouped with whitespace or commas (e.g.
                # "1,234,567 views"): whitespace is dropped first, then the
                # leading run of digits and commas is taken and the commas
                # removed, so the whole number is kept rather than only its
                # first group.
                view_count_digits = self._search_regex(
                    r'^(\d[\d,]*)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None)
                view_count = int_or_none(
                    view_count_digits.replace(',', '') if view_count_digits else None)
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            # The next request carries the continuation token
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
2013-07-01 11:59:28 -04:00
2014-03-03 21:32:28 -05:00
2013-11-02 22:40:48 -04:00
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of ``YoutubeSearchIE`` described as "newest videos first".

    Only class attributes are overridden; all behaviour is inherited.
    """
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # NOTE(review): presumably the URL-encoded "sort by upload date" filter
    # understood by the parent class - confirm against YoutubeSearchIE.
    _SEARCH_PARAMS = 'CAI%3D'
2013-07-01 11:59:28 -04:00
2014-03-03 21:32:28 -05:00
2018-02-22 15:34:55 -05:00
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extract the videos listed on a YouTube search results URL.

    Results are read from the ``ytInitialData`` JSON object embedded in the
    downloaded page: every dict that carries a ``videoId`` key is treated as
    one video entry.
    """
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    # Captures the JSON object assigned to ytInitialData in the page source.
    _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _find_videos_in_json(self, extracted):
        """Return every dict inside ``extracted`` that has a 'videoId' key.

        ``extracted`` is walked depth-first through lists and dicts. A dict
        that contains 'videoId' is collected as-is and not descended into.
        Scalars (None, strings, numbers) are ignored.
        """
        videos = []

        def _real_find(obj):
            # Only containers can hold renderers; every scalar falls through,
            # which also covers Python 2 ``unicode`` text that the former
            # ``isinstance(obj, str)`` guard never matched.
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return
                for value in obj.values():
                    _real_find(value)

        _real_find(extracted)
        return videos

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Append the (id, title) pairs found in ``page`` to the given lists.

        ``ids_in_page`` and ``titles_in_page`` are parallel lists mutated in
        place. An id that is already present is not added again; its stored
        title is only replaced when that title is empty and a non-empty one
        was found. Entries without a usable id or title are skipped.
        """
        search_response = self._parse_json(
            self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)

        for renderer in self._find_videos_in_json(search_response):
            # Restrict both values to text so that ``.strip()`` below cannot
            # be called on an unexpected JSON value (dict, list, number).
            video_id = try_get(renderer, lambda x: x['videoId'], compat_str)
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
                or try_get(renderer, lambda x: x['title']['simpleText'], compat_str))
            if not video_id or video_title is None:
                # we do not have a videoRenderer or title extraction broke
                continue

            video_title = video_title.strip()
            try:
                idx = ids_in_page.index(video_id)
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs found in ``page``."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Download the search URL and return its results as a playlist.

        The playlist title is the URL-decoded search query; the entries come
        from ``self._process_page``, which is defined outside this class.
        """
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(self._process_page(webpage), playlist_title=query)
2014-03-03 21:32:28 -05:00
2015-11-21 17:18:20 -05:00
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Handle ``/show/<id>`` URLs by delegating to the show's playlists page."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the parent extraction on ``/show/<id>/playlists``."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
2013-07-07 07:58:23 -04:00
2013-07-24 14:40:12 -04:00
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.

    The feed page at ``/feed/<_FEED_NAME>`` is downloaded after logging in;
    videos are read from its embedded ``ytInitialData`` JSON and further
    pages are requested from ``/browse_ajax`` while a continuation is found.
    """
    _LOGIN_REQUIRED = True
    # Captures the JSON object assigned to ytInitialData in the page source.
    _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # Captures the JSON object passed to ytcfg.set(...) in the page source.
    _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    @staticmethod
    def _without_missing(mapping):
        """Return a copy of ``mapping`` without None values, the rest as text.

        Used for request headers and query parameters: a missing config value
        must be left out instead of being sent as None / the text "None".
        """
        return dict(
            (key, compat_str(value))
            for key, value in mapping.items() if value is not None)

    def _find_videos_in_json(self, extracted):
        """Walk parsed JSON and return ``(videos, continuation)``.

        ``videos`` lists every dict that has a 'videoId' key (such dicts are
        not descended into). ``continuation`` is the value stored under the
        last 'nextContinuationData' key met during the walk, or None when
        there is none.
        """
        videos = []
        # Mutable holder so the nested function can record the value without
        # ``nonlocal`` (the module uses Python 2 compatibility helpers).
        found = {}

        def _real_find(obj):
            # Scalars (None, text, numbers) cannot contain renderers.
            if isinstance(obj, list):
                for elem in obj:
                    _real_find(elem)
            elif isinstance(obj, dict):
                if 'videoId' in obj:
                    videos.append(obj)
                    return
                if 'nextContinuationData' in obj:
                    found['continuation'] = obj['nextContinuationData']
                    return
                for value in obj.values():
                    _real_find(value)

        _real_find(extracted)
        return videos, found.get('continuation')

    def _entries(self, page):
        """Yield one url_result per distinct video of the feed.

        Starts from the JSON embedded in ``page`` and keeps requesting
        continuation pages until a response brings no new video, no
        continuation token is available, or the ytcfg data is missing.
        """
        yt_conf = self._parse_json(
            self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"),
            None, fatal=False)
        search_response = self._parse_json(
            self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)

        # Ids already yielded; a set keeps the duplicate check O(1) per video
        # and also filters duplicates inside a single response.
        seen_ids = set()
        for page_num in itertools.count(1):
            video_info, continuation = self._find_videos_in_json(search_response)

            new_info = []
            for v in video_info:
                v_id = try_get(v, lambda x: x['videoId'], compat_str)
                if not v_id or v_id in seen_ids:
                    continue
                seen_ids.add(v_id)
                new_info.append(v)
            if not new_info:
                # Nothing new on this page: the feed is exhausted (or loops).
                break

            for video in new_info:
                yield self.url_result(
                    try_get(video, lambda x: x['videoId'], compat_str),
                    YoutubeIE.ie_key(),
                    video_title=(
                        try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                        or try_get(video, lambda x: x['title']['simpleText'], compat_str)))

            if not continuation or not yt_conf:
                break
            token = try_get(continuation, lambda x: x['continuation'])
            if not token:
                # Without a token the follow-up request cannot succeed.
                break

            search_response = self._download_json(
                'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                query=self._without_missing({
                    'ctoken': token,
                    'continuation': token,
                    'itct': try_get(continuation, lambda x: x['clickTrackingParams']),
                }),
                headers=self._without_missing({
                    'X-YouTube-Client-Name': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME']),
                    'X-YouTube-Client-Version': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT_CLIENT_VERSION']),
                    'X-Youtube-Identity-Token': try_get(yt_conf, lambda x: x['ID_TOKEN']),
                    'X-YouTube-Device': try_get(yt_conf, lambda x: x['DEVICE']),
                    'X-YouTube-Page-CL': try_get(yt_conf, lambda x: x['PAGE_CL']),
                    'X-YouTube-Page-Label': try_get(yt_conf, lambda x: x['PAGE_BUILD_LABEL']),
                    'X-YouTube-Variants-Checksum': try_get(yt_conf, lambda x: x['VARIANTS_CHECKSUM']),
                }))

    def _real_extract(self, url):
        """Download the feed page and return its videos as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
2015-05-15 11:06:59 -04:00
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the 'WL' (watch later) playlist and its URL aliases."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when one is selected, else the WL playlist.

        ``_check_download_just_video`` decides whether only one video is
        wanted; when it yields nothing truthy the whole 'WL' playlist is
        extracted instead.
        """
        _ignored, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _ignored, whole_list = self._extract_playlist('WL')
            return whole_list
        return single_video
2013-11-24 08:33:50 -05:00
2014-11-23 14:41:03 -05:00
2013-07-24 14:45:19 -04:00
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist id it links to."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch /my_favorites and hand the first ``list=`` id to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
2013-10-07 06:21:24 -04:00
2015-05-15 11:06:59 -04:00
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
2014-08-31 17:44:43 -04:00
2015-05-15 11:06:59 -04:00
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
2014-08-31 17:44:43 -04:00
2015-05-15 11:06:59 -04:00
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
2014-08-31 17:44:43 -04:00
2013-10-07 06:21:24 -04:00
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack a video id and explain why.

    Such URLs typically result from an unquoted ``&`` on the command line;
    extraction always fails with an explanatory, expected error.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: a watch URL whose query is empty or holds exactly one
    # of the listed non-id parameters, or an attribution_link with only "a=".
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)
2015-01-01 17:44:39 -05:00
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose ``v`` id has at most 10 characters and reject them."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        complaint = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(complaint, expected=True)