[cleanup] Misc fixes

Closes #4027
This commit is contained in:
pukkandan 2022-06-11 00:33:54 +05:30
parent d05460e5fe
commit 56ba69e4c9
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
13 changed files with 72 additions and 83 deletions

View file

@ -431,7 +431,7 @@ ##### Example
r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title') r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
``` ```
Or even better: which tolerates potential changes in the `style` attribute's value. Or even better:
```python ```python
title = self._search_regex( # correct title = self._search_regex( # correct
@ -439,7 +439,7 @@ ##### Example
webpage, 'title', group='title') webpage, 'title', group='title')
``` ```
Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: which also handles both single quotes in addition to double quotes.
The code definitely should not look like: The code definitely should not look like:

View file

@ -103,7 +103,7 @@ # NEW FEATURES
* **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md) * **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md)
* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN * **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN etc.
* **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details * **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details
@ -1710,7 +1710,7 @@ # EXTRACTOR ARGUMENTS
#### youtube #### youtube
* `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and auto-translated subtitles respectively * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and auto-translated subtitles respectively
* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but tv_embedded and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly)
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)

View file

@ -140,6 +140,9 @@ def run(self):
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: Implementation', 'Programming Language :: Python :: Implementation',
'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy', 'Programming Language :: Python :: Implementation :: PyPy',

View file

@ -2570,7 +2570,7 @@ def is_wellformed(f):
format['dynamic_range'] = 'SDR' format['dynamic_range'] = 'SDR'
if (info_dict.get('duration') and format.get('tbr') if (info_dict.get('duration') and format.get('tbr')
and not format.get('filesize') and not format.get('filesize_approx')): and not format.get('filesize') and not format.get('filesize_approx')):
format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
# Add HTTP headers, so that external programs can use them from the # Add HTTP headers, so that external programs can use them from the
# json output # json output
@ -3059,16 +3059,15 @@ def existing_video_file(*filepaths):
return file return file
success = True success = True
merger = FFmpegMergerPP(self) merger, fd = FFmpegMergerPP(self), None
if info_dict.get('url'):
fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
if fd is not FFmpegFD and ( if fd is not FFmpegFD and (
info_dict.get('section_start') or info_dict.get('section_end')): info_dict.get('section_start') or info_dict.get('section_end')):
msg = ('This format cannot be partially downloaded' if merger.available msg = ('This format cannot be partially downloaded' if merger.available
else 'You have requested downloading the video partially, but ffmpeg is not installed') else 'You have requested downloading the video partially, but ffmpeg is not installed')
if not self.params.get('ignoreerrors'): self.report_error(f'{msg}. Aborting')
self.report_error(f'{msg}. Aborting due to --abort-on-error')
return return
self.report_warning(f'{msg}. The entire video will be downloaded')
if info_dict.get('requested_formats') is not None: if info_dict.get('requested_formats') is not None:

View file

@ -337,14 +337,11 @@ def decrypt(self, encrypted_value):
def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None):
if sys.platform in ('linux', 'linux2'): if sys.platform == 'darwin':
return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
elif sys.platform == 'darwin':
return MacChromeCookieDecryptor(browser_keyring_name, logger) return MacChromeCookieDecryptor(browser_keyring_name, logger)
elif sys.platform == 'win32': elif sys.platform in ('win32', 'cygwin'):
return WindowsChromeCookieDecryptor(browser_root, logger) return WindowsChromeCookieDecryptor(browser_root, logger)
else: return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
raise NotImplementedError(f'Chrome cookie decryption is not supported on this platform: {sys.platform}')
class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):

View file

@ -1487,7 +1487,7 @@ def extract_video_object(e):
# however some websites are using 'Text' type instead. # however some websites are using 'Text' type instead.
# 1. https://schema.org/VideoObject # 1. https://schema.org/VideoObject
'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
'filesize': float_or_none(e.get('contentSize')), 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
'tbr': int_or_none(e.get('bitrate')), 'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')), 'width': int_or_none(e.get('width')),
'height': int_or_none(e.get('height')), 'height': int_or_none(e.get('height')),

View file

@ -534,7 +534,6 @@
) )
from .foxsports import FoxSportsIE from .foxsports import FoxSportsIE
from .fptplay import FptplayIE from .fptplay import FptplayIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE from .franceinter import FranceInterIE
from .francetv import ( from .francetv import (
FranceTVIE, FranceTVIE,
@ -1348,7 +1347,7 @@
from .radiode import RadioDeIE from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE from .radiofrance import FranceCultureIE, RadioFranceIE
from .radiozet import RadioZetPodcastIE from .radiozet import RadioZetPodcastIE
from .radiokapital import ( from .radiokapital import (
RadioKapitalIE, RadioKapitalIE,

View file

@ -1,46 +0,0 @@
from .common import InfoExtractor
from ..utils import int_or_none, parse_duration, unified_strdate
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
_TESTS = [
{
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
'info_dict': {
'id': '8440487',
'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
'upload_date': '20220514',
'duration': 2750,
},
},
]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
webpage = self._download_webpage(url, display_id)
# _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
return {
'id': video_id,
'display_id': display_id,
'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
webpage, 'title', default=self._og_search_title(webpage)),
'description': self._html_search_regex(
r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
}

View file

@ -2,11 +2,7 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import int_or_none, traverse_obj, urlencode_postdata
int_or_none,
traverse_obj,
urlencode_postdata,
)
class FreeTvBaseIE(InfoExtractor): class FreeTvBaseIE(InfoExtractor):

View file

@ -1,6 +1,7 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate
class RadioFranceIE(InfoExtractor): class RadioFranceIE(InfoExtractor):
@ -54,3 +55,47 @@ def _real_extract(self, url):
'description': description, 'description': description,
'uploader': uploader, 'uploader': uploader,
} }
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
_TESTS = [
{
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
'info_dict': {
'id': '8440487',
'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
'upload_date': '20220514',
'duration': 2750,
},
},
]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
webpage = self._download_webpage(url, display_id)
# _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
return {
'id': video_id,
'display_id': display_id,
'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
webpage, 'title', default=self._og_search_title(webpage)),
'description': self._html_search_regex(
r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
}

View file

@ -3674,8 +3674,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
initial_data = None initial_data = None
if webpage: if webpage:
initial_data = self._search_json( initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', video_id, fatal=False)
if not initial_data: if not initial_data:
query = {'videoId': video_id} query = {'videoId': video_id}
query.update(self._get_checkok_params()) query.update(self._get_checkok_params())

View file

@ -45,9 +45,6 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
an initial argument and then with the returned value of the previous an initial argument and then with the returned value of the previous
PostProcessor. PostProcessor.
The chain will be stopped if one of them ever returns None or the end
of the chain is reached.
PostProcessor objects follow a "mutual registration" process similar PostProcessor objects follow a "mutual registration" process similar
to InfoExtractor objects. to InfoExtractor objects.

View file

@ -3498,13 +3498,13 @@ def _match_func(info_dict, incomplete=False):
def download_range_func(chapters, ranges): def download_range_func(chapters, ranges):
def inner(info_dict, ydl): def inner(info_dict, ydl):
warning = ('There are no chapters matching the regex' if info_dict.get('chapters') warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
else 'Chapter information is unavailable') else 'Cannot match chapters since chapter information is unavailable')
for regex in chapters or []: for regex in chapters or []:
for i, chapter in enumerate(info_dict.get('chapters') or []): for i, chapter in enumerate(info_dict.get('chapters') or []):
if re.search(regex, chapter['title']): if re.search(regex, chapter['title']):
warning = None warning = None
yield {**chapter, 'index': i} yield {**chapter, 'index': i}
if warning: if chapters and warning:
ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
yield from ({'start_time': start, 'end_time': end} for start, end in ranges or []) yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
@ -4903,9 +4903,9 @@ def to_high_limit_path(path):
return path return path
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None): def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
val = traverse_obj(obj, *variadic(field)) val = traverse_obj(obj, *variadic(field))
if val in ignore: if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
return default return default
return template % (func(val) if func else val) return template % (func(val) if func else val)