[cleanup] Misc fixes

Closes #4027
2024-12-02 12:02:07 -05:00 · 2022-06-11 00:33:54 +05:30 · 2022-06-11 00:33:54 +05:30 · 56ba69e4c9
commit 56ba69e4c9
parent d05460e5fe
13 changed files with 72 additions and 83 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -431,7 +431,7 @@ ##### Example
    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
 ```
-Or even better:
+which tolerates potential changes in the `style` attribute's value. Or even better:
 ```python
 title = self._search_regex(  # correct
@ -439,7 +439,7 @@ ##### Example
    webpage, 'title', group='title')
 ```
-Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: 
+which also handles single quotes in addition to double quotes.
 The code definitely should not look like:
--- a/README.md
+++ b/README.md
@ -103,7 +103,7 @@ # NEW FEATURES
 * **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md)
-* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN
+* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN etc.
 * **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details
@ -1710,7 +1710,7 @@ # EXTRACTOR ARGUMENTS
 #### youtube
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and auto-translated subtitles respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but tv_embedded and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (age-gate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` URLs. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly)
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
--- a/setup.py
+++ b/setup.py
@ -140,6 +140,9 @@ def run(self):
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: Implementation',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -2570,7 +2570,7 @@ def is_wellformed(f):
                format['dynamic_range'] = 'SDR'
            if (info_dict.get('duration') and format.get('tbr')
                    and not format.get('filesize') and not format.get('filesize_approx')):
-                format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8)
+                format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
            # Add HTTP headers, so that external programs can use them from the
            # json output
@ -3059,16 +3059,15 @@ def existing_video_file(*filepaths):
                    return file
                success = True
-                merger = FFmpegMergerPP(self)
+                merger, fd = FFmpegMergerPP(self), None
                if info_dict.get('url'):
                    fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
                    if fd is not FFmpegFD and (
                            info_dict.get('section_start') or info_dict.get('section_end')):
                        msg = ('This format cannot be partially downloaded' if merger.available
                               else 'You have requested downloading the video partially, but ffmpeg is not installed')
-                    if not self.params.get('ignoreerrors'):
+                        self.report_error(f'{msg}. Aborting')
                        self.report_error(f'{msg}. Aborting due to --abort-on-error')
                        return
                    self.report_warning(f'{msg}. The entire video will be downloaded')
                if info_dict.get('requested_formats') is not None:
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@ -337,14 +337,11 @@ def decrypt(self, encrypted_value):
 def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None):
    """Return the Chrome cookie decryptor for the platform reported by `sys.platform`.

    @param browser_root          passed to the Windows decryptor only
    @param browser_keyring_name  passed to the macOS and Linux decryptors
    @param logger                passed to every decryptor
    @param keyring               keyword-only; passed to the Linux decryptor only
    @returns                     a MacChromeCookieDecryptor on 'darwin',
                                 a WindowsChromeCookieDecryptor on 'win32'/'cygwin',
                                 otherwise a LinuxChromeCookieDecryptor
    """
    if sys.platform == 'darwin':
        return MacChromeCookieDecryptor(browser_keyring_name, logger)
    if sys.platform in ('win32', 'cygwin'):
        return WindowsChromeCookieDecryptor(browser_root, logger)
    # Every remaining platform falls back to the Linux implementation
    # instead of raising NotImplementedError
    return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring)
 class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -1487,7 +1487,7 @@ def extract_video_object(e):
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
-                'filesize': float_or_none(e.get('contentSize')),
+                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -534,7 +534,6 @@
 )
 from .foxsports import FoxSportsIE
 from .fptplay import FptplayIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
    FranceTVIE,
@ -1348,7 +1347,7 @@
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import RadioFranceIE
+from .radiofrance import FranceCultureIE, RadioFranceIE
 from .radiozet import RadioZetPodcastIE
 from .radiokapital import (
    RadioKapitalIE,
--- a/yt_dlp/extractor/franceculture.py
+++ b/yt_dlp/extractor/franceculture.py
@ -1,46 +0,0 @@
 from .common import InfoExtractor
 from ..utils import int_or_none, parse_duration, unified_strdate
 class FranceCultureIE(InfoExtractor):
    """Extractor for podcast pages under radiofrance.fr/franceculture/podcasts/."""
    # `display_id` is the URL slug; `id` is the run of digits that ends the path
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
    ]
    def _real_extract(self, url):
        """Download the page at `url` and build the info dict from its AudioObject data and markup."""
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)
        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        # Empty start pattern; `contains_pattern` limits the match to JSON carrying an AudioObject "@type".
        # NOTE(review): exact matching semantics live in InfoExtractor._search_json - confirm there
        video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
        return {
            'id': video_id,
            'display_id': display_id,
            # Subscript (not .get): 'contentUrl' is the only field of video_data treated as mandatory
            'url': video_data['contentUrl'],
            'ext': video_data.get('encodingFormat'),
            # Flag mp3 as audio-only; any other encoding leaves vcodec unset
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            # The og:title fallback is an argument, so it is evaluated before the <h1> search runs
            'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                                             webpage, 'title', default=self._og_search_title(webpage)),
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
            # The regex reads "datePublished"; 'timestamp' is only the field label given to _search_regex
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
        }
--- a/yt_dlp/extractor/freetv.py
+++ b/yt_dlp/extractor/freetv.py
@ -2,11 +2,7 @@
 import re
 from .common import InfoExtractor
-from ..utils import (
+from ..utils import int_or_none, traverse_obj, urlencode_postdata
    int_or_none,
    traverse_obj,
    urlencode_postdata,
 )
 class FreeTvBaseIE(InfoExtractor):
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@ -1,6 +1,7 @@
 import re
 from .common import InfoExtractor
 from ..utils import parse_duration, unified_strdate
 class RadioFranceIE(InfoExtractor):
@ -54,3 +55,47 @@ def _real_extract(self, url):
            'description': description,
            'uploader': uploader,
        }
 class FranceCultureIE(InfoExtractor):
    """Extractor for podcast pages under radiofrance.fr/franceculture/podcasts/."""
    # `display_id` is the URL slug; `id` is the run of digits that ends the path
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
        'info_dict': {
            'id': '8440487',
            'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
            'ext': 'mp3',
            'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
            'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
            'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
            'upload_date': '20220514',
            'duration': 2750,
        },
    }]
    def _real_extract(self, url):
        """Download the page at `url` and build the info dict from its AudioObject data and markup."""
        url_match = self._match_valid_url(url)
        video_id = url_match.group('id')
        display_id = url_match.group('display_id')
        webpage = self._download_webpage(url, display_id)
        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        audio_object = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
        # Fields are gathered in the same order the result dict lists them
        media_url = audio_object['contentUrl']
        encoding_format = audio_object.get('encodingFormat')
        duration = parse_duration(audio_object.get('duration'))
        # og:title is looked up first because it serves as the default for the <h1> search
        og_title = self._og_search_title(webpage)
        title = self._html_search_regex(
            r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
            webpage, 'title', default=og_title)
        description = self._html_search_regex(
            r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None)
        date_published = self._search_regex(
            r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)
        return {
            'id': video_id,
            'display_id': display_id,
            'url': media_url,
            'ext': encoding_format,
            # Flag mp3 as audio-only; any other encoding leaves vcodec unset
            'vcodec': 'none' if encoding_format == 'mp3' else None,
            'duration': duration,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(date_published),
        }
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -3674,8 +3674,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
        initial_data = None
        if webpage:
-            initial_data = self._search_json(
+            initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
                self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', video_id, fatal=False)
        if not initial_data:
            query = {'videoId': video_id}
            query.update(self._get_checkok_params())
--- a/yt_dlp/postprocessor/common.py
+++ b/yt_dlp/postprocessor/common.py
@ -45,9 +45,6 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
    an initial argument and then with the returned value of the previous
    PostProcessor.
    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.
    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -3498,13 +3498,13 @@ def _match_func(info_dict, incomplete=False):
 def download_range_func(chapters, ranges):
    def inner(info_dict, ydl):
        """Yield the ranges to download for the video described by `info_dict`.

        First yields every chapter whose title matches one of the enclosing
        function's `chapters` regexes, as a copy of the chapter dict with its
        position added under 'index'. Then yields one
        {'start_time', 'end_time'} dict per pair in the enclosing `ranges`.

        @param info_dict  reads 'chapters' (optional) and, when reporting, 'id'
        @param ydl        object whose `to_screen` receives the notice
        """
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    # One match anywhere is enough to suppress the notice
                    warning = None
                    yield {**chapter, 'index': i}
        # Stay silent when no chapter regexes were requested in the first place
        if chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
        yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
@ -4903,9 +4903,9 @@ def to_high_limit_path(path):
    return path
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
    """Format one value looked up in `obj`, or return `default` if it is ignored.

    @param obj       container handed to traverse_obj
    @param field     path handed to traverse_obj (wrapped with variadic)
    @param template  %-style template applied to the value
    @param ignore    collection of values to treat as missing. When left at
                     NO_DEFAULT, every falsy value is ignored except those
                     that compare equal to 0 (so 0, 0.0 and False are kept)
    @param default   returned in place of an ignored value
    @param func      optional callable applied to the value before formatting
    @returns         `template % value`, or `default`
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        # Implicit rule: drop None, '', empty containers etc., but keep zeros
        skip = not val and val != 0
    else:
        skip = val in ignore
    if skip:
        return default
    return template % (func(val) if func else val)