From 0f60ba6e656516ec24d619d20d61249be6296105 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 27 Sep 2022 02:30:50 +0530 Subject: [PATCH] [extractor] Improve json+ld extraction Related #5035 --- yt_dlp/extractor/common.py | 11 +++++++++-- yt_dlp/extractor/generic.py | 2 +- yt_dlp/utils.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 87660bb23..d36f025ab 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1536,10 +1536,10 @@ def extract_chapter_information(e): info['chapters'] = chapters def extract_video_object(e): - assert is_type(e, 'VideoObject') author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), + 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'thumbnails': [{'url': unescapeHTML(url)} @@ -1552,12 +1552,19 @@ def extract_video_object(e): # however some websites are using 'Text' type instead. # 1. 
https://schema.org/VideoObject 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), + 'tags': try_call(lambda: e.get('keywords').split(',')), }) + if is_type(e, 'AudioObject'): + info.update({ + 'vcodec': 'none', + 'abr': int_or_none(e.get('bitrate')), + }) extract_interaction_statistic(e) extract_chapter_information(e) @@ -1608,7 +1615,7 @@ def traverse_json_ld(json_ld, at_top_level=True): extract_video_object(e['video'][0]) elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): extract_video_object(e['subjectOf'][0]) - elif is_type(e, 'VideoObject'): + elif is_type(e, 'VideoObject', 'AudioObject'): extract_video_object(e) if expected_type is None: continue diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 672034c6d..73aefc782 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2910,7 +2910,7 @@ def _real_extract(self, url): if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') return merge_dicts({ - '_type': 'url_transparent', + '_type': 'video' if json_ld.get('ext') else 'url_transparent', 'url': smuggle_url(json_ld['url'], { 'force_videoid': video_id, 'to_generic': True, diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d655bfdd0..724e34ef7 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -232,7 +232,7 @@ def random_user_agent(): ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>' +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>' NUMBER_RE = r'\d+(?:\.\d+)?'