From e81a47460365738a0add4d4da52a712c0091704f Mon Sep 17 00:00:00 2001 From: snipem Date: Fri, 3 Apr 2015 15:34:49 +0200 Subject: [PATCH 1/5] [Gamersyde] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/gamersyde.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 youtube_dl/extractor/gamersyde.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index aae4aae4c1..2935d5b33c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -177,6 +177,7 @@ GameOneIE, GameOnePlaylistIE, ) +from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gametrailers import GametrailersIE diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py new file mode 100644 index 0000000000..c401062163 --- /dev/null +++ b/youtube_dl/extractor/gamersyde.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re +import json +import time +from .common import InfoExtractor + + +class GamersydeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' + _TEST = { + 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', + 'md5': 'f38d400d32f19724570040d5ce3a505f', + 'info_dict': { + 'id': '34371', + 'ext': 'mp4', + 'title': 'Bloodborne - Birth of a hero', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _calculateDuration(self, durationString): + duration = time.strptime(durationString, "%M minutes %S seconds") + return duration.tm_min * 60 + duration.tm_sec + + def _fixJsonSyntax(self, json): + + json = re.sub(r"{\s*(\w)", r'{"\1', json) + json = re.sub(r",\s*(\w)", r',"\1', json) + json = re.sub(r"(\w): ", r'\1":', json) + json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) + json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) + + return json + + def _real_extract(self, url): + + video_id = 
self._search_regex(r'-(.*?)_[a-z]{2}.html$', url, 'video_id') + webpage = self._download_webpage(url, video_id) + + filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) + filesJson = self._fixJsonSyntax(filesJson) + + data = json.loads(filesJson) + playlist = data[0] + + formats = [] + + title = re.sub(r"[0-9]+ - ", "", playlist['title']) + + for playlistEntry in playlist['sources']: + format = { + 'url': playlistEntry['file'], + 'format_id': playlistEntry['label'] + } + + formats.append(format) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': playlist['image'] + } From 115c281672bd7479f87c48249f6a0186ac7d19cc Mon Sep 17 00:00:00 2001 From: snipem Date: Sat, 4 Apr 2015 12:31:48 +0200 Subject: [PATCH 2/5] [Gamersyde] Improved robustness, added duration and tests Fix for Json syntax is now less error prone for Json syntax inside of values. Extractor is now also using native Json handling. Added tests for several videos that were producing errors in the first place. 
--- youtube_dl/extractor/gamersyde.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index c401062163..5c68a68914 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -8,7 +8,6 @@ class GamersydeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' - _TEST = { 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { @@ -17,6 +16,11 @@ class GamersydeIE(InfoExtractor): 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', + 'info_dict': { + 'ext': 'mp4', } def _calculateDuration(self, durationString): @@ -27,7 +31,6 @@ def _fixJsonSyntax(self, json): json = re.sub(r"{\s*(\w)", r'{"\1', json) json = re.sub(r",\s*(\w)", r',"\1', json) - json = re.sub(r"(\w): ", r'\1":', json) json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) @@ -40,7 +43,6 @@ def _real_extract(self, url): filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) filesJson = self._fixJsonSyntax(filesJson) - data = json.loads(filesJson) playlist = data[0] From 3d24d997ae1f92686aa7edd0bfeed28353fbfb2e Mon Sep 17 00:00:00 2001 From: snipem Date: Sat, 4 Apr 2015 12:42:14 +0200 Subject: [PATCH 3/5] Fixed indentation of test cases Led to an error on a Linux machine --- youtube_dl/extractor/gamersyde.py | 45 ++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index 5c68a68914..cc6fa40371 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -1,39 +1,62 @@ # coding: utf-8 from __future__
import unicode_literals import re -import json import time + from .common import InfoExtractor class GamersydeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' + _TESTS = [{ 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { 'id': '34371', 'ext': 'mp4', + 'duration': 372, 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } - }, - { + }, { 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', + 'md5': '94bd7c3feff3275576cf5cb6c8a3a720', 'info_dict': { + 'id': '34417', 'ext': 'mp4', + 'duration': 270, + 'title': 'Dark Souls II: Scholar of the First Sin - Gameplay - Part 1', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'url': 'http://www.gamersyde.com/hqstream_grand_theft_auto_v_heists_trailer-33786_en.html', + 'md5': '65e442f5f340d571ece8c80d50700369', + 'info_dict': { + 'id': '33786', + 'ext': 'mp4', + 'duration': 59, + 'title': 'Grand Theft Auto V - Heists Trailer', + 'thumbnail': 're:^https?://.*\.jpg$', + } } + ] def _calculateDuration(self, durationString): - duration = time.strptime(durationString, "%M minutes %S seconds") + if (durationString.find("minutes") > -1): + duration = time.strptime(durationString, "%M minutes %S seconds") + else: + duration = time.strptime(durationString, "%S seconds") return duration.tm_min * 60 + duration.tm_sec def _fixJsonSyntax(self, json): - json = re.sub(r"{\s*(\w)", r'{"\1', json) - json = re.sub(r",\s*(\w)", r',"\1', json) json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) - + json = json.replace('file: "', '"file": "') + json = json.replace('title: "', '"title": "') + json = json.replace('label: "', '"label": "') + json = json.replace('image: "', '"image": "') + json = json.replace('sources: [', '"sources": [') return json def _real_extract(self, url): @@ 
-42,13 +65,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) - filesJson = self._fixJsonSyntax(filesJson) - data = json.loads(filesJson) + data = self._parse_json(filesJson,video_id, transform_source=self._fixJsonSyntax) + playlist = data[0] formats = [] title = re.sub(r"[0-9]+ - ", "", playlist['title']) + + length = self._search_regex(r'(([0-9]{1,2} minutes ){0,1}[0-9]{1,2} seconds)', webpage, 'length') + duration = self._calculateDuration(length) for playlistEntry in playlist['sources']: format = { @@ -62,5 +88,6 @@ def _real_extract(self, url): 'id': video_id, 'title': title, 'formats': formats, + 'duration': duration, 'thumbnail': playlist['image'] } From ba9e68f40261355ceae5bb87c5707adc7f7beb2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 17:48:55 +0600 Subject: [PATCH 4/5] [utils] Drop trailing comma before closing brace --- test/test_utils.py | 6 ++++++ youtube_dl/utils.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index abaf1ab733..4e524aca3b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -470,6 +470,12 @@ def test_js_to_json_edgecases(self): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + on = js_to_json('["abc", "def",]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('{"abc": "def",}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 90e0ed9ab7..e1761265c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1577,7 +1577,7 @@ def fix_kv(m): '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| [a-zA-Z_][.a-zA-Z_0-9]* ''', fix_kv, code) - res = re.sub(r',(\s*\])', lambda m: m.group(1), res) + res = 
re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) return res From 5c29dbd0c76083eaf596f623fabb612575f71861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 17:53:22 +0600 Subject: [PATCH 5/5] [gamersyde] Simplify --- youtube_dl/extractor/gamersyde.py | 103 ++++++++++++------------------ 1 file changed, 40 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py index cc6fa40371..d545e01bb8 100644 --- a/youtube_dl/extractor/gamersyde.py +++ b/youtube_dl/extractor/gamersyde.py @@ -1,14 +1,18 @@ -# coding: utf-8 from __future__ import unicode_literals + import re -import time from .common import InfoExtractor +from ..utils import ( + js_to_json, + parse_duration, + remove_start, +) class GamersydeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html' + _TEST = { 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { @@ -18,76 +22,49 @@ class GamersydeIE(InfoExtractor): 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } - }, { - 'url': 'http://www.gamersyde.com/hqstream_dark_souls_ii_scholar_of_the_first_sin_gameplay_part_1-34417_en.html', - 'md5': '94bd7c3feff3275576cf5cb6c8a3a720', - 'info_dict': { - 'id': '34417', - 'ext': 'mp4', - 'duration': 270, - 'title': 'Dark Souls II: Scholar of the First Sin - Gameplay - Part 1', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.gamersyde.com/hqstream_grand_theft_auto_v_heists_trailer-33786_en.html', - 'md5': '65e442f5f340d571ece8c80d50700369', - 'info_dict': { - 'id': '33786', - 'ext': 'mp4', - 'duration': 59, - 'title': 'Grand Theft Auto V - Heists Trailer', - 'thumbnail': 're:^https?://.*\.jpg$', - } } - ] - - def _calculateDuration(self, durationString): -
if (durationString.find("minutes") > -1): - duration = time.strptime(durationString, "%M minutes %S seconds") - else: - duration = time.strptime(durationString, "%S seconds") - return duration.tm_min * 60 + duration.tm_sec - - def _fixJsonSyntax(self, json): - - json = re.sub(r",\s*}", "}", json, flags=re.DOTALL) - json = re.sub(r",\s*]", "]", json, flags=re.DOTALL) - json = json.replace('file: "', '"file": "') - json = json.replace('title: "', '"title": "') - json = json.replace('label: "', '"label": "') - json = json.replace('image: "', '"image": "') - json = json.replace('sources: [', '"sources": [') - return json def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') - video_id = self._search_regex(r'-(.*?)_[a-z]{2}.html$', url, 'video_id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - filesJson = self._search_regex(r'playlist: (.*?)\}\);', webpage, 'files', flags=re.DOTALL) - data = self._parse_json(filesJson,video_id, transform_source=self._fixJsonSyntax) - - playlist = data[0] + playlist = self._parse_json( + self._search_regex( + r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'), + display_id, transform_source=js_to_json) formats = [] - - title = re.sub(r"[0-9]+ - ", "", playlist['title']) - - length = self._search_regex(r'(([0-9]{1,2} minutes ){0,1}[0-9]{1,2} seconds)', webpage, 'length') - duration = self._calculateDuration(length) - - for playlistEntry in playlist['sources']: - format = { - 'url': playlistEntry['file'], - 'format_id': playlistEntry['label'] + for source in playlist['sources']: + video_url = source.get('file') + if not video_url: + continue + format_id = source.get('label') + f = { + 'url': video_url, + 'format_id': format_id, } + m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id) + if m: + f.update({ + 'height': int(m.group('height')), + 'fps': int(m.group('fps')), + }) + formats.append(f) +
self._sort_formats(formats) - formats.append(format) + title = remove_start(playlist['title'], '%s - ' % video_id) + thumbnail = playlist.get('image') + duration = parse_duration(self._search_regex( + r'Length:([^<]+)<', webpage, 'duration', fatal=False)) return { 'id': video_id, + 'display_id': display_id, 'title': title, - 'formats': formats, + 'thumbnail': thumbnail, 'duration': duration, - 'thumbnail': playlist['image'] - } + 'formats': formats, + }