mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-07 20:30:41 -05:00
[utils] Use bytes-like objects in dfxp2srt
This fixes handling of non-UTF8 TTML subtitles. Closes #14191.
This commit is contained in:
parent
68d43a61b5
commit
3869028ffb
4 changed files with 41 additions and 11 deletions
|
@ -1,3 +1,9 @@
|
||||||
|
version <unreleased>
|
||||||
|
|
||||||
|
Core
|
||||||
|
* [utils] Fix handling raw TTML subtitles (#14191)
|
||||||
|
|
||||||
|
|
||||||
version 2017.09.15
|
version 2017.09.15
|
||||||
|
|
||||||
Core
|
Core
|
||||||
|
|
|
@ -1064,7 +1064,7 @@ def test_dfxp2srt(self):
|
||||||
<p begin="3" dur="-1">Ignored, three</p>
|
<p begin="3" dur="-1">Ignored, three</p>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</tt>'''
|
</tt>'''.encode('utf-8')
|
||||||
srt_data = '''1
|
srt_data = '''1
|
||||||
00:00:00,000 --> 00:00:01,000
|
00:00:00,000 --> 00:00:01,000
|
||||||
The following line contains Chinese characters and special symbols
|
The following line contains Chinese characters and special symbols
|
||||||
|
@ -1089,7 +1089,7 @@ def test_dfxp2srt(self):
|
||||||
<p begin="0" end="1">The first line</p>
|
<p begin="0" end="1">The first line</p>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</tt>'''
|
</tt>'''.encode('utf-8')
|
||||||
srt_data = '''1
|
srt_data = '''1
|
||||||
00:00:00,000 --> 00:00:01,000
|
00:00:00,000 --> 00:00:01,000
|
||||||
The first line
|
The first line
|
||||||
|
@ -1115,7 +1115,7 @@ def test_dfxp2srt(self):
|
||||||
<p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
|
<p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
|
||||||
</div>
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</tt>'''
|
</tt>'''.encode('utf-8')
|
||||||
srt_data = '''1
|
srt_data = '''1
|
||||||
00:00:02,080 --> 00:00:05,839
|
00:00:02,080 --> 00:00:05,839
|
||||||
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
|
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
|
||||||
|
@ -1138,6 +1138,26 @@ def test_dfxp2srt(self):
|
||||||
'''
|
'''
|
||||||
self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
|
self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
|
||||||
|
|
||||||
|
dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?>
|
||||||
|
<tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
|
||||||
|
<body>
|
||||||
|
<div xml:lang="en">
|
||||||
|
<p begin="0" end="1">Line 1</p>
|
||||||
|
<p begin="1" end="2">第二行</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</tt>'''.encode('utf-16')
|
||||||
|
srt_data = '''1
|
||||||
|
00:00:00,000 --> 00:00:01,000
|
||||||
|
Line 1
|
||||||
|
|
||||||
|
2
|
||||||
|
00:00:01,000 --> 00:00:02,000
|
||||||
|
第二行
|
||||||
|
|
||||||
|
'''
|
||||||
|
self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data)
|
||||||
|
|
||||||
def test_cli_option(self):
|
def test_cli_option(self):
|
||||||
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
|
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
|
||||||
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
|
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
|
||||||
|
|
|
@ -585,7 +585,7 @@ def run(self, info):
|
||||||
dfxp_file = old_file
|
dfxp_file = old_file
|
||||||
srt_file = subtitles_filename(filename, lang, 'srt')
|
srt_file = subtitles_filename(filename, lang, 'srt')
|
||||||
|
|
||||||
with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
|
with open(dfxp_file, 'rb') as f:
|
||||||
srt_data = dfxp2srt(f.read())
|
srt_data = dfxp2srt(f.read())
|
||||||
|
|
||||||
with io.open(srt_file, 'wt', encoding='utf-8') as f:
|
with io.open(srt_file, 'wt', encoding='utf-8') as f:
|
||||||
|
|
|
@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
|
||||||
|
|
||||||
|
|
||||||
def dfxp2srt(dfxp_data):
|
def dfxp2srt(dfxp_data):
|
||||||
|
'''
|
||||||
|
@param dfxp_data A bytes-like object containing DFXP data
|
||||||
|
@returns A unicode object containing converted SRT data
|
||||||
|
'''
|
||||||
LEGACY_NAMESPACES = (
|
LEGACY_NAMESPACES = (
|
||||||
('http://www.w3.org/ns/ttml', [
|
(b'http://www.w3.org/ns/ttml', [
|
||||||
'http://www.w3.org/2004/11/ttaf1',
|
b'http://www.w3.org/2004/11/ttaf1',
|
||||||
'http://www.w3.org/2006/04/ttaf1',
|
b'http://www.w3.org/2006/04/ttaf1',
|
||||||
'http://www.w3.org/2006/10/ttaf1',
|
b'http://www.w3.org/2006/10/ttaf1',
|
||||||
]),
|
]),
|
||||||
('http://www.w3.org/ns/ttml#styling', [
|
(b'http://www.w3.org/ns/ttml#styling', [
|
||||||
'http://www.w3.org/ns/ttml#style',
|
b'http://www.w3.org/ns/ttml#style',
|
||||||
]),
|
]),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -2674,7 +2678,7 @@ def parse_node(node):
|
||||||
for ns in v:
|
for ns in v:
|
||||||
dfxp_data = dfxp_data.replace(ns, k)
|
dfxp_data = dfxp_data.replace(ns, k)
|
||||||
|
|
||||||
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
|
dfxp = compat_etree_fromstring(dfxp_data)
|
||||||
out = []
|
out = []
|
||||||
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
|
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue