[utils] is_html: Handle double BOM

Closes #2885
This commit is contained in:
pukkandan 2022-05-18 06:42:43 +05:30
parent aedaa455d9
commit 80e8493ee7
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
2 changed files with 21 additions and 7 deletions

View file

@@ -2527,6 +2527,21 @@ class GenericIE(InfoExtractor):
'upload_date': '20220504', 'upload_date': '20220504',
}, },
}, },
{
# Webpage contains double BOM
'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
'md5': 'df02cadc719dcc63d43288366f037754',
'info_dict': {
'id': 'paris-d-moll',
'ext': 'mp4',
'upload_date': '20220518',
'title': 'Paris d-moll',
'description': 'md5:319e37ea5542293db37e1e13072fe330',
'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg',
'timestamp': 1652833414,
'age_limit': 0,
}
}
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):

View file

@@ -3290,14 +3290,13 @@ def is_html(first_bytes):
(b'\xff\xfe', 'utf-16-le'), (b'\xff\xfe', 'utf-16-le'),
(b'\xfe\xff', 'utf-16-be'), (b'\xfe\xff', 'utf-16-be'),
] ]
for bom, enc in BOMS:
if first_bytes.startswith(bom):
s = first_bytes[len(bom):].decode(enc, 'replace')
break
else:
s = first_bytes.decode('utf-8', 'replace')
return re.match(r'^\s*<', s)
encoding = 'utf-8'
for bom, enc in BOMS:
while first_bytes.startswith(bom):
encoding, first_bytes = enc, first_bytes[len(bom):]
return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
def determine_protocol(info_dict): def determine_protocol(info_dict):