mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-12-24 18:51:00 +00:00
[utils] Improve parsing for nested HTML elements (#2129)
and add functions to return the HTML of elements Authored by: zmousm
This commit is contained in:
parent
e8736539f3
commit
6f32a0b5b7
2 changed files with 216 additions and 28 deletions
|
@ -44,6 +44,12 @@
|
||||||
get_element_by_attribute,
|
get_element_by_attribute,
|
||||||
get_elements_by_class,
|
get_elements_by_class,
|
||||||
get_elements_by_attribute,
|
get_elements_by_attribute,
|
||||||
|
get_element_html_by_class,
|
||||||
|
get_element_html_by_attribute,
|
||||||
|
get_elements_html_by_class,
|
||||||
|
get_elements_html_by_attribute,
|
||||||
|
get_elements_text_and_html_by_attribute,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
InAdvancePagedList,
|
InAdvancePagedList,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
intlist_to_bytes,
|
intlist_to_bytes,
|
||||||
|
@ -118,6 +124,7 @@
|
||||||
compat_chr,
|
compat_chr,
|
||||||
compat_etree_fromstring,
|
compat_etree_fromstring,
|
||||||
compat_getenv,
|
compat_getenv,
|
||||||
|
compat_HTMLParseError,
|
||||||
compat_os_name,
|
compat_os_name,
|
||||||
compat_setenv,
|
compat_setenv,
|
||||||
)
|
)
|
||||||
|
@ -1575,46 +1582,116 @@ def test_urshift(self):
|
||||||
self.assertEqual(urshift(3, 1), 1)
|
self.assertEqual(urshift(3, 1), 1)
|
||||||
self.assertEqual(urshift(-3, 1), 2147483646)
|
self.assertEqual(urshift(-3, 1), 2147483646)
|
||||||
|
|
||||||
|
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
|
||||||
|
<span class="foo bar">nice</span>
|
||||||
|
'''
|
||||||
|
|
||||||
def test_get_element_by_class(self):
|
def test_get_element_by_class(self):
|
||||||
html = '''
|
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||||
<span class="foo bar">nice</span>
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.assertEqual(get_element_by_class('foo', html), 'nice')
|
self.assertEqual(get_element_by_class('foo', html), 'nice')
|
||||||
self.assertEqual(get_element_by_class('no-such-class', html), None)
|
self.assertEqual(get_element_by_class('no-such-class', html), None)
|
||||||
|
|
||||||
|
def test_get_element_html_by_class(self):
|
||||||
|
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(get_element_html_by_class('foo', html), html.strip())
|
||||||
|
self.assertEqual(get_element_by_class('no-such-class', html), None)
|
||||||
|
|
||||||
|
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
|
||||||
|
<div itemprop="author" itemscope>foo</div>
|
||||||
|
'''
|
||||||
|
|
||||||
def test_get_element_by_attribute(self):
|
def test_get_element_by_attribute(self):
|
||||||
html = '''
|
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||||
<span class="foo bar">nice</span>
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
|
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
|
||||||
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
|
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
|
||||||
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
|
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
|
||||||
|
|
||||||
html = '''
|
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
|
||||||
<div itemprop="author" itemscope>foo</div>
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
|
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
|
||||||
|
|
||||||
|
def test_get_element_html_by_attribute(self):
|
||||||
|
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
|
||||||
|
self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
|
||||||
|
self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
|
||||||
|
|
||||||
|
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
|
||||||
|
|
||||||
|
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
|
||||||
|
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
|
||||||
|
'''
|
||||||
|
GET_ELEMENTS_BY_CLASS_RES = ['<span class="foo bar">nice</span>', '<span class="foo bar">also nice</span>']
|
||||||
|
|
||||||
def test_get_elements_by_class(self):
|
def test_get_elements_by_class(self):
|
||||||
html = '''
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
|
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
|
||||||
self.assertEqual(get_elements_by_class('no-such-class', html), [])
|
self.assertEqual(get_elements_by_class('no-such-class', html), [])
|
||||||
|
|
||||||
|
def test_get_elements_html_by_class(self):
|
||||||
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
|
||||||
|
self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
|
||||||
|
|
||||||
def test_get_elements_by_attribute(self):
|
def test_get_elements_by_attribute(self):
|
||||||
html = '''
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
|
|
||||||
'''
|
|
||||||
|
|
||||||
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
|
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
|
||||||
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
|
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
|
||||||
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
|
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
|
||||||
|
|
||||||
|
def test_get_elements_html_by_attribute(self):
|
||||||
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
|
||||||
|
self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
|
||||||
|
self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
|
||||||
|
|
||||||
|
def test_get_elements_text_and_html_by_attribute(self):
|
||||||
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
get_elements_text_and_html_by_attribute('class', 'foo bar', html),
|
||||||
|
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
|
||||||
|
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
|
||||||
|
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
|
||||||
|
|
||||||
|
GET_ELEMENT_BY_TAG_TEST_STRING = '''
|
||||||
|
random text lorem ipsum</p>
|
||||||
|
<div>
|
||||||
|
this should be returned
|
||||||
|
<span>this should also be returned</span>
|
||||||
|
<div>
|
||||||
|
this should also be returned
|
||||||
|
</div>
|
||||||
|
closing tag above should not trick, so this should also be returned
|
||||||
|
</div>
|
||||||
|
but this text should not be returned
|
||||||
|
'''
|
||||||
|
GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
|
||||||
|
GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
|
||||||
|
GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
|
||||||
|
GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
|
||||||
|
|
||||||
|
def test_get_element_text_and_html_by_tag(self):
|
||||||
|
html = self.GET_ELEMENT_BY_TAG_TEST_STRING
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
get_element_text_and_html_by_tag('div', html),
|
||||||
|
(self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
|
||||||
|
self.assertEqual(
|
||||||
|
get_element_text_and_html_by_tag('span', html),
|
||||||
|
(self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
|
||||||
|
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
||||||
|
|
||||||
def test_iri_to_uri(self):
|
def test_iri_to_uri(self):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
||||||
|
|
137
yt_dlp/utils.py
137
yt_dlp/utils.py
|
@ -416,17 +416,33 @@ def get_element_by_id(id, html):
|
||||||
return get_element_by_attribute('id', id, html)
|
return get_element_by_attribute('id', id, html)
|
||||||
|
|
||||||
|
|
||||||
|
def get_element_html_by_id(id, html):
|
||||||
|
"""Return the html of the tag with the specified ID in the passed HTML document"""
|
||||||
|
return get_element_html_by_attribute('id', id, html)
|
||||||
|
|
||||||
|
|
||||||
def get_element_by_class(class_name, html):
|
def get_element_by_class(class_name, html):
|
||||||
"""Return the content of the first tag with the specified class in the passed HTML document"""
|
"""Return the content of the first tag with the specified class in the passed HTML document"""
|
||||||
retval = get_elements_by_class(class_name, html)
|
retval = get_elements_by_class(class_name, html)
|
||||||
return retval[0] if retval else None
|
return retval[0] if retval else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_element_html_by_class(class_name, html):
|
||||||
|
"""Return the html of the first tag with the specified class in the passed HTML document"""
|
||||||
|
retval = get_elements_html_by_class(class_name, html)
|
||||||
|
return retval[0] if retval else None
|
||||||
|
|
||||||
|
|
||||||
def get_element_by_attribute(attribute, value, html, escape_value=True):
|
def get_element_by_attribute(attribute, value, html, escape_value=True):
|
||||||
retval = get_elements_by_attribute(attribute, value, html, escape_value)
|
retval = get_elements_by_attribute(attribute, value, html, escape_value)
|
||||||
return retval[0] if retval else None
|
return retval[0] if retval else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
|
||||||
|
retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
|
||||||
|
return retval[0] if retval else None
|
||||||
|
|
||||||
|
|
||||||
def get_elements_by_class(class_name, html):
|
def get_elements_by_class(class_name, html):
|
||||||
"""Return the content of all tags with the specified class in the passed HTML document as a list"""
|
"""Return the content of all tags with the specified class in the passed HTML document as a list"""
|
||||||
return get_elements_by_attribute(
|
return get_elements_by_attribute(
|
||||||
|
@ -434,31 +450,126 @@ def get_elements_by_class(class_name, html):
|
||||||
html, escape_value=False)
|
html, escape_value=False)
|
||||||
|
|
||||||
|
|
||||||
def get_elements_by_attribute(attribute, value, html, escape_value=True):
|
def get_elements_html_by_class(class_name, html):
|
||||||
|
"""Return the html of all tags with the specified class in the passed HTML document as a list"""
|
||||||
|
return get_elements_html_by_attribute(
|
||||||
|
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
|
||||||
|
html, escape_value=False)
|
||||||
|
|
||||||
|
|
||||||
|
def get_elements_by_attribute(*args, **kwargs):
|
||||||
"""Return the content of the tag with the specified attribute in the passed HTML document"""
|
"""Return the content of all tags with the specified attribute in the passed HTML document as a list"""
|
||||||
|
return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_elements_html_by_attribute(*args, **kwargs):
|
||||||
|
"""Return the html of all tags with the specified attribute in the passed HTML document as a list"""
|
||||||
|
return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
|
||||||
|
"""
|
||||||
|
Return a list of (text, html) pairs, one for each tag with the specified
|
||||||
|
attribute in the passed HTML document
|
||||||
|
"""
|
||||||
|
|
||||||
value = re.escape(value) if escape_value else value
|
value = re.escape(value) if escape_value else value
|
||||||
|
|
||||||
retlist = []
|
retlist = []
|
||||||
for m in re.finditer(r'''(?xs)
|
for m in re.finditer(r'''(?xs)
|
||||||
<([a-zA-Z0-9:._-]+)
|
<(?P<tag>[a-zA-Z0-9:._-]+)
|
||||||
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
|
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
|
||||||
\s+%s=['"]?%s['"]?
|
\s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
|
||||||
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
|
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
|
||||||
\s*>
|
\s*>
|
||||||
(?P<content>.*?)
|
''' % {'attribute': re.escape(attribute), 'value': value}, html):
|
||||||
</\1>
|
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
||||||
''' % (re.escape(attribute), value), html):
|
|
||||||
res = m.group('content')
|
|
||||||
|
|
||||||
if res.startswith('"') or res.startswith("'"):
|
retlist.append((
|
||||||
res = res[1:-1]
|
unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
|
||||||
|
whole,
|
||||||
retlist.append(unescapeHTML(res))
|
))
|
||||||
|
|
||||||
return retlist
|
return retlist
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
|
||||||
|
"""
|
||||||
|
HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
|
||||||
|
closing tag for the first opening tag it has encountered, and can be used
|
||||||
|
as a context manager
|
||||||
|
"""
|
||||||
|
|
||||||
|
class HTMLBreakOnClosingTagException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.tagstack = collections.deque()
|
||||||
|
compat_HTMLParser.__init__(self)
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, *_):
|
||||||
|
self.close()
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
# handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
|
||||||
|
# so data remains buffered; we no longer have any interest in it, thus
|
||||||
|
# override this method to discard it
|
||||||
|
pass
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, _):
|
||||||
|
self.tagstack.append(tag)
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
if not self.tagstack:
|
||||||
|
raise compat_HTMLParseError('no tags in the stack')
|
||||||
|
while self.tagstack:
|
||||||
|
inner_tag = self.tagstack.pop()
|
||||||
|
if inner_tag == tag:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
|
||||||
|
if not self.tagstack:
|
||||||
|
raise self.HTMLBreakOnClosingTagException()
|
||||||
|
|
||||||
|
|
||||||
|
def get_element_text_and_html_by_tag(tag, html):
|
||||||
|
"""
|
||||||
|
For the first element with the specified tag in the passed HTML document
|
||||||
|
return its content (text) and the whole element (html)
|
||||||
|
"""
|
||||||
|
def find_or_raise(haystack, needle, exc):
|
||||||
|
try:
|
||||||
|
return haystack.index(needle)
|
||||||
|
except ValueError:
|
||||||
|
raise exc
|
||||||
|
closing_tag = f'</{tag}>'
|
||||||
|
whole_start = find_or_raise(
|
||||||
|
html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
|
||||||
|
content_start = find_or_raise(
|
||||||
|
html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
|
||||||
|
content_start += whole_start + 1
|
||||||
|
with HTMLBreakOnClosingTagParser() as parser:
|
||||||
|
parser.feed(html[whole_start:content_start])
|
||||||
|
if not parser.tagstack or parser.tagstack[0] != tag:
|
||||||
|
raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
|
||||||
|
offset = content_start
|
||||||
|
while offset < len(html):
|
||||||
|
next_closing_tag_start = find_or_raise(
|
||||||
|
html[offset:], closing_tag,
|
||||||
|
compat_HTMLParseError(f'closing {tag} tag not found'))
|
||||||
|
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
|
||||||
|
try:
|
||||||
|
parser.feed(html[offset:offset + next_closing_tag_end])
|
||||||
|
offset += next_closing_tag_end
|
||||||
|
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
|
||||||
|
return html[content_start:offset + next_closing_tag_start], \
|
||||||
|
html[whole_start:offset + next_closing_tag_end]
|
||||||
|
raise compat_HTMLParseError('unexpected end of html')
|
||||||
|
|
||||||
|
|
||||||
class HTMLAttributeParser(compat_HTMLParser):
|
class HTMLAttributeParser(compat_HTMLParser):
|
||||||
"""Trivial HTML parser to gather the attributes for a single element"""
|
"""Trivial HTML parser to gather the attributes for a single element"""
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue