[utils] Improve parsing for nested HTML elements (#2129)

and add functions to return the HTML of elements

Authored by: zmousm
This commit is contained in:
Zenon Mousmoulas 2022-01-05 20:37:49 +02:00 committed by GitHub
parent e8736539f3
commit 6f32a0b5b7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 216 additions and 28 deletions

View file

@ -44,6 +44,12 @@
get_element_by_attribute, get_element_by_attribute,
get_elements_by_class, get_elements_by_class,
get_elements_by_attribute, get_elements_by_attribute,
get_element_html_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
get_elements_html_by_attribute,
get_elements_text_and_html_by_attribute,
get_element_text_and_html_by_tag,
InAdvancePagedList, InAdvancePagedList,
int_or_none, int_or_none,
intlist_to_bytes, intlist_to_bytes,
@ -118,6 +124,7 @@
compat_chr, compat_chr,
compat_etree_fromstring, compat_etree_fromstring,
compat_getenv, compat_getenv,
compat_HTMLParseError,
compat_os_name, compat_os_name,
compat_setenv, compat_setenv,
) )
@ -1575,46 +1582,116 @@ def test_urshift(self):
self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646) self.assertEqual(urshift(-3, 1), 2147483646)
# Shared fixture: a single <span> whose class attribute holds two
# space-separated class names ("foo" and "bar")
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
'''
def test_get_element_by_class(self): def test_get_element_by_class(self):
html = ''' html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
<span class="foo bar">nice</span>
'''
self.assertEqual(get_element_by_class('foo', html), 'nice') self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None) self.assertEqual(get_element_by_class('no-such-class', html), None)
def test_get_element_html_by_class(self):
    """get_element_html_by_class returns the whole element, or None when no element has the class"""
    html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

    # The fixture is exactly one element wrapped in newlines, so the stripped
    # fixture is the expected outer HTML
    self.assertEqual(get_element_html_by_class('foo', html), html.strip())
    # The negative case must exercise the *_html_* variant as well; checking
    # get_element_by_class here would leave the no-match path of the function
    # under test uncovered
    self.assertEqual(get_element_html_by_class('no-such-class', html), None)
# Shared fixture: a <div> with one valued attribute (itemprop="author") and
# one valueless attribute (itemscope)
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
<div itemprop="author" itemscope>foo</div>
'''
def test_get_element_by_attribute(self): def test_get_element_by_attribute(self):
html = ''' html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
<span class="foo bar">nice</span>
'''
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
html = ''' html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
<div itemprop="author" itemscope>foo</div>
'''
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
def test_get_element_html_by_attribute(self):
    """get_element_html_by_attribute matches the full attribute value and returns the whole element"""
    class_markup = self.GET_ELEMENT_BY_CLASS_TEST_STRING
    self.assertEqual(
        get_element_html_by_attribute('class', 'foo bar', class_markup),
        class_markup.strip())
    # A single class name or an unknown value is not the full attribute value
    for non_matching_value in ('foo', 'no-such-foo'):
        self.assertEqual(
            get_element_html_by_attribute('class', non_matching_value, class_markup),
            None)

    itemprop_markup = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
    self.assertEqual(
        get_element_html_by_attribute('itemprop', 'author', itemprop_markup),
        itemprop_markup.strip())
# Shared fixture: two adjacent <span> elements with the same class attribute
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
'''
# Whole-element HTML of the two spans in the fixture above, in document order
GET_ELEMENTS_BY_CLASS_RES = ['<span class="foo bar">nice</span>', '<span class="foo bar">also nice</span>']
def test_get_elements_by_class(self): def test_get_elements_by_class(self):
html = ''' html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
'''
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), []) self.assertEqual(get_elements_by_class('no-such-class', html), [])
def test_get_elements_html_by_class(self):
    """get_elements_html_by_class returns every matching element's HTML, or an empty list"""
    markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

    matching = get_elements_html_by_class('foo', markup)
    self.assertEqual(matching, self.GET_ELEMENTS_BY_CLASS_RES)

    missing = get_elements_html_by_class('no-such-class', markup)
    self.assertEqual(missing, [])
def test_get_elements_by_attribute(self): def test_get_elements_by_attribute(self):
html = ''' html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
'''
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
def test_get_elements_html_by_attribute(self):
    """get_elements_html_by_attribute returns the HTML of all elements whose attribute equals the value"""
    markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

    self.assertEqual(
        get_elements_html_by_attribute('class', 'foo bar', markup),
        self.GET_ELEMENTS_BY_CLASS_RES)
    # Neither a partial value nor an unknown value may match anything
    for non_matching_value in ('foo', 'no-such-foo'):
        self.assertEqual(
            get_elements_html_by_attribute('class', non_matching_value, markup), [])
def test_get_elements_text_and_html_by_attribute(self):
    """get_elements_text_and_html_by_attribute returns (text, html) pairs in document order"""
    markup = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

    expected_texts = ['nice', 'also nice']
    expected_pairs = list(zip(expected_texts, self.GET_ELEMENTS_BY_CLASS_RES))
    self.assertEqual(
        get_elements_text_and_html_by_attribute('class', 'foo bar', markup),
        expected_pairs)

    # Neither a partial value nor an unknown value may match anything
    for non_matching_value in ('foo', 'no-such-foo'):
        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', non_matching_value, markup), [])
# Fixture for nested-element extraction: an outer <div> that contains a <span>
# and another <div>, with unrelated text before and after the outer element
GET_ELEMENT_BY_TAG_TEST_STRING = '''
random text lorem ipsum</p>
<div>
this should be returned
<span>this should also be returned</span>
<div>
this should also be returned
</div>
closing tag above should not trick, so this should also be returned
</div>
but this text should not be returned
'''
# The expected values are cut out of the stripped fixture with hard-coded
# character offsets, so any change to the fixture (including its whitespace)
# requires updating the slice bounds below
GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
# 5 and 6 are len('<div>') and len('</div>'); presumably this strips the outer tags
GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
# 6 and 7 are len('<span>') and len('</span>'); presumably this strips the span tags
GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
def test_get_element_text_and_html_by_tag(self):
    """get_element_text_and_html_by_tag handles nested same-name tags and raises for a missing tag"""
    markup = self.GET_ELEMENT_BY_TAG_TEST_STRING

    expected_by_tag = (
        ('div', self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML),
        ('span', self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML),
    )
    for tag_name, expected_text, expected_html in expected_by_tag:
        self.assertEqual(
            get_element_text_and_html_by_tag(tag_name, markup),
            (expected_text, expected_html))

    # The fixture contains no <article> element at all
    with self.assertRaises(compat_HTMLParseError):
        get_element_text_and_html_by_tag('article', markup)
def test_iri_to_uri(self): def test_iri_to_uri(self):
self.assertEqual( self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),

View file

@ -416,17 +416,33 @@ def get_element_by_id(id, html):
return get_element_by_attribute('id', id, html) return get_element_by_attribute('id', id, html)
def get_element_html_by_id(id, html):
    """Return the whole HTML of the first tag whose id attribute equals the given ID in the passed HTML document"""
    # An ID lookup is just an attribute lookup on "id"
    return get_element_html_by_attribute(attribute='id', value=id, html=html)
def get_element_by_class(class_name, html): def get_element_by_class(class_name, html):
"""Return the content of the first tag with the specified class in the passed HTML document""" """Return the content of the first tag with the specified class in the passed HTML document"""
retval = get_elements_by_class(class_name, html) retval = get_elements_by_class(class_name, html)
return retval[0] if retval else None return retval[0] if retval else None
def get_element_html_by_class(class_name, html):
    """Return the whole HTML of the first tag carrying the given class in the passed HTML document, or None"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True): def get_element_by_attribute(attribute, value, html, escape_value=True):
retval = get_elements_by_attribute(attribute, value, html, escape_value) retval = get_elements_by_attribute(attribute, value, html, escape_value)
return retval[0] if retval else None return retval[0] if retval else None
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the whole HTML of the first tag with the given attribute value in the passed HTML document, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html): def get_elements_by_class(class_name, html):
"""Return the content of all tags with the specified class in the passed HTML document as a list""" """Return the content of all tags with the specified class in the passed HTML document as a list"""
return get_elements_by_attribute( return get_elements_by_attribute(
@ -434,31 +450,126 @@ def get_elements_by_class(class_name, html):
html, escape_value=False) html, escape_value=False)
def get_elements_by_attribute(attribute, value, html, escape_value=True): def get_elements_html_by_class(class_name, html):
"""Return the html of all tags with the specified class in the passed HTML document as a list"""
return get_elements_html_by_attribute(
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
html, escape_value=False)
def get_elements_by_attribute(*args, **kwargs):
"""Return the content of the tag with the specified attribute in the passed HTML document""" """Return the content of the tag with the specified attribute in the passed HTML document"""
return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    # Arguments are forwarded unchanged; each result is a (text, html) pair
    # and only the html half is kept
    text_and_html_pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [pair[1] for pair in text_and_html_pairs]
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
"""
Return the text (content) and the html (whole) of the tag with the specified
attribute in the passed HTML document
"""
value = re.escape(value) if escape_value else value value = re.escape(value) if escape_value else value
retlist = [] retlist = []
for m in re.finditer(r'''(?xs) for m in re.finditer(r'''(?xs)
<([a-zA-Z0-9:._-]+) <(?P<tag>[a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
\s+%s=['"]?%s['"]? \s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? (?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
\s*> \s*>
(?P<content>.*?) ''' % {'attribute': re.escape(attribute), 'value': value}, html):
</\1> content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
''' % (re.escape(attribute), value), html):
res = m.group('content')
if res.startswith('"') or res.startswith("'"): retlist.append((
res = res[1:-1] unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
whole,
retlist.append(unescapeHTML(res)) ))
return retlist return retlist
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser that tracks open tags on a stack and raises
    HTMLBreakOnClosingTagException as soon as the first tag it saw opened
    has been closed again. Usable as a context manager.
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when the closing tag of the first opened element is reached"""

    def __init__(self):
        # Open tags, outermost first; the element of interest sits at index 0
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # When handle_endtag raises HTMLBreakOnClosingTagException the parser
        # is left with unprocessed input in its buffer. That input is of no
        # further interest, so closing deliberately does nothing instead of
        # letting the base class process the leftovers.
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')

        # Unwind until the matching opening tag is popped; tags opened in
        # between that were never closed are dropped along the way
        matched = False
        while self.tagstack and not matched:
            matched = self.tagstack.pop() == tag
        if not matched:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')

        # An empty stack means the outermost (first) element was just closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   name of the tag to look for, e.g. 'div'
    @param html  HTML document or fragment to search
    @returns     tuple (content, whole): the text between the opening tag and
                 its matching closing tag, and the element including both tags
    @raises compat_HTMLParseError  if the opening tag is missing or malformed,
                 or no matching closing tag is found
    """
    def find_or_raise(needle, start, exc):
        # Search in place from `start` instead of slicing the document, so no
        # copy of the remaining html is made for every lookup
        index = html.find(needle, start)
        if index < 0:
            raise exc
        return index

    closing_tag = f'</{tag}>'
    # The tag name must be followed by whitespace, "/", ">" or the end of the
    # document; a plain prefix search for "<div" would stop at "<divider>"
    opening = re.search(rf'<{re.escape(tag)}(?![^\s/>])', html)
    if opening is None:
        raise compat_HTMLParseError(f'opening {tag} tag not found')
    whole_start = opening.start()
    content_start = 1 + find_or_raise(
        '>', whole_start, compat_HTMLParseError(f'malformed opening {tag} tag'))

    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            closing_start = find_or_raise(
                closing_tag, offset,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            closing_end = closing_start + len(closing_tag)
            try:
                # Feed up to and including the next candidate closing tag; the
                # parser raises once that tag closes the outermost element,
                # otherwise it closed a nested element and the search goes on
                parser.feed(html[offset:closing_end])
                offset = closing_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:closing_start], html[whole_start:closing_end]
        raise compat_HTMLParseError('unexpected end of html')
class HTMLAttributeParser(compat_HTMLParser): class HTMLAttributeParser(compat_HTMLParser):
"""Trivial HTML parser to gather the attributes for a single element""" """Trivial HTML parser to gather the attributes for a single element"""