Merge pull request #2190 from dalf/fix-htmltextextractor

[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception
This commit is contained in:
Alexandre Flament 2020-09-19 15:59:03 +02:00 committed by GitHub
commit 530fc4bda7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 2 deletions

View file

@ -77,6 +77,10 @@ def highlight_content(content, query):
return content
class HTMLTextExtractorException(Exception):
pass
class HTMLTextExtractor(HTMLParser):
def __init__(self):
@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
return
if tag != self.tags[-1]:
raise Exception("invalid html")
raise HTMLTextExtractorException()
self.tags.pop()
@ -128,7 +132,10 @@ def html_to_text(html):
html = html.replace('\n', ' ')
html = ' '.join(html.split())
s = HTMLTextExtractor()
s.feed(html)
try:
s.feed(html)
except HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
return s.get_text()

View file

@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
self.assertIsNotNone(utils.html_to_text(html))
self.assertEqual(utils.html_to_text(html), "Test text")
def test_html_to_text_invalid(self):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
def test_prettify_url(self):
data = (('https://searx.me/', 'https://searx.me/'),
('https://searx.me/ű', 'https://searx.me/ű'),
@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
self.html_text_extractor.handle_entityref(entity)
self.assertIn(entity, self.html_text_extractor.result)
def test_invalid_html(self):
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
with self.assertRaises(utils.HTMLTextExtractorException):
self.html_text_extractor.feed(text)
class TestUnicodeWriter(SearxTestCase):