mirror of
https://github.com/searxng/searxng.git
synced 2025-09-03 20:13:50 +00:00
[fix] revision of utils.HTMLTextExtractor (#5125)
Related: https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
This commit is contained in:
parent
b606103352
commit
4fb6105d69
2 changed files with 41 additions and 65 deletions
|
@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
|
|||
return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
|
||||
|
||||
|
||||
class _HTMLTextExtractorException(Exception):
|
||||
"""Internal exception raised when the HTML is invalid"""
|
||||
|
||||
|
||||
class _HTMLTextExtractor(HTMLParser):
|
||||
class HTMLTextExtractor(HTMLParser):
|
||||
"""Internal class to extract text from HTML"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
|
|||
return
|
||||
|
||||
if tag != self.tags[-1]:
|
||||
raise _HTMLTextExtractorException()
|
||||
self.result.append(f"</{tag}>")
|
||||
return
|
||||
|
||||
self.tags.pop()
|
||||
|
||||
|
@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
|
|||
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
||||
'Example'
|
||||
|
||||
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
||||
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
|
||||
'regexp: (?<![a-zA-Z]'
|
||||
|
||||
>>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
|
||||
'Lorem ipsum </i>dolor sit amet</p>'
|
||||
|
||||
>>> html_to_text(r'> < a')
|
||||
'> < a'
|
||||
|
||||
"""
|
||||
if not html_str:
|
||||
return ""
|
||||
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
|
||||
html_str = ' '.join(html_str.split())
|
||||
s = _HTMLTextExtractor()
|
||||
s = HTMLTextExtractor()
|
||||
try:
|
||||
s.feed(html_str)
|
||||
s.close()
|
||||
except AssertionError:
|
||||
s = _HTMLTextExtractor()
|
||||
s = HTMLTextExtractor()
|
||||
s.feed(escape(html_str, quote=True))
|
||||
s.close()
|
||||
except _HTMLTextExtractorException:
|
||||
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
|
||||
return s.get_text()
|
||||
|
||||
|
||||
|
|
|
@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
|
|||
self.assertIsNotNone(utils.searxng_useragent())
|
||||
self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
|
||||
|
||||
def test_html_to_text(self):
|
||||
html_str = """
|
||||
<a href="/testlink" class="link_access_account">
|
||||
<style>
|
||||
.toto {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
<span class="toto">
|
||||
<span>
|
||||
<img src="test.jpg" />
|
||||
</span>
|
||||
</span>
|
||||
<span class="titi">
|
||||
Test text
|
||||
</span>
|
||||
<script>value='dummy';</script>
|
||||
</a>
|
||||
"""
|
||||
self.assertIsInstance(utils.html_to_text(html_str), str)
|
||||
self.assertIsNotNone(utils.html_to_text(html_str))
|
||||
self.assertEqual(utils.html_to_text(html_str), "Test text")
|
||||
self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
|
||||
|
||||
def test_extract_text(self):
|
||||
html_str = """
|
||||
<a href="/testlink" class="link_access_account">
|
||||
|
@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
|
|||
with self.assertRaises(Exception):
|
||||
utils.extract_url([], 'https://example.com')
|
||||
|
||||
def test_html_to_text_invalid(self):
|
||||
_html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||
self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
|
||||
|
||||
def test_ecma_unscape(self):
    """Check ``utils.ecma_unescape`` against ``%20``, ``%xx`` and ``%uXXXX`` escapes."""
    # (escaped input, expected decoded text) -- evaluated in this order, so
    # the first mismatch aborts the test exactly like sequential asserts.
    cases = (
        ('text%20with%20space', 'text with space'),
        ('text using %xx: %F3', 'text using %xx: ó'),
        ('text using %u: %u5409, %u4E16%u754c', 'text using %u: 吉, 世界'),
    )
    for escaped, expected in cases:
        self.assertEqual(utils.ecma_unescape(escaped), expected)
|
||||
|
||||
|
||||
class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access
|
||||
|
||||
def test__init__(self):
|
||||
self.assertEqual(self.html_text_extractor.result, [])
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
('xF', '\x0f'),
|
||||
('XF', '\x0f'),
|
||||
('97', 'a'),
|
||||
('Example <span id="42">#2</span>', 'Example #2'),
|
||||
('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
|
||||
(r'regexp: (?<![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
|
||||
(r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
|
||||
(r'> < a', '> < a'),
|
||||
]
|
||||
)
|
||||
def test_handle_charref(self, charref: str, expected: str):
|
||||
self.html_text_extractor.handle_charref(charref)
|
||||
self.assertIn(expected, self.html_text_extractor.result)
|
||||
def test_html_to_text(self, html_str: str, text_str: str):
|
||||
self.assertEqual(utils.html_to_text(html_str), text_str)
|
||||
|
||||
def test_handle_entityref(self):
|
||||
entity = 'test'
|
||||
self.html_text_extractor.handle_entityref(entity)
|
||||
self.assertIn(entity, self.html_text_extractor.result)
|
||||
|
||||
def test_invalid_html(self):
|
||||
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||
with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access
|
||||
self.html_text_extractor.feed(text)
|
||||
def test_html_to_text_with_a_style_span(self):
|
||||
html_str = """
|
||||
<a href="/testlink" class="link_access_account">
|
||||
<style>
|
||||
.toto {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
<span class="toto">
|
||||
<span>
|
||||
<img src="test.jpg" />
|
||||
</span>
|
||||
</span>
|
||||
<span class="titi">
|
||||
Test text
|
||||
</span>
|
||||
<script>value='dummy';</script>
|
||||
</a>
|
||||
"""
|
||||
self.assertIsInstance(utils.html_to_text(html_str), str)
|
||||
self.assertEqual(utils.html_to_text(html_str), "Test text")
|
||||
|
||||
|
||||
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
|
||||
|
|
Loading…
Reference in a new issue