[fix] revision of utils.HTMLTextExtractor (#5125)

Related: https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
2025-09-03 20:13:50 +00:00 · 2025-08-18 16:30:51 +02:00 · 2025-08-18 16:30:51 +02:00 · 4fb6105d69
commit 4fb6105d69
parent b606103352
2 changed files with 41 additions and 65 deletions
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))


-class _HTMLTextExtractorException(Exception):
-    """Internal exception raised when the HTML is invalid"""
-
-
-class _HTMLTextExtractor(HTMLParser):
+class HTMLTextExtractor(HTMLParser):
    """Internal class to extract text from HTML"""

    def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
            return

        if tag != self.tags[-1]:
-            raise _HTMLTextExtractorException()
+            self.result.append(f"</{tag}>")
+            return

        self.tags.pop()

@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

-        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'
+
+        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
+        'Lorem ipsum </i>dolor sit amet</p>'
+
+        >>> html_to_text(r'&#x3e &#x3c &#97')
+        '> < a'
+
    """
    if not html_str:
        return ""
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
-    s = _HTMLTextExtractor()
+    s = HTMLTextExtractor()
    try:
        s.feed(html_str)
        s.close()
    except AssertionError:
-        s = _HTMLTextExtractor()
+        s = HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
        s.close()
-    except _HTMLTextExtractorException:
-        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
    return s.get_text()


--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
        self.assertIsNotNone(utils.searxng_useragent())
        self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))

-    def test_html_to_text(self):
-        html_str = """
-        <a href="/testlink" class="link_access_account">
-            <style>
-                .toto {
-                    color: red;
-                }
-            </style>
-            <span class="toto">
-                <span>
-                    <img src="test.jpg" />
-                </span>
-            </span>
-            <span class="titi">
-                            Test text
-            </span>
-            <script>value='dummy';</script>
-        </a>
-        """
-        self.assertIsInstance(utils.html_to_text(html_str), str)
-        self.assertIsNotNone(utils.html_to_text(html_str))
-        self.assertEqual(utils.html_to_text(html_str), "Test text")
-        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
-
    def test_extract_text(self):
        html_str = """
        <a href="/testlink" class="link_access_account">
@@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
        with self.assertRaises(Exception):
            utils.extract_url([], 'https://example.com')

-    def test_html_to_text_invalid(self):
-        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
-
    def test_ecma_unscape(self):
        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')

-
-class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
-
-    def setUp(self):
-        super().setUp()
-
-        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
-
-    def test__init__(self):
-        self.assertEqual(self.html_text_extractor.result, [])
-
    @parameterized.expand(
        [
-            ('xF', '\x0f'),
-            ('XF', '\x0f'),
-            ('97', 'a'),
+            ('Example <span id="42">#2</span>', 'Example #2'),
+            ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
+            (r'regexp: (?&lt;![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
+            (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
+            (r'&#x3e &#x3c &#97', '> < a'),
        ]
    )
-    def test_handle_charref(self, charref: str, expected: str):
-        self.html_text_extractor.handle_charref(charref)
-        self.assertIn(expected, self.html_text_extractor.result)
+    def test_html_to_text(self, html_str: str, text_str: str):
+        self.assertEqual(utils.html_to_text(html_str), text_str)

-    def test_handle_entityref(self):
-        entity = 'test'
-        self.html_text_extractor.handle_entityref(entity)
-        self.assertIn(entity, self.html_text_extractor.result)
-
-    def test_invalid_html(self):
-        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
-            self.html_text_extractor.feed(text)
+    def test_html_to_text_with_a_style_span(self):
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertEqual(utils.html_to_text(html_str), "Test text")


 class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring