From fa1ef9a07b79ab740c127bac0d11b8315a5130ff Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 18 Jun 2023 16:43:48 +0200
Subject: [PATCH] [mod] move some code from webapp module to webutils module
 (no functional change)

Over the years the webapp module became more and more a mess.  To improve the
modularization a little, this patch moves some implementations from the webapp
module to the webutils module.

HINT: this patch brings no functional change

Signed-off-by: Markus Heiser
---
 searx/results.py                    |  17 +++-
 searx/webapp.py                     | 151 +++++++---------------------
 searx/webutils.py                   | 115 +++++++++++++++++++--
 searxng_extra/standalone_searx.py   |   4 +-
 tests/unit/test_standalone_searx.py |   4 +-
 tests/unit/test_webapp.py           |   2 +-
 tests/unit/test_webutils.py         |   2 +-
 7 files changed, 164 insertions(+), 131 deletions(-)

diff --git a/searx/results.py b/searx/results.py
index 5dd1bff21..caf02213d 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -6,6 +6,7 @@
 from typing import List, NamedTuple, Set
 from urllib.parse import urlparse, unquote

 from searx import logger
+from searx import utils
 from searx.engines import engines
 from searx.metrics import histogram_observe, counter_add, count_error

@@ -353,6 +354,10 @@
         for result in self._merged_results:
             score = result_score(result)
             result['score'] = score
+            if result.get('content'):
+                result['content'] = utils.html_to_text(result['content']).strip()
+            # removing html content and whitespace duplications
+            result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
             for result_engine in result['engines']:
                 counter_add(score, 'engine', result_engine, 'score')

@@ -415,11 +420,19 @@
     def results_length(self):
         return len(self._merged_results)

-    def results_number(self):
+    @property
+    def number_of_results(self) -> int:
+        """Returns the average of the result numbers reported by the engines;
+        returns zero if this average is smaller than the actual count of
+        results."""
+
         resultnum_sum = sum(self._number_of_results)
         if not resultnum_sum or not self._number_of_results:
             return 0
-        return resultnum_sum / len(self._number_of_results)
+
+        average = int(resultnum_sum / len(self._number_of_results))
+        if average < self.results_length():
+            average = 0
+        return average

     def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
         if engines[engine_name].display_error_messages:
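
The clamping in the new number_of_results property is easy to miss: the totals
reported by the engines are averaged, and an average that is smaller than the
number of results actually merged is treated as implausible and reported as
zero.  A standalone sketch of the arithmetic, with made-up numbers:

    # sketch of ResultContainer.number_of_results (hypothetical values)
    engine_totals = [1_000_000, 3_000_000]  # totals claimed by two engines
    merged_count = 25                       # results actually collected

    average = int(sum(engine_totals) / len(engine_totals))  # 2_000_000
    if average < merged_count:  # a claim below the merged count is discarded
        average = 0
    print(average)  # 2000000; had the claims averaged below 25, this prints 0
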
diff --git a/searx/webapp.py b/searx/webapp.py
index d6322447a..59c1dd1a1 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -58,7 +58,7 @@ from searx import (
 )
 from searx import infopage
 from searx.data import ENGINE_DESCRIPTIONS
-from searx.results import Timing, UnresponsiveEngine
+from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
 from searx.exceptions import SearxParameterException
@@ -68,18 +68,18 @@ from searx.engines import (
     engines,
     engine_shortcuts,
 )
+
+from searx import webutils
 from searx.webutils import (
-    UnicodeWriter,
     highlight_content,
     get_static_files,
     get_result_templates,
     get_themes,
-    prettify_url,
+    exception_classname_to_text,
     new_hmac,
     is_hmac_of,
     is_flask_run_cmdline,
     group_engines_in_tab,
-    searxng_l10n_timespan,
 )
 from searx.webadapter import (
     get_search_query_from_webapp,
@@ -87,7 +87,6 @@ from searx.webadapter import (
     parse_lang,
 )
 from searx.utils import (
-    html_to_text,
     gen_useragent,
     dict_subset,
 )
@@ -165,39 +164,6 @@ app.jinja_env.add_extension('jinja2.ext.loopcontrols')  # pylint: disable=no-member
 app.jinja_env.filters['group_engines_in_tab'] = group_engines_in_tab  # pylint: disable=no-member
 app.secret_key = settings['server']['secret_key']

-timeout_text = gettext('timeout')
-parsing_error_text = gettext('parsing error')
-http_protocol_error_text = gettext('HTTP protocol error')
-network_error_text = gettext('network error')
-ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
-exception_classname_to_text = {
-    None: gettext('unexpected crash'),
-    'timeout': timeout_text,
-    'asyncio.TimeoutError': timeout_text,
-    'httpx.TimeoutException': timeout_text,
-    'httpx.ConnectTimeout': timeout_text,
-    'httpx.ReadTimeout': timeout_text,
-    'httpx.WriteTimeout': timeout_text,
-    'httpx.HTTPStatusError': gettext('HTTP error'),
-    'httpx.ConnectError': gettext("HTTP connection error"),
-    'httpx.RemoteProtocolError': http_protocol_error_text,
-    'httpx.LocalProtocolError': http_protocol_error_text,
-    'httpx.ProtocolError': http_protocol_error_text,
-    'httpx.ReadError': network_error_text,
-    'httpx.WriteError': network_error_text,
-    'httpx.ProxyError': gettext("proxy error"),
-    'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
-    'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
-    'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
-    'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
-    'searx.exceptions.SearxEngineXPathException': parsing_error_text,
-    'KeyError': parsing_error_text,
-    'json.decoder.JSONDecodeError': parsing_error_text,
-    'lxml.etree.ParserError': parsing_error_text,
-    'ssl.SSLCertVerificationError': ssl_cert_error_text,  # for Python > 3.7
-    'ssl.CertificateError': ssl_cert_error_text,  # for Python 3.7
-}
-

 class ExtendedRequest(flask.Request):
     """This class is never initialized and only used for type checking."""
@@ -686,9 +652,7 @@ def search():
         search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
             request.preferences, request.form
         )
-        # search = Search(search_query) # without plugins
         search = SearchWithPlugins(search_query, request.user_plugins, request)  # pylint: disable=redefined-outer-name
-
        result_container = search.search()

     except SearxParameterException as e:
@@ -698,45 +662,54 @@
         logger.exception(e, exc_info=True)
         return index_error(output_format, gettext('search error')), 500

-    # results
-    results = result_container.get_ordered_results()
-    number_of_results = result_container.results_number()
-    if number_of_results < result_container.results_length():
-        number_of_results = 0
-
-    # checkin for a external bang
+    # 1. check if the result is a redirect for an external bang
     if result_container.redirect_url:
         return redirect(result_container.redirect_url)

-    # Server-Timing header
+    # 2. add Server-Timing header for measuring performance characteristics of
+    # web applications
     request.timings = result_container.get_timings()  # pylint: disable=assigning-non-slot

+    # 3. formats without a template
+
+    if output_format == 'json':
+
+        response = webutils.get_json_response(search_query, result_container)
+        return Response(response, mimetype='application/json')
+
+    if output_format == 'csv':
+
+        csv = webutils.CSVWriter(StringIO())
+        webutils.write_csv_response(csv, result_container)
+        csv.stream.seek(0)
+
+        response = Response(csv.stream.read(), mimetype='application/csv')
+        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
+        response.headers.add('Content-Disposition', cont_disp)
+        return response
+
+    # 4. formats rendered by a template / RSS & HTML
+
     current_template = None
     previous_result = None

-    # output
+    results = result_container.get_ordered_results()
     for result in results:
         if output_format == 'html':
             if 'content' in result and result['content']:
                 result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
             if 'title' in result and result['title']:
                 result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
-        else:
-            if result.get('content'):
-                result['content'] = html_to_text(result['content']).strip()
-            # removing html content and whitespace duplications
-            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

         if 'url' in result:
-            result['pretty_url'] = prettify_url(result['url'])
-
+            result['pretty_url'] = webutils.prettify_url(result['url'])
         if result.get('publishedDate'):  # do not try to get a date from an empty string or a None type
             try:  # test if publishedDate >= 1900 (datetime module bug)
                 result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
             except ValueError:
                 result['publishedDate'] = None
             else:
-                result['publishedDate'] = searxng_l10n_timespan(result['publishedDate'])
+                result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])

         # set result['open_group'] = True when the template changes from the previous result
         # set result['close_group'] = True when the template changes on the next result
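
The open_group/close_group flags computed after this hunk drive result grouping
in the HTML templates: a group opens when the template name differs from the
previous result's and closes when it changes again, or at the end of the list.
A minimal standalone sketch of the idea, with hypothetical result dicts:

    # group consecutive results that share a template (hypothetical data)
    results = [
        {'template': 'default.html'},
        {'template': 'images.html'},
        {'template': 'images.html'},
        {'template': 'default.html'},
    ]

    current_template = None
    previous_result = None
    for result in results:
        if current_template != result.get('template'):
            result['open_group'] = True                # a new group starts here
            if previous_result:
                previous_result['close_group'] = True  # ... the previous one ends
        current_template = result.get('template')
        previous_result = result
    if previous_result:
        previous_result['close_group'] = True          # close the last group

    # both images.html results now sit in one open_group/close_group pair
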
@@ -750,42 +723,7 @@
     if previous_result:
         previous_result['close_group'] = True

-    if output_format == 'json':
-        x = {
-            'query': search_query.query,
-            'number_of_results': number_of_results,
-            'results': results,
-            'answers': list(result_container.answers),
-            'corrections': list(result_container.corrections),
-            'infoboxes': result_container.infoboxes,
-            'suggestions': list(result_container.suggestions),
-            'unresponsive_engines': __get_translated_errors(result_container.unresponsive_engines),
-        }
-        response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
-        return Response(response, mimetype='application/json')
-
-    if output_format == 'csv':
-        csv = UnicodeWriter(StringIO())
-        keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
-        csv.writerow(keys)
-        for row in results:
-            row['host'] = row['parsed_url'].netloc
-            row['type'] = 'result'
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.answers:
-            row = {'title': a, 'type': 'answer'}
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.suggestions:
-            row = {'title': a, 'type': 'suggestion'}
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.corrections:
-            row = {'title': a, 'type': 'correction'}
-            csv.writerow([row.get(key, '') for key in keys])
-        csv.stream.seek(0)
-        response = Response(csv.stream.read(), mimetype='application/csv')
-        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
-        response.headers.add('Content-Disposition', cont_disp)
-        return response
+    # 4.a RSS

     if output_format == 'rss':
         response_rss = render(
@@ -795,11 +733,11 @@
             corrections=result_container.corrections,
             suggestions=result_container.suggestions,
             q=request.form['q'],
-            number_of_results=number_of_results,
+            number_of_results=result_container.number_of_results,
         )
         return Response(response_rss, mimetype='text/xml')

-    # HTML output format
+    # 4.b HTML

     # suggestions: use RawTextQuery to get the suggestion URLs with the same bang
     suggestion_urls = list(
@@ -827,14 +765,14 @@
         selected_categories = search_query.categories,
         pageno = search_query.pageno,
         time_range = search_query.time_range or '',
-        number_of_results = format_decimal(number_of_results),
+        number_of_results = format_decimal(result_container.number_of_results),
         suggestions = suggestion_urls,
         answers = result_container.answers,
         corrections = correction_urls,
         infoboxes = result_container.infoboxes,
         engine_data = result_container.engine_data,
         paging = result_container.paging,
-        unresponsive_engines = __get_translated_errors(
+        unresponsive_engines = webutils.get_translated_errors(
             result_container.unresponsive_engines
         ),
         current_locale = request.preferences.get_value("locale"),
@@ -849,25 +787,6 @@
     )


-def __get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
-    translated_errors = []
-
-    # make a copy unresponsive_engines to avoid "RuntimeError: Set changed size
-    # during iteration" it happens when an engine modifies the ResultContainer
-    # after the search_multiple_requests method has stopped waiting
-
-    for unresponsive_engine in unresponsive_engines:
-        error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
-        if not error_user_text:
-            error_user_text = exception_classname_to_text[None]
-        error_msg = gettext(error_user_text)
-        if unresponsive_engine.suspended:
-            error_msg = gettext('Suspended') + ': ' + error_msg
-        translated_errors.append((unresponsive_engine.engine, error_msg))
-
-    return sorted(translated_errors, key=lambda e: e[0])
-
-
 @app.route('/about', methods=['GET'])
 def about():
     """Redirect to about page"""
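
The exception_classname_to_text mapping that this patch moves to webutils keys
on dotted exception class names, with the None entry acting as catch-all.  The
lookup pattern used by get_translated_errors (see the webutils hunk below) is
get() followed by a fallback; a standalone sketch, with a plain dict standing
in for the gettext-backed mapping:

    # fallback lookup as in get_translated_errors (stand-in dict, not the real one)
    exception_classname_to_text = {
        None: 'unexpected crash',
        'httpx.ReadTimeout': 'timeout',
        'KeyError': 'parsing error',
    }

    for error_type in ('httpx.ReadTimeout', 'httpx.SomeNewError', None):
        text = exception_classname_to_text.get(error_type) or exception_classname_to_text[None]
        print(error_type, '->', text)
    # httpx.ReadTimeout -> timeout
    # httpx.SomeNewError -> unexpected crash
    # None -> unexpected crash
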
diff --git a/searx/webutils.py b/searx/webutils.py
index 470833291..ddd9891bf 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -9,31 +9,80 @@
 import hmac
 import re
 import inspect
 import itertools
+import json
 from datetime import datetime, timedelta
 from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
 from io import StringIO
 from codecs import getincrementalencoder

-from flask_babel import gettext, format_date
+from flask_babel import gettext, format_date  # type: ignore

 from searx import logger, settings
 from searx.engines import DEFAULT_CATEGORY

 if TYPE_CHECKING:
     from searx.enginelib import Engine
-
+    from searx.results import ResultContainer
+    from searx.search import SearchQuery
+    from searx.results import UnresponsiveEngine

 VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')

 logger = logger.getChild('webutils')

+timeout_text = gettext('timeout')
+parsing_error_text = gettext('parsing error')
+http_protocol_error_text = gettext('HTTP protocol error')
+network_error_text = gettext('network error')
+ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
+exception_classname_to_text = {
+    None: gettext('unexpected crash'),
+    'timeout': timeout_text,
+    'asyncio.TimeoutError': timeout_text,
+    'httpx.TimeoutException': timeout_text,
+    'httpx.ConnectTimeout': timeout_text,
+    'httpx.ReadTimeout': timeout_text,
+    'httpx.WriteTimeout': timeout_text,
+    'httpx.HTTPStatusError': gettext('HTTP error'),
+    'httpx.ConnectError': gettext("HTTP connection error"),
+    'httpx.RemoteProtocolError': http_protocol_error_text,
+    'httpx.LocalProtocolError': http_protocol_error_text,
+    'httpx.ProtocolError': http_protocol_error_text,
+    'httpx.ReadError': network_error_text,
+    'httpx.WriteError': network_error_text,
+    'httpx.ProxyError': gettext("proxy error"),
+    'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
+    'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
+    'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
+    'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
+    'searx.exceptions.SearxEngineXPathException': parsing_error_text,
+    'KeyError': parsing_error_text,
+    'json.decoder.JSONDecodeError': parsing_error_text,
+    'lxml.etree.ParserError': parsing_error_text,
+    'ssl.SSLCertVerificationError': ssl_cert_error_text,  # for Python > 3.7
+    'ssl.CertificateError': ssl_cert_error_text,  # for Python 3.7
+}

-class UnicodeWriter:
-    """
-    A CSV writer which will write rows to CSV file "f",
-    which is encoded in the given encoding.
-    """
+
+def get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
+    translated_errors = []
+
+    for unresponsive_engine in unresponsive_engines:
+        error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
+        if not error_user_text:
+            error_user_text = exception_classname_to_text[None]
+        error_msg = gettext(error_user_text)
+        if unresponsive_engine.suspended:
+            error_msg = gettext('Suspended') + ': ' + error_msg
+        translated_errors.append((unresponsive_engine.engine, error_msg))
+
+    return sorted(translated_errors, key=lambda e: e[0])
+
+
+class CSVWriter:
+    """A CSV writer which will write rows to CSV file "f", which is encoded in
+    the given encoding."""

     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
         # Redirect output to a queue
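
CSVWriter keeps the behaviour of the old UnicodeWriter: rows are serialized
through an internal queue, re-encoded, and written to the stream passed to the
constructor.  A quick usage sketch mirroring what webapp.py does with a
StringIO stream (assumes a SearXNG checkout on the Python path):

    from io import StringIO
    from searx.webutils import CSVWriter

    writer = CSVWriter(StringIO())
    writer.writerow(['title', 'url'])
    writer.writerow(['SearXNG', 'https://docs.searxng.org'])
    writer.stream.seek(0)
    print(writer.stream.read())
    # title,url
    # SearXNG,https://docs.searxng.org
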
@@ -59,6 +108,58 @@
             self.writerow(row)


+def write_csv_response(csv: CSVWriter, rc: ResultContainer) -> None:
+    """Write the rows of the results of a query (``application/csv``) into a
+    CSV table (:py:obj:`CSVWriter`).  The first line of the table contains the
+    column names.  The column "type" specifies the type of the row; the
+    following types are included in the table:
+
+    - result
+    - answer
+    - suggestion
+    - correction
+
+    """
+
+    results = rc.get_ordered_results()
+    keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
+    csv.writerow(keys)
+
+    for row in results:
+        row['host'] = row['parsed_url'].netloc
+        row['type'] = 'result'
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.answers:
+        row = {'title': a, 'type': 'answer'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.suggestions:
+        row = {'title': a, 'type': 'suggestion'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.corrections:
+        row = {'title': a, 'type': 'correction'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+
+def get_json_response(sq: SearchQuery, rc: ResultContainer) -> str:
+    """Returns the JSON string of the results of a query (``application/json``)"""
+    results = rc.number_of_results
+    x = {
+        'query': sq.query,
+        'number_of_results': results,
+        'results': rc.get_ordered_results(),
+        'answers': list(rc.answers),
+        'corrections': list(rc.corrections),
+        'infoboxes': rc.infoboxes,
+        'suggestions': list(rc.suggestions),
+        'unresponsive_engines': get_translated_errors(rc.unresponsive_engines),
+    }
+    response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
+    return response
+
+
 def get_themes(templates_path):
     """Returns available themes list."""
     return os.listdir(templates_path)
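
Both helpers take plain objects, so they can be exercised without a running
Flask app.  A test-style sketch for get_json_response, using mock objects
shaped like SearchQuery and ResultContainer (the attribute values are
hypothetical):

    from unittest import mock
    from searx import webutils

    rc = mock.Mock(number_of_results=42, infoboxes=[], answers=set(),
                   corrections=set(), suggestions=set(), unresponsive_engines=set())
    rc.get_ordered_results.return_value = []
    sq = mock.Mock(query='searxng')

    print(webutils.get_json_response(sq, rc))
    # {"query": "searxng", "number_of_results": 42, "results": [], ...}
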
diff --git a/searxng_extra/standalone_searx.py b/searxng_extra/standalone_searx.py
index 0a1e18846..d7964bd04 100755
--- a/searxng_extra/standalone_searx.py
+++ b/searxng_extra/standalone_searx.py
@@ -60,7 +60,7 @@ Example to run it from python:
     "infoboxes": [ {...} ],
     "paging": true,
     "results": [... ],
-    "results_number": 820000000.0,
+    "number_of_results": 820000000.0,
     "search": {
         "lang": "all",
         "pageno": 1,
@@ -150,7 +150,7 @@ def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
         "suggestions": list(result_container.suggestions),
         "answers": list(result_container.answers),
         "paging": result_container.paging,
-        "results_number": result_container.results_number(),
+        "number_of_results": result_container.number_of_results,
     }
     return result_container_json
diff --git a/tests/unit/test_standalone_searx.py b/tests/unit/test_standalone_searx.py
index a3d8b4d4f..6a450485f 100644
--- a/tests/unit/test_standalone_searx.py
+++ b/tests/unit/test_standalone_searx.py
@@ -57,7 +57,7 @@ class StandaloneSearx(SearxTestCase):
                 'suggestions': [],
                 'answers': [],
                 'paging': False,
-                'results_number': 0,
+                'number_of_results': 0,
             },
         )

@@ -73,7 +73,7 @@ class StandaloneSearx(SearxTestCase):
                 'infoboxes': m_search.infoboxes,
                 'paging': m_search.paging,
                 'results': m_search.get_ordered_results(),
-                'results_number': m_search.results_number(),
+                'number_of_results': m_search.number_of_results,
                 'search': {
                     'lang': m_sq.lang,
                     'pageno': m_sq.pageno,
diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py
index 1c8f8a403..948b40aee 100644
--- a/tests/unit/test_webapp.py
+++ b/tests/unit/test_webapp.py
@@ -69,7 +69,7 @@ class ViewsTestCase(SearxTestCase):
             infoboxes=[],
             unresponsive_engines=set(),
             results=test_results,
-            results_number=lambda: 3,
+            number_of_results=3,
             results_length=lambda: len(test_results),
             get_timings=lambda: timings,
             redirect_url=None,
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index acf1aeeb7..244d2b180 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -64,7 +64,7 @@ class TestWebUtils(SearxTestCase):

 class TestUnicodeWriter(SearxTestCase):
     def setUp(self):
-        self.unicode_writer = webutils.UnicodeWriter(mock.MagicMock())
+        self.unicode_writer = webutils.CSVWriter(mock.MagicMock())

     def test_write_row(self):
         row = [1, 2, 3]
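
For callers of ResultContainer, the change is a renamed accessor; the clamping
that webapp.py previously did by hand now happens inside the property.  A
before/after fragment (result_container stands for any ResultContainer
instance):

    # before this patch:
    n = result_container.results_number()
    if n < result_container.results_length():
        n = 0

    # after this patch, the clamping is built in:
    n = result_container.number_of_results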