From fa1ef9a07b79ab740c127bac0d11b8315a5130ff Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 18 Jun 2023 16:43:48 +0200
Subject: [PATCH] [mod] move some code from webapp module to webutils module
 (no functional change)

Over the years the webapp module became more and more a mess.  To improve the
modularization a little, this patch moves some implementations from the webapp
module to the webutils module.

HINT: this patch brings no functional change

Signed-off-by: Markus Heiser
---
 searx/results.py                    |  17 +++-
 searx/webapp.py                     | 151 +++++++---------------------
 searx/webutils.py                   | 115 +++++++++++++++++++--
 searxng_extra/standalone_searx.py   |   4 +-
 tests/unit/test_standalone_searx.py |   4 +-
 tests/unit/test_webapp.py           |   2 +-
 tests/unit/test_webutils.py         |   2 +-
 7 files changed, 164 insertions(+), 131 deletions(-)

diff --git a/searx/results.py b/searx/results.py
index 5dd1bff21..caf02213d 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -6,6 +6,7 @@
 from typing import List, NamedTuple, Set
 from urllib.parse import urlparse, unquote

 from searx import logger
+from searx import utils
 from searx.engines import engines
 from searx.metrics import histogram_observe, counter_add, count_error

@@ -353,6 +354,10 @@
         for result in self._merged_results:
             score = result_score(result)
             result['score'] = score
+            if result.get('content'):
+                result['content'] = utils.html_to_text(result['content']).strip()
+            # removing html content and whitespace duplications
+            result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
             for result_engine in result['engines']:
                 counter_add(score, 'engine', result_engine, 'score')

@@ -415,11 +420,19 @@
     def results_length(self):
         return len(self._merged_results)

-    def results_number(self):
+    @property
+    def number_of_results(self) -> int:
+        """Returns the average of the result numbers reported by the engines;
+        returns zero if this average is smaller than the actual count of
+        results."""
+
         resultnum_sum = sum(self._number_of_results)
         if not resultnum_sum or not self._number_of_results:
             return 0
-        return resultnum_sum / len(self._number_of_results)
+
+        average = int(resultnum_sum / len(self._number_of_results))
+        if average < self.results_length():
+            average = 0
+        return average

     def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
         if engines[engine_name].display_error_messages:
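
The clamping in the new number_of_results property is easy to miss: the totals
reported by the engines are averaged, and an average that is smaller than the
number of results actually merged is treated as implausible and reported as
zero.  A standalone sketch of the arithmetic, with made-up numbers:

    # sketch of ResultContainer.number_of_results (hypothetical values)
    engine_totals = [1_000_000, 3_000_000]  # totals claimed by two engines
    merged_count = 25                       # results actually collected

    average = int(sum(engine_totals) / len(engine_totals))  # 2_000_000
    if average < merged_count:  # a claim below the merged count is discarded
        average = 0
    print(average)  # 2000000; had the claims averaged below 25, this prints 0
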
diff --git a/searx/webapp.py b/searx/webapp.py
index d6322447a..59c1dd1a1 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -58,7 +58,7 @@ from searx import (
 )
 from searx import infopage
 from searx.data import ENGINE_DESCRIPTIONS
-from searx.results import Timing, UnresponsiveEngine
+from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
 from searx.exceptions import SearxParameterException
@@ -68,18 +68,18 @@ from searx.engines import (
     engines,
     engine_shortcuts,
 )
+
+from searx import webutils
 from searx.webutils import (
-    UnicodeWriter,
     highlight_content,
     get_static_files,
     get_result_templates,
     get_themes,
-    prettify_url,
+    exception_classname_to_text,
     new_hmac,
     is_hmac_of,
     is_flask_run_cmdline,
     group_engines_in_tab,
-    searxng_l10n_timespan,
 )
 from searx.webadapter import (
     get_search_query_from_webapp,
@@ -87,7 +87,6 @@ from searx.webadapter import (
     parse_lang,
 )
 from searx.utils import (
-    html_to_text,
     gen_useragent,
     dict_subset,
 )
@@ -165,39 +164,6 @@ app.jinja_env.add_extension('jinja2.ext.loopcontrols')  # pylint: disable=no-member
 app.jinja_env.filters['group_engines_in_tab'] = group_engines_in_tab  # pylint: disable=no-member
 app.secret_key = settings['server']['secret_key']

-timeout_text = gettext('timeout')
-parsing_error_text = gettext('parsing error')
-http_protocol_error_text = gettext('HTTP protocol error')
-network_error_text = gettext('network error')
-ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
-exception_classname_to_text = {
-    None: gettext('unexpected crash'),
-    'timeout': timeout_text,
-    'asyncio.TimeoutError': timeout_text,
-    'httpx.TimeoutException': timeout_text,
-    'httpx.ConnectTimeout': timeout_text,
-    'httpx.ReadTimeout': timeout_text,
-    'httpx.WriteTimeout': timeout_text,
-    'httpx.HTTPStatusError': gettext('HTTP error'),
-    'httpx.ConnectError': gettext("HTTP connection error"),
-    'httpx.RemoteProtocolError': http_protocol_error_text,
-    'httpx.LocalProtocolError': http_protocol_error_text,
-    'httpx.ProtocolError': http_protocol_error_text,
-    'httpx.ReadError': network_error_text,
-    'httpx.WriteError': network_error_text,
-    'httpx.ProxyError': gettext("proxy error"),
-    'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
-    'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
-    'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
-    'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
-    'searx.exceptions.SearxEngineXPathException': parsing_error_text,
-    'KeyError': parsing_error_text,
-    'json.decoder.JSONDecodeError': parsing_error_text,
-    'lxml.etree.ParserError': parsing_error_text,
-    'ssl.SSLCertVerificationError': ssl_cert_error_text,  # for Python > 3.7
-    'ssl.CertificateError': ssl_cert_error_text,  # for Python 3.7
-}
-

 class ExtendedRequest(flask.Request):
     """This class is never initialized and only used for type checking."""
@@ -686,9 +652,7 @@ def search():
         search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
             request.preferences, request.form
         )
-        # search = Search(search_query) # without plugins
         search = SearchWithPlugins(search_query, request.user_plugins, request)  # pylint: disable=redefined-outer-name
-
        result_container = search.search()

     except SearxParameterException as e:
@@ -698,45 +662,54 @@
         logger.exception(e, exc_info=True)
         return index_error(output_format, gettext('search error')), 500

-    # results
-    results = result_container.get_ordered_results()
-    number_of_results = result_container.results_number()
-    if number_of_results < result_container.results_length():
-        number_of_results = 0
-
-    # checkin for a external bang
+    # 1. check if the result is a redirect for an external bang
     if result_container.redirect_url:
         return redirect(result_container.redirect_url)

-    # Server-Timing header
+    # 2. add Server-Timing header for measuring performance characteristics of
+    # web applications
     request.timings = result_container.get_timings()  # pylint: disable=assigning-non-slot

+    # 3. formats without a template
+
+    if output_format == 'json':
+
+        response = webutils.get_json_response(search_query, result_container)
+        return Response(response, mimetype='application/json')
+
+    if output_format == 'csv':
+
+        csv = webutils.CSVWriter(StringIO())
+        webutils.write_csv_response(csv, result_container)
+        csv.stream.seek(0)
+
+        response = Response(csv.stream.read(), mimetype='application/csv')
+        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
+        response.headers.add('Content-Disposition', cont_disp)
+        return response
+
+    # 4. formats rendered by a template / RSS & HTML
+
     current_template = None
     previous_result = None

-    # output
+    results = result_container.get_ordered_results()
     for result in results:
         if output_format == 'html':
             if 'content' in result and result['content']:
                 result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
             if 'title' in result and result['title']:
                 result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
-        else:
-            if result.get('content'):
-                result['content'] = html_to_text(result['content']).strip()
-            # removing html content and whitespace duplications
-            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

         if 'url' in result:
-            result['pretty_url'] = prettify_url(result['url'])
-
+            result['pretty_url'] = webutils.prettify_url(result['url'])
         if result.get('publishedDate'):  # do not try to get a date from an empty string or a None type
             try:  # test if publishedDate >= 1900 (datetime module bug)
                 result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
             except ValueError:
                 result['publishedDate'] = None
             else:
-                result['publishedDate'] = searxng_l10n_timespan(result['publishedDate'])
+                result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])

         # set result['open_group'] = True when the template changes from the previous result
         # set result['close_group'] = True when the template changes on the next result
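
The open_group/close_group flags computed after this hunk drive result grouping
in the HTML templates: a group opens when the template name differs from the
previous result's and closes when it changes again, or at the end of the list.
A minimal standalone sketch of the idea, with hypothetical result dicts:

    # group consecutive results that share a template (hypothetical data)
    results = [
        {'template': 'default.html'},
        {'template': 'images.html'},
        {'template': 'images.html'},
        {'template': 'default.html'},
    ]

    current_template = None
    previous_result = None
    for result in results:
        if current_template != result.get('template'):
            result['open_group'] = True                # a new group starts here
            if previous_result:
                previous_result['close_group'] = True  # ... the previous one ends
        current_template = result.get('template')
        previous_result = result
    if previous_result:
        previous_result['close_group'] = True          # close the last group

    # both images.html results now sit in one open_group/close_group pair
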
@@ -750,42 +723,7 @@
     if previous_result:
         previous_result['close_group'] = True

-    if output_format == 'json':
-        x = {
-            'query': search_query.query,
-            'number_of_results': number_of_results,
-            'results': results,
-            'answers': list(result_container.answers),
-            'corrections': list(result_container.corrections),
-            'infoboxes': result_container.infoboxes,
-            'suggestions': list(result_container.suggestions),
-            'unresponsive_engines': __get_translated_errors(result_container.unresponsive_engines),
-        }
-        response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
-        return Response(response, mimetype='application/json')
-
-    if output_format == 'csv':
-        csv = UnicodeWriter(StringIO())
-        keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
-        csv.writerow(keys)
-        for row in results:
-            row['host'] = row['parsed_url'].netloc
-            row['type'] = 'result'
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.answers:
-            row = {'title': a, 'type': 'answer'}
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.suggestions:
-            row = {'title': a, 'type': 'suggestion'}
-            csv.writerow([row.get(key, '') for key in keys])
-        for a in result_container.corrections:
-            row = {'title': a, 'type': 'correction'}
-            csv.writerow([row.get(key, '') for key in keys])
-        csv.stream.seek(0)
-        response = Response(csv.stream.read(), mimetype='application/csv')
-        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
-        response.headers.add('Content-Disposition', cont_disp)
-        return response
+    # 4.a RSS

     if output_format == 'rss':
         response_rss = render(
@@ -795,11 +733,11 @@
             corrections=result_container.corrections,
             suggestions=result_container.suggestions,
             q=request.form['q'],
-            number_of_results=number_of_results,
+            number_of_results=result_container.number_of_results,
         )
         return Response(response_rss, mimetype='text/xml')

-    # HTML output format
+    # 4.b HTML

     # suggestions: use RawTextQuery to get the suggestion URLs with the same bang
     suggestion_urls = list(
@@ -827,14 +765,14 @@
         selected_categories = search_query.categories,
         pageno = search_query.pageno,
         time_range = search_query.time_range or '',
-        number_of_results = format_decimal(number_of_results),
+        number_of_results = format_decimal(result_container.number_of_results),
         suggestions = suggestion_urls,
         answers = result_container.answers,
         corrections = correction_urls,
         infoboxes = result_container.infoboxes,
         engine_data = result_container.engine_data,
         paging = result_container.paging,
-        unresponsive_engines = __get_translated_errors(
+        unresponsive_engines = webutils.get_translated_errors(
             result_container.unresponsive_engines
         ),
         current_locale = request.preferences.get_value("locale"),
@@ -849,25 +787,6 @@
     )


-def __get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
-    translated_errors = []
-
-    # make a copy unresponsive_engines to avoid "RuntimeError: Set changed size
-    # during iteration" it happens when an engine modifies the ResultContainer
-    # after the search_multiple_requests method has stopped waiting
-
-    for unresponsive_engine in unresponsive_engines:
-        error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
-        if not error_user_text:
-            error_user_text = exception_classname_to_text[None]
-        error_msg = gettext(error_user_text)
-        if unresponsive_engine.suspended:
-            error_msg = gettext('Suspended') + ': ' + error_msg
-        translated_errors.append((unresponsive_engine.engine, error_msg))
-
-    return sorted(translated_errors, key=lambda e: e[0])
-
-
 @app.route('/about', methods=['GET'])
 def about():
     """Redirect to about page"""
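
The exception_classname_to_text mapping that this patch moves to webutils keys
on dotted exception class names, with the None entry acting as catch-all.  The
lookup pattern used by get_translated_errors (see the webutils hunk below) is
get() followed by a fallback; a standalone sketch, with a plain dict standing
in for the gettext-backed mapping:

    # fallback lookup as in get_translated_errors (stand-in dict, not the real one)
    exception_classname_to_text = {
        None: 'unexpected crash',
        'httpx.ReadTimeout': 'timeout',
        'KeyError': 'parsing error',
    }

    for error_type in ('httpx.ReadTimeout', 'httpx.SomeNewError', None):
        text = exception_classname_to_text.get(error_type) or exception_classname_to_text[None]
        print(error_type, '->', text)
    # httpx.ReadTimeout -> timeout
    # httpx.SomeNewError -> unexpected crash
    # None -> unexpected crash
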
diff --git a/searx/webutils.py b/searx/webutils.py
index 470833291..ddd9891bf 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -9,31 +9,80 @@
 import hmac
 import re
 import inspect
 import itertools
+import json
 from datetime import datetime, timedelta
 from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
 from io import StringIO
 from codecs import getincrementalencoder

-from flask_babel import gettext, format_date
+from flask_babel import gettext, format_date  # type: ignore

 from searx import logger, settings
 from searx.engines import DEFAULT_CATEGORY

 if TYPE_CHECKING:
     from searx.enginelib import Engine
-
+    from searx.results import ResultContainer
+    from searx.search import SearchQuery
+    from searx.results import UnresponsiveEngine

 VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')

 logger = logger.getChild('webutils')

+timeout_text = gettext('timeout')
+parsing_error_text = gettext('parsing error')
+http_protocol_error_text = gettext('HTTP protocol error')
+network_error_text = gettext('network error')
+ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
+exception_classname_to_text = {
+    None: gettext('unexpected crash'),
+    'timeout': timeout_text,
+    'asyncio.TimeoutError': timeout_text,
+    'httpx.TimeoutException': timeout_text,
+    'httpx.ConnectTimeout': timeout_text,
+    'httpx.ReadTimeout': timeout_text,
+    'httpx.WriteTimeout': timeout_text,
+    'httpx.HTTPStatusError': gettext('HTTP error'),
+    'httpx.ConnectError': gettext("HTTP connection error"),
+    'httpx.RemoteProtocolError': http_protocol_error_text,
+    'httpx.LocalProtocolError': http_protocol_error_text,
+    'httpx.ProtocolError': http_protocol_error_text,
+    'httpx.ReadError': network_error_text,
+    'httpx.WriteError': network_error_text,
+    'httpx.ProxyError': gettext("proxy error"),
+    'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
+    'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
+    'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
+    'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
+    'searx.exceptions.SearxEngineXPathException': parsing_error_text,
+    'KeyError': parsing_error_text,
+    'json.decoder.JSONDecodeError': parsing_error_text,
+    'lxml.etree.ParserError': parsing_error_text,
+    'ssl.SSLCertVerificationError': ssl_cert_error_text,  # for Python > 3.7
+    'ssl.CertificateError': ssl_cert_error_text,  # for Python 3.7
+}

-class UnicodeWriter:
-    """
-    A CSV writer which will write rows to CSV file "f",
-    which is encoded in the given encoding.
-    """
+
+def get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
+    translated_errors = []
+
+    for unresponsive_engine in unresponsive_engines:
+        error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
+        if not error_user_text:
+            error_user_text = exception_classname_to_text[None]
+        error_msg = gettext(error_user_text)
+        if unresponsive_engine.suspended:
+            error_msg = gettext('Suspended') + ': ' + error_msg
+        translated_errors.append((unresponsive_engine.engine, error_msg))
+
+    return sorted(translated_errors, key=lambda e: e[0])
+
+
+class CSVWriter:
+    """A CSV writer which will write rows to CSV file "f", which is encoded in
+    the given encoding."""

     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
         # Redirect output to a queue
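
CSVWriter keeps the behaviour of the old UnicodeWriter: rows are serialized
through an internal queue, re-encoded, and written to the stream passed to the
constructor.  A quick usage sketch mirroring what webapp.py does with a
StringIO stream (assumes a SearXNG checkout on the Python path):

    from io import StringIO
    from searx.webutils import CSVWriter

    writer = CSVWriter(StringIO())
    writer.writerow(['title', 'url'])
    writer.writerow(['SearXNG', 'https://docs.searxng.org'])
    writer.stream.seek(0)
    print(writer.stream.read())
    # title,url
    # SearXNG,https://docs.searxng.org
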
@@ -59,6 +108,58 @@
             self.writerow(row)


+def write_csv_response(csv: CSVWriter, rc: ResultContainer) -> None:
+    """Write the rows of the results of a query (``application/csv``) into a
+    CSV table (:py:obj:`CSVWriter`).  The first line of the table contains the
+    column names.  The column "type" specifies the type of the row; the
+    following types are included in the table:
+
+    - result
+    - answer
+    - suggestion
+    - correction
+
+    """
+
+    results = rc.get_ordered_results()
+    keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
+    csv.writerow(keys)
+
+    for row in results:
+        row['host'] = row['parsed_url'].netloc
+        row['type'] = 'result'
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.answers:
+        row = {'title': a, 'type': 'answer'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.suggestions:
+        row = {'title': a, 'type': 'suggestion'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+    for a in rc.corrections:
+        row = {'title': a, 'type': 'correction'}
+        csv.writerow([row.get(key, '') for key in keys])
+
+
+def get_json_response(sq: SearchQuery, rc: ResultContainer) -> str:
+    """Returns the JSON string of the results of a query (``application/json``)"""
+    results = rc.number_of_results
+    x = {
+        'query': sq.query,
+        'number_of_results': results,
+        'results': rc.get_ordered_results(),
+        'answers': list(rc.answers),
+        'corrections': list(rc.corrections),
+        'infoboxes': rc.infoboxes,
+        'suggestions': list(rc.suggestions),
+        'unresponsive_engines': get_translated_errors(rc.unresponsive_engines),
+    }
+    response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
+    return response
+
+
 def get_themes(templates_path):
     """Returns available themes list."""
     return os.listdir(templates_path)
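
Both helpers take plain objects, so they can be exercised without a running
Flask app.  A test-style sketch for get_json_response, using mock objects
shaped like SearchQuery and ResultContainer (the attribute values are
hypothetical):

    from unittest import mock
    from searx import webutils

    rc = mock.Mock(number_of_results=42, infoboxes=[], answers=set(),
                   corrections=set(), suggestions=set(), unresponsive_engines=set())
    rc.get_ordered_results.return_value = []
    sq = mock.Mock(query='searxng')

    print(webutils.get_json_response(sq, rc))
    # {"query": "searxng", "number_of_results": 42, "results": [], ...}
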
diff --git a/searxng_extra/standalone_searx.py b/searxng_extra/standalone_searx.py
index 0a1e18846..d7964bd04 100755
--- a/searxng_extra/standalone_searx.py
+++ b/searxng_extra/standalone_searx.py
@@ -60,7 +60,7 @@ Example to run it from python:
     "infoboxes": [ {...} ],
     "paging": true,
     "results": [... ],
-    "results_number": 820000000.0,
+    "number_of_results": 820000000.0,
     "search": {
         "lang": "all",
         "pageno": 1,
@@ -150,7 +150,7 @@ def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
         "suggestions": list(result_container.suggestions),
         "answers": list(result_container.answers),
         "paging": result_container.paging,
-        "results_number": result_container.results_number(),
+        "number_of_results": result_container.number_of_results,
     }
     return result_container_json
diff --git a/tests/unit/test_standalone_searx.py b/tests/unit/test_standalone_searx.py
index a3d8b4d4f..6a450485f 100644
--- a/tests/unit/test_standalone_searx.py
+++ b/tests/unit/test_standalone_searx.py
@@ -57,7 +57,7 @@ class StandaloneSearx(SearxTestCase):
                 'suggestions': [],
                 'answers': [],
                 'paging': False,
-                'results_number': 0,
+                'number_of_results': 0,
             },
         )

@@ -73,7 +73,7 @@ class StandaloneSearx(SearxTestCase):
                 'infoboxes': m_search.infoboxes,
                 'paging': m_search.paging,
                 'results': m_search.get_ordered_results(),
-                'results_number': m_search.results_number(),
+                'number_of_results': m_search.number_of_results,
                 'search': {
                     'lang': m_sq.lang,
                     'pageno': m_sq.pageno,
diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py
index 1c8f8a403..948b40aee 100644
--- a/tests/unit/test_webapp.py
+++ b/tests/unit/test_webapp.py
@@ -69,7 +69,7 @@ class ViewsTestCase(SearxTestCase):
             infoboxes=[],
             unresponsive_engines=set(),
             results=test_results,
-            results_number=lambda: 3,
+            number_of_results=3,
             results_length=lambda: len(test_results),
             get_timings=lambda: timings,
             redirect_url=None,
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index acf1aeeb7..244d2b180 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -64,7 +64,7 @@ class TestWebUtils(SearxTestCase):

 class TestUnicodeWriter(SearxTestCase):
     def setUp(self):
-        self.unicode_writer = webutils.UnicodeWriter(mock.MagicMock())
+        self.unicode_writer = webutils.CSVWriter(mock.MagicMock())

     def test_write_row(self):
         row = [1, 2, 3]
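
For callers of ResultContainer, the change is a renamed accessor; the clamping
that webapp.py previously did by hand now happens inside the property.  A
before/after fragment (result_container stands for any ResultContainer
instance):

    # before this patch:
    n = result_container.results_number()
    if n < result_container.results_length():
        n = 0

    # after this patch, the clamping is built in:
    n = result_container.number_of_results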