mirror of
https://github.com/searxng/searxng.git
synced 2024-12-22 17:26:30 +00:00
[enh] checker: background check
See settings.yml for the options. The SIGUSR1 signal starts the checker. The result is available at /stats/checker.
This commit is contained in:
parent
6e2872f436
commit
3a9f513521
9 changed files with 255 additions and 97 deletions
|
@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
|
|||
from searx.results import ResultContainer
|
||||
from searx import logger
|
||||
from searx.plugins import plugins
|
||||
from searx.search.models import EngineRef, SearchQuery
|
||||
from searx.search.processors import processors, initialize as initialize_processors
|
||||
from searx.search.checker import initialize as initialize_checker
|
||||
|
||||
|
||||
logger = logger.getChild('search')
|
||||
|
@ -45,75 +47,11 @@ else:
|
|||
sys.exit(1)
|
||||
|
||||
|
||||
def initialize(settings_engines=None):
|
||||
def initialize(settings_engines=None, enable_checker=False):
|
||||
settings_engines = settings_engines or settings['engines']
|
||||
initialize_processors(settings_engines)
|
||||
|
||||
|
||||
class EngineRef:
|
||||
|
||||
__slots__ = 'name', 'category'
|
||||
|
||||
def __init__(self, name: str, category: str):
|
||||
self.name = name
|
||||
self.category = category
|
||||
|
||||
def __repr__(self):
|
||||
return "EngineRef({!r}, {!r})".format(self.name, self.category)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.name == other.name and self.category == other.category
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.name, self.category))
|
||||
|
||||
|
||||
class SearchQuery:
|
||||
"""container for all the search parameters (query, language, etc...)"""
|
||||
|
||||
__slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
|
||||
'timeout_limit', 'external_bang'
|
||||
|
||||
def __init__(self,
|
||||
query: str,
|
||||
engineref_list: typing.List[EngineRef],
|
||||
lang: str='all',
|
||||
safesearch: int=0,
|
||||
pageno: int=1,
|
||||
time_range: typing.Optional[str]=None,
|
||||
timeout_limit: typing.Optional[float]=None,
|
||||
external_bang: typing.Optional[str]=None):
|
||||
self.query = query
|
||||
self.engineref_list = engineref_list
|
||||
self.lang = lang
|
||||
self.safesearch = safesearch
|
||||
self.pageno = pageno
|
||||
self.time_range = time_range
|
||||
self.timeout_limit = timeout_limit
|
||||
self.external_bang = external_bang
|
||||
|
||||
@property
|
||||
def categories(self):
|
||||
return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
|
||||
|
||||
def __repr__(self):
|
||||
return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
|
||||
format(self.query, self.engineref_list, self.lang, self.safesearch,
|
||||
self.pageno, self.time_range, self.timeout_limit, self.external_bang)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.query == other.query\
|
||||
and self.engineref_list == other.engineref_list\
|
||||
and self.lang == other.lang\
|
||||
and self.safesearch == other.safesearch\
|
||||
and self.pageno == other.pageno\
|
||||
and self.time_range == other.time_range\
|
||||
and self.timeout_limit == other.timeout_limit\
|
||||
and self.external_bang == other.external_bang
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
|
||||
self.timeout_limit, self.external_bang))
|
||||
if enable_checker:
|
||||
initialize_checker()
|
||||
|
||||
|
||||
class Search:
|
||||
|
|
|
@ -1 +1,4 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
from .impl import Checker
|
||||
from .background import initialize, get_result
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import searx.search
|
||||
import searx.search.processors
|
||||
import searx.search.checker
|
||||
from searx.search import processors
|
||||
from searx.engines import engine_shortcuts
|
||||
|
||||
|
||||
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
|
||||
|
@ -18,20 +22,24 @@ else:
|
|||
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
|
||||
|
||||
|
||||
def iter_processor():
|
||||
if len(sys.argv) > 1:
|
||||
for name, processor in searx.search.processors.items():
|
||||
if name in sys.argv:
|
||||
def iter_processor(engine_name_list):
    """Yield ``(name, processor)`` pairs for the engines to check.

    With an empty ``engine_name_list`` every registered processor is yielded.
    Otherwise each entry is first resolved through ``engine_shortcuts`` and
    looked up in ``processors``; unknown names are reported on stdout and
    skipped.
    """
    if len(engine_name_list) == 0:
        # no explicit selection: iterate over all the processors
        yield from searx.search.processors.items()
        return

    for requested in engine_name_list:
        name = engine_shortcuts.get(requested, requested)
        processor = processors.get(name)
        if processor is None:
            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ)
            continue
        yield name, processor
|
||||
|
||||
|
||||
def main():
|
||||
def run(engine_name_list):
|
||||
searx.search.initialize()
|
||||
broken_urls = []
|
||||
for name, processor in iter_processor():
|
||||
for name, processor in iter_processor(engine_name_list):
|
||||
if sys.stdout.isatty():
|
||||
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
|
||||
checker = searx.search.checker.Checker(processor)
|
||||
|
@ -48,5 +56,13 @@ def main():
|
|||
print('Error fetching', url)
|
||||
|
||||
|
||||
def main():
    """Command line entry point: parse the engine names and run the checks.

    Positional arguments are engine names or shortcuts; when none is given,
    the empty list is passed to ``run``.
    """
    arg_parser = argparse.ArgumentParser(description='Check searx engines.')
    arg_parser.add_argument(
        'engine_name_list',
        metavar='engine name',
        type=str,
        nargs='*',
        help='engines name or shortcut list. Empty for all engines.',
    )
    run(arg_parser.parse_args().engine_name_list)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
106
searx/search/checker/background.py
Normal file
106
searx/search/checker/background.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import threading
|
||||
import os
|
||||
import signal
|
||||
|
||||
from searx import logger, settings, searx_debug
|
||||
from searx.exceptions import SearxSettingsException
|
||||
from searx.search.processors import processors
|
||||
from searx.search.checker import Checker
|
||||
from searx.shared import schedule, storage
|
||||
|
||||
|
||||
CHECKER_RESULT = 'CHECKER_RESULT'
|
||||
running = threading.Lock()
|
||||
|
||||
|
||||
def _get_interval(every, error_msg):
|
||||
if isinstance(every, int):
|
||||
every = (every, every)
|
||||
if not isinstance(every, (tuple, list))\
|
||||
or len(every) != 2\
|
||||
or not isinstance(every[0], int)\
|
||||
or not isinstance(every[1], int):
|
||||
raise SearxSettingsException(error_msg, None)
|
||||
return every
|
||||
|
||||
|
||||
def _get_every():
    """Return the validated ``checker.scheduling.every`` interval.

    Falls back to ``(300, 1800)`` when the setting is missing.
    """
    checker_settings = settings.get('checker', {})
    scheduling = checker_settings.get('scheduling', {})
    return _get_interval(
        scheduling.get('every', (300, 1800)),
        'checker.scheduling.every is not a int or list',
    )
|
||||
|
||||
|
||||
def get_result():
    """Return the last checker result read from the shared storage.

    :returns: the deserialized JSON result, or ``None`` when nothing is
        stored under the ``CHECKER_RESULT`` key.
    """
    # use the module constant so the reader and the writer share one key
    serialized_result = storage.get_str(CHECKER_RESULT)
    if serialized_result is None:
        return None
    return json.loads(serialized_result)
|
||||
|
||||
|
||||
def run():
    """Check every processor and store the JSON-serialized result.

    Only one check runs at a time: when the ``running`` lock is already held,
    the call returns immediately without doing anything.

    The result maps each engine name to ``{'status': True}`` or to
    ``{'status': False, 'errors': ...}`` and is written to the shared storage
    under the ``CHECKER_RESULT`` key. The stored value is only replaced when
    the whole run completes.
    """
    if not running.acquire(blocking=False):
        # another check is in progress
        return
    try:
        logger.info('Starting checker')
        result = {}
        for name, processor in processors.items():
            logger.debug('Checking %s engine', name)
            checker = Checker(processor)
            checker.run()
            if checker.test_results.succesfull:
                result[name] = {'status': True}
            else:
                result[name] = {'status': False, 'errors': checker.test_results.errors}

        storage.set_str(CHECKER_RESULT, json.dumps(result))
        logger.info('Check done')
    except Exception:  # boundary of the background job
        # run() is the target of background threads and of the scheduler:
        # nobody is left to handle the error, so report it through the logger
        logger.exception('Error while running the checker')
    finally:
        running.release()
|
||||
|
||||
|
||||
def _run_with_delay():
    """Sleep for a random part of the ``every`` interval, then run the checker.

    The delay is drawn between 0 and the width of the interval (upper bound
    minus lower bound), in seconds.
    """
    lower, upper = _get_every()
    delay = random.randint(0, upper - lower)
    logger.debug('Start checker in %i seconds', delay)
    time.sleep(delay)
    run()
|
||||
|
||||
|
||||
def _start_scheduling():
    """Register the periodic check, then run a first check right away.

    ``_run_with_delay`` is scheduled with the lower bound of the ``every``
    interval.
    """
    interval = _get_every()[0]
    schedule(interval, _run_with_delay)
    run()
|
||||
|
||||
|
||||
def _signal_handler(signum, frame):
    """Signal handler: start the checker in a daemon thread.

    ``signum`` and ``frame`` are required by the handler signature and are
    not used.
    """
    worker = threading.Thread(target=run)
    worker.daemon = True
    worker.start()
|
||||
|
||||
|
||||
def initialize():
    """Install the SIGUSR1 handler and start the checker scheduling.

    * When the platform defines ``SIGUSR1``, the signal starts a check.
    * In debug mode nothing is scheduled unless ``checker.off_when_debug``
      is false.
    * When ``checker.scheduling`` is missing or false, nothing is scheduled.
    * Otherwise the scheduling starts after a random delay picked in
      ``checker.scheduling.start_after`` (default ``(300, 1800)`` seconds).

    :raises SearxSettingsException: when ``checker.scheduling.every`` or
        ``checker.scheduling.start_after`` is not a valid interval.
    """
    if hasattr(signal, 'SIGUSR1'):
        # SIGUSR1 is not defined on every platform (e.g. Windows)
        logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
        signal.signal(signal.SIGUSR1, _signal_handler)

    # special case when debug is activated
    if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
        logger.info('debug mode: checker is disabled')
        return

    # "scheduling: False" (or no scheduling section) disables the scheduler
    scheduling = settings.get('checker', {}).get('scheduling', None)
    if not scheduling:
        logger.info('Checker scheduler is disabled')
        return

    # check the value of checker.scheduling.every now: otherwise an invalid
    # value is only detected later, inside the timer thread
    _get_every()

    # delay before the first run of the checker
    start_after = scheduling.get('start_after', (300, 1800))
    start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
    delay = random.randint(start_after[0], start_after[1])
    logger.info('Start checker in %i seconds', delay)
    t = threading.Timer(delay, _start_scheduling)
    t.daemon = True
    t.start()
|
|
@ -1,3 +1,5 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import typing
|
||||
import types
|
||||
import functools
|
||||
|
@ -11,7 +13,7 @@ import requests.exceptions
|
|||
|
||||
from searx import poolrequests, logger
|
||||
from searx.results import ResultContainer
|
||||
from searx.search import SearchQuery, EngineRef
|
||||
from searx.search.models import SearchQuery, EngineRef
|
||||
from searx.search.processors import EngineProcessor
|
||||
|
||||
|
||||
|
@ -240,18 +242,24 @@ class ResultContainerTests:
|
|||
self._check_infoboxes(self.result_container.infoboxes)
|
||||
|
||||
def has_infobox(self):
    """Record a 'No infobox' error when the ResultContainer has no infobox"""
    infoboxes = self.result_container.infoboxes
    if len(infoboxes) > 0:
        return
    self._record_error('No infobox')
|
||||
|
||||
def has_answer(self):
    """Record a 'No answer' error when the ResultContainer has no answer"""
    answers = self.result_container.answers
    if len(answers) > 0:
        return
    self._record_error('No answer')
|
||||
|
||||
def has_language(self, lang):
    """Record an error when `lang` is not among the detected languages.

    The check passes when at least one title or content of the results is
    written in `lang`. Detected using pycld3, may be not accurate"""
    if lang in self.languages:
        return
    self._record_error(lang + ' not found')
|
||||
|
||||
def not_empty(self):
|
||||
"""Check the ResultContainer has at least one answer or infobox or result"""
|
||||
result_types = set()
|
||||
results = self.result_container.get_ordered_results()
|
||||
if len(results) > 0:
|
||||
|
@ -267,6 +275,7 @@ class ResultContainerTests:
|
|||
self._record_error('No result')
|
||||
|
||||
def one_title_contains(self, title: str):
|
||||
"""Check one of the title contains `title` (case insensitive comparaison)"""
|
||||
title = title.lower()
|
||||
for result in self.result_container.get_ordered_results():
|
||||
if title in result['title'].lower():
|
||||
|
@ -287,6 +296,7 @@ class CheckerTests:
|
|||
self.result_container_tests_list = result_container_tests_list
|
||||
|
||||
def unique_results(self):
|
||||
"""Check the results of each ResultContain is unique"""
|
||||
urls_list = [rct.result_urls for rct in self.result_container_tests_list]
|
||||
if len(urls_list[0]) > 0:
|
||||
# results on the first page
|
||||
|
|
69
searx/search/models.py
Normal file
69
searx/search/models.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import typing
|
||||
|
||||
|
||||
class EngineRef:
    """Pair of an engine name and a category.

    Instances compare and hash by value (``name`` and ``category``).
    """

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        # let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on other.name
        if not isinstance(other, EngineRef):
            return NotImplemented
        return self.name == other.name and self.category == other.category

    def __hash__(self):
        return hash((self.name, self.category))
|
||||
|
||||
|
||||
class SearchQuery:
    """container for all the search parameters (query, language, etc...)

    Instances compare and hash by value over all their attributes.
    """

    __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
                'timeout_limit', 'external_bang'

    def __init__(self,
                 query: str,
                 engineref_list: typing.List['EngineRef'],
                 lang: str='all',
                 safesearch: int=0,
                 pageno: int=1,
                 time_range: typing.Optional[str]=None,
                 timeout_limit: typing.Optional[float]=None,
                 external_bang: typing.Optional[str]=None):
        self.query = query
        self.engineref_list = engineref_list
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range
        self.timeout_limit = timeout_limit
        self.external_bang = external_bang

    @property
    def categories(self):
        """List of the distinct categories of ``engineref_list`` (unordered)."""
        return list({engineref.category for engineref in self.engineref_list})

    def __repr__(self):
        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
               format(self.query, self.engineref_list, self.lang, self.safesearch,
                      self.pageno, self.time_range, self.timeout_limit, self.external_bang)

    def __eq__(self, other):
        # let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on other.query
        if not isinstance(other, SearchQuery):
            return NotImplemented
        return self.query == other.query\
            and self.engineref_list == other.engineref_list\
            and self.lang == other.lang\
            and self.safesearch == other.safesearch\
            and self.pageno == other.pageno\
            and self.time_range == other.time_range\
            and self.timeout_limit == other.timeout_limit\
            and self.external_bang == other.external_bang

    def __hash__(self):
        # engineref_list is a list: convert it to a tuple to make it hashable
        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
                     self.timeout_limit, self.external_bang))
|
|
@ -102,24 +102,33 @@ outgoing: # communication with search engines
|
|||
# - "HTTPS rewrite"
|
||||
# - ...
|
||||
|
||||
additional_tests:
|
||||
rosebud: &test_rosebud
|
||||
matrix:
|
||||
query: rosebud
|
||||
lang: en
|
||||
result_container:
|
||||
- not_empty
|
||||
- [one_title_contains', 'citizen kane']
|
||||
test:
|
||||
- unique_results
|
||||
|
||||
tests:
|
||||
infobox: &tests_infobox
|
||||
infobox:
|
||||
matrix:
|
||||
query: ["linux", "new york", "bbc"]
|
||||
result_container:
|
||||
- has_infobox
|
||||
checker:
|
||||
# disable checker when in debug mode
|
||||
off_when_debug: True
|
||||
# scheduling: interval or int
|
||||
# use "scheduling: False" to disable scheduling
|
||||
scheduling:
|
||||
start_after: [300, 1800] # delay to start the first run of the checker
|
||||
every: [86400, 90000] # how often the checker runs
|
||||
# additional tests: only for the YAML anchors (see the engines section)
|
||||
additional_tests:
|
||||
rosebud: &test_rosebud
|
||||
matrix:
|
||||
query: rosebud
|
||||
lang: en
|
||||
result_container:
|
||||
- not_empty
|
||||
- ['one_title_contains', 'citizen kane']
|
||||
test:
|
||||
- unique_results
|
||||
# tests: only for the YAML anchors (see the engines section)
|
||||
tests:
|
||||
infobox: &tests_infobox
|
||||
infobox:
|
||||
matrix:
|
||||
query: ["linux", "new york", "bbc"]
|
||||
result_container:
|
||||
- has_infobox
|
||||
|
||||
engines:
|
||||
- name: apk mirror
|
||||
|
|
|
@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
|
|||
from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
|
||||
from searx.version import VERSION_STRING
|
||||
from searx.languages import language_codes as languages
|
||||
from searx.search import SearchWithPlugins, initialize
|
||||
from searx.search import SearchWithPlugins, initialize as search_initialize
|
||||
from searx.search.checker import get_result as checker_get_result
|
||||
from searx.query import RawTextQuery
|
||||
from searx.autocomplete import searx_bang, backends as autocomplete_backends
|
||||
from searx.plugins import plugins
|
||||
|
@ -81,7 +82,6 @@ from searx.answerers import answerers
|
|||
from searx.poolrequests import get_global_proxies
|
||||
from searx.metrology.error_recorder import errors_per_engines
|
||||
|
||||
|
||||
# serve pages with HTTP/1.1
|
||||
from werkzeug.serving import WSGIRequestHandler
|
||||
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
|
||||
|
@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
|
|||
# initialize the engines except on the first run of the werkzeug server.
|
||||
if not werkzeug_reloader\
|
||||
or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
|
||||
initialize()
|
||||
search_initialize(enable_checker=True)
|
||||
|
||||
babel = Babel(app)
|
||||
|
||||
|
@ -977,6 +977,12 @@ def stats_errors():
|
|||
return jsonify(result)
|
||||
|
||||
|
||||
@app.route('/stats/checker', methods=['GET'])
def stats_checker():
    """Return the checker result (see ``checker_get_result``) as JSON."""
    return jsonify(checker_get_result())
|
||||
|
||||
|
||||
@app.route('/robots.txt', methods=['GET'])
|
||||
def robots():
|
||||
return Response("""User-agent: *
|
||||
|
|
3
setup.py
3
setup.py
|
@ -49,7 +49,8 @@ setup(
|
|||
},
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'searx-run = searx.webapp:run'
|
||||
'searx-run = searx.webapp:run',
|
||||
'searx-checker = searx.search.checker.__main__:main'
|
||||
]
|
||||
},
|
||||
package_data={
|
||||
|
|
Loading…
Reference in a new issue