Merge pull request #2419 from dalf/checker

[enh] add checker
This commit is contained in:
Alexandre Flament 2021-01-13 15:46:48 +01:00 committed by GitHub
commit 484dc99580
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
23 changed files with 1014 additions and 70 deletions

View file

@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \
openssl-dev \
tar \
git \
protoc \
protobuf-dev \
&& apk add --no-cache \
ca-certificates \
su-exec \
@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \
uwsgi \
uwsgi-python3 \
brotli \
protobuf \
&& pip3 install --upgrade pip \
&& pip3 install --no-cache -r requirements.txt \
&& apk del build-dependencies \

View file

@ -42,3 +42,6 @@ static-map = /static=/usr/local/searx/searx/static
static-expires = /* 864000
static-gzip-all = True
offload-threads = %k
# Cache
cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1

View file

@ -9,3 +9,4 @@ pygments==2.1.3
python-dateutil==2.8.1
pyyaml==5.3.1
requests[socks]==2.25.1
pycld3==0.20

View file

@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
from searx.search.models import EngineRef, SearchQuery
from searx.search.processors import processors, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker
logger = logger.getChild('search')
@ -45,68 +47,11 @@ else:
sys.exit(1)
def initialize(settings_engines=None):
def initialize(settings_engines=None, enable_checker=False):
settings_engines = settings_engines or settings['engines']
initialize_processors(settings_engines)
class EngineRef:
__slots__ = 'name', 'category'
def __init__(self, name: str, category: str):
self.name = name
self.category = category
def __repr__(self):
return "EngineRef({!r}, {!r})".format(self.name, self.category)
def __eq__(self, other):
return self.name == other.name and self.category == other.category
class SearchQuery:
"""container for all the search parameters (query, language, etc...)"""
__slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
'timeout_limit', 'external_bang'
def __init__(self,
query: str,
engineref_list: typing.List[EngineRef],
lang: str='all',
safesearch: int=0,
pageno: int=1,
time_range: typing.Optional[str]=None,
timeout_limit: typing.Optional[float]=None,
external_bang: typing.Optional[str]=None):
self.query = query
self.engineref_list = engineref_list
self.lang = lang
self.safesearch = safesearch
self.pageno = pageno
self.time_range = time_range
self.timeout_limit = timeout_limit
self.external_bang = external_bang
@property
def categories(self):
return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
def __repr__(self):
return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
format(self.query, self.engineref_list, self.lang, self.safesearch,
self.pageno, self.time_range, self.timeout_limit, self.external_bang)
def __eq__(self, other):
return self.query == other.query\
and self.engineref_list == other.engineref_list\
and self.lang == other.lang\
and self.safesearch == other.safesearch\
and self.pageno == other.pageno\
and self.time_range == other.time_range\
and self.timeout_limit == other.timeout_limit\
and self.external_bang == other.external_bang
if enable_checker:
initialize_checker()
class Search:

View file

@ -0,0 +1,4 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from .impl import Checker
from .background import initialize, get_result

View file

@ -0,0 +1,94 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import sys
import io
import os
import argparse
import logging
import searx.search
import searx.search.checker
from searx.search import processors
from searx.engines import engine_shortcuts
# configure logging: the root logger ends up with a single handler writing to stdout
root = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
# iterate over a copy: removeHandler() mutates root.handlers, and removing
# entries from the list being iterated would skip some of the handlers
for h in list(root.handlers):
    root.removeHandler(h)
root.addHandler(handler)

# color only for a valid terminal
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
    RESET_SEQ = "\033[0m"
    COLOR_SEQ = "\033[1;%dm"
    BOLD_SEQ = "\033[1m"
    # ANSI foreground colors 30..37, in the order of the names below
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8))
else:
    # no escape sequence at all when the output is not a capable terminal
    RESET_SEQ = ""
    COLOR_SEQ = ""
    BOLD_SEQ = ""
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""

# equivalent of 'python -u' (unbuffered stdout, stderr)
stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True)
stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True)
# iterator of processors
def iter_processor(engine_name_list):
    """Yield ``(name, processor)`` for each engine to check.

    :param engine_name_list: engine names or shortcuts; when empty, every
        registered processor is yielded.

    A shortcut is resolved to the engine name through ``engine_shortcuts``.
    A name without processor is reported on stdout and skipped.
    """
    if len(engine_name_list) > 0:
        for name in engine_name_list:
            name = engine_shortcuts.get(name, name)
            processor = processors.get(name)
            if processor is not None:
                yield name, processor
            else:
                # terminate the line, otherwise the next message is glued to this one
                stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}\n')
    else:
        for name, processor in searx.search.processors.items():
            yield name, processor
# actual check & display
def run(engine_name_list, verbose):
    """Check the selected engines and print one status line per engine.

    :param engine_name_list: engine names or shortcuts, empty for all engines.
    :param verbose: when true, also print the detected languages and, for a
        failing engine, every logged error with its arguments.
    """
    searx.search.initialize()
    for name, processor in iter_processor(engine_name_list):
        stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
        if not sys.stdout.isatty():
            # stdout is redirected: repeat the progress line on stderr
            stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
        checker = searx.search.checker.Checker(processor)
        checker.run()
        if checker.test_results.succesfull:
            stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n')
            if verbose:
                stdout.write(f' {"found languages":15}: {" ".join(sorted(checker.test_results.languages))}\n')
        else:
            stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}')
            if not verbose:
                errors = [test_name + ': ' + error for test_name, error in checker.test_results]
                stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n')
            else:
                stdout.write('\n')
                stdout.write(f' {"found languages":15}: {" ".join(sorted(checker.test_results.languages))}\n')
                for test_name, logs in checker.test_results.logs.items():
                    for log in logs:
                        # a log entry is (message, *args) and the args are not
                        # always strings (a missing img_src is logged as None)
                        stdout.write(f' {test_name:15}: {RED}{" ".join(map(str, log))}{RESET_SEQ}\n')
# call by setup.py
def main():
    """Command line entry point: parse the arguments, then run the checks."""
    arg_parser = argparse.ArgumentParser(description='Check searx engines.')
    arg_parser.add_argument(
        'engine_name_list',
        metavar='engine name',
        type=str,
        nargs='*',
        help='engines name or shortcut list. Empty for all engines.',
    )
    arg_parser.add_argument(
        '--verbose', '-v',
        dest='verbose',
        action='store_true',
        default=False,
        help='Display details about the test results',
    )
    options = arg_parser.parse_args()
    run(options.engine_name_list, options.verbose)


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,123 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import json
import random
import time
import threading
import os
import signal
from searx import logger, settings, searx_debug
from searx.exceptions import SearxSettingsException
from searx.search.processors import processors
from searx.search.checker import Checker
from searx.shared import schedule, storage
# key under which the JSON-serialized checker result is kept in the shared storage
CHECKER_RESULT = 'CHECKER_RESULT'
# taken by run() for the whole duration of a check; run() returns at once when it is already held
running = threading.Lock()
def _get_interval(every, error_msg):
if isinstance(every, int):
every = (every, every)
if not isinstance(every, (tuple, list))\
or len(every) != 2\
or not isinstance(every[0], int)\
or not isinstance(every[1], int):
raise SearxSettingsException(error_msg, None)
return every
def _get_every():
    """Return the ``checker.scheduling.every`` setting as a pair, (300, 1800) when unset."""
    checker_settings = settings.get('checker', {})
    scheduling = checker_settings.get('scheduling', {})
    configured = scheduling.get('every', (300, 1800))
    return _get_interval(configured, 'checker.scheduling.every is not a int or list')
def get_result():
    """Return the last stored checker result decoded from JSON, or None when nothing is stored."""
    raw = storage.get_str(CHECKER_RESULT)
    if raw is None:
        return None
    return json.loads(raw)
def _set_result(result, include_timestamp=True):
    """Serialize ``result`` to JSON and store it under CHECKER_RESULT.

    With ``include_timestamp``, a ``timestamp`` key is first added to
    ``result`` (the dict is modified in place): the current time rounded down
    to the hour.
    """
    if include_timestamp:
        now = time.time()
        result['timestamp'] = int(now / 3600) * 3600
    serialized = json.dumps(result)
    storage.set_str(CHECKER_RESULT, serialized)
def run():
    """Check every processor and store the outcome with _set_result().

    Returns immediately when another check holds the ``running`` lock. Any
    exception raised while checking is logged and stored as an ``error``
    status; the lock is released in all cases.
    """
    acquired = running.acquire(blocking=False)
    if not acquired:
        return
    try:
        logger.info('Starting checker')
        engine_results = {}
        result = {
            'status': 'ok',
            'engines': engine_results,
        }
        for name, processor in processors.items():
            logger.debug('Checking %s engine', name)
            checker = Checker(processor)
            checker.run()
            outcome = checker.test_results
            if outcome.succesfull:
                engine_results[name] = {'success': True}
            else:
                engine_results[name] = {'success': False, 'errors': outcome.errors}
        _set_result(result)
        logger.info('Check done')
    except Exception:
        _set_result({'status': 'error'})
        logger.exception('Error while running the checker')
    finally:
        running.release()
def _run_with_delay():
    """Sleep a random number of seconds within the width of the ``every`` interval, then run the checker."""
    lower, upper = _get_every()
    delay = random.randint(0, upper - lower)
    logger.debug('Start checker in %i seconds', delay)
    time.sleep(delay)
    run()
def _start_scheduling():
    """Register the periodic check and, when the registration is accepted, run a first check now."""
    period = _get_every()[0]
    scheduled = schedule(period, _run_with_delay)
    if scheduled:
        run()
def _signal_handler(signum, frame):
    """Signal handler: start a check in a daemon thread so the handler itself returns immediately."""
    worker = threading.Thread(target=run, daemon=True)
    worker.start()
def initialize():
    """Install the SIGUSR1 handler and, when scheduling is configured, arm the first run.

    The stored status is ``disabled`` unless scheduling is enabled, in which
    case it becomes ``unknown`` until the first check completes.
    """
    logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
    signal.signal(signal.SIGUSR1, _signal_handler)

    # disabled until proven otherwise
    _set_result({'status': 'disabled'})

    # special case when debug mode is activated
    if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
        logger.info('debug mode: checker is disabled')
        return

    # nothing to schedule when checker.scheduling is missing or falsy
    scheduling = settings.get('checker', {}).get('scheduling', None)
    if not scheduling:
        logger.info('Checker scheduler is disabled')
        return

    # scheduling is enabled but no check has been run yet
    _set_result({'status': 'unknown'}, include_timestamp=False)

    configured = scheduling.get('start_after', (300, 1800))
    earliest, latest = _get_interval(configured, 'checker.scheduling.start_after is not a int or list')
    delay = random.randint(earliest, latest)
    logger.info('Start checker in %i seconds', delay)
    timer = threading.Timer(delay, _start_scheduling)
    timer.daemon = True
    timer.start()

View file

@ -0,0 +1,406 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
import types
import functools
import itertools
from time import time
from urllib.parse import urlparse
import re
import cld3
import requests.exceptions
from searx import poolrequests, logger
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
logger = logger.getChild('searx.search.checker')
HTML_TAGS = [
'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
'frame', 'frameset'
]
def get_check_no_html():
rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS]
rep += ['</' + tag + '>' for tag in HTML_TAGS]
pattern = re.compile('|'.join(rep))
def f(text):
return pattern.search(text.lower()) is None
return f
_check_no_html = get_check_no_html()
def _is_url(url):
try:
result = urlparse(url)
except ValueError:
return False
if result.scheme not in ('http', 'https'):
return False
return True
@functools.lru_cache(maxsize=8192)
def _is_url_image(image_url):
if not isinstance(image_url, str):
return False
if image_url.startswith('//'):
image_url = 'https:' + image_url
if image_url.startswith('data:'):
return image_url.startswith('data:image/')
if not _is_url(image_url):
return False
retry = 2
while retry > 0:
a = time()
try:
poolrequests.set_timeout_for_thread(10.0, time())
r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-GPC': '1',
'Cache-Control': 'max-age=0'
})
if r.headers["content-type"].startswith('image/'):
return True
return False
except requests.exceptions.Timeout:
logger.error('Timeout for %s: %i', image_url, int(time() - a))
retry -= 1
except requests.exceptions.RequestException:
logger.exception('Exception for %s', image_url)
return False
def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
    """Return the query, lang, pageno, safesearch and time_range of ``search_query`` as a dict."""
    fields = ('query', 'lang', 'pageno', 'safesearch', 'time_range')
    return {field: getattr(search_query, field) for field in fields}
def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\
        -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
    """Compare the parameters of two search queries.

    Returns ``(common, diff)``: ``common`` maps each parameter with the same
    value in both queries to that value, ``diff`` maps each other parameter
    to the pair ``(value in sq1, value in sq2)``.
    """
    first = _search_query_to_dict(sq1)
    second = _search_query_to_dict(sq2)
    common = {}
    diff = {}
    for key in first:
        left, right = first[key], second[key]
        if left == right:
            common[key] = left
        else:
            diff[key] = (left, right)
    return (common, diff)
class TestResults:
    """Errors, detailed logs and detected languages collected while checking one engine."""

    __slots__ = 'errors', 'logs', 'languages'

    def __init__(self):
        # test name -> distinct error messages
        self.errors: typing.Dict[str, typing.List[str]] = {}
        # test name -> distinct (message, *args) tuples
        self.logs: typing.Dict[str, typing.List[typing.Any]] = {}
        self.languages: typing.Set[str] = set()

    def add_error(self, test, message, *args):
        """Record ``message`` for ``test``, and ``(message, *args)`` in the logs, skipping duplicates."""
        known_messages = self.errors.setdefault(test, [])
        if message not in known_messages:
            known_messages.append(message)

        entry = (message,) + args
        known_entries = self.logs.setdefault(test, [])
        if entry not in known_entries:
            known_entries.append(entry)

    def add_language(self, language):
        """Remember that ``language`` has been detected."""
        self.languages.add(language)

    @property
    def succesfull(self):
        """True while no error has been recorded."""
        return not self.errors

    def __iter__(self):
        """Iterate over ``(test_name, error)``, the errors of each test being sorted."""
        for test_name, errors in self.errors.items():
            yield from ((test_name, error) for error in sorted(errors))
class ResultContainerTests:
    """Checks applied to the ResultContainer obtained for one SearchQuery.

    Each failure is recorded in the shared ``test_results`` under
    ``test_name``, followed by the parameters of the search query.
    """

    __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'

    def __init__(self,
                 test_results: TestResults,
                 test_name: str,
                 search_query: SearchQuery,
                 result_container: ResultContainer):
        self.test_name = test_name
        self.search_query = search_query
        self.result_container = result_container
        # languages detected in the results of this container only
        self.languages: typing.Set[str] = set()
        self.test_results = test_results
        # set by check_basic() when the engine did not answer
        self.stop_test = False

    @property
    def result_urls(self):
        """URLs of the ordered results."""
        results = self.result_container.get_ordered_results()
        return [result['url'] for result in results]

    def _record_error(self, message: str, *args) -> None:
        """Record an error for this test, with the search query parameters appended to the log entry."""
        sq = _search_query_to_dict(self.search_query)
        sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()])
        self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')

    def _add_language(self, text: str) -> None:
        """Detect the language of ``text``; keep it only when the detection is reliable (probability >= 0.98)."""
        r = cld3.get_language(str(text))  # pylint: disable=E1101
        if r is not None and r.probability >= 0.98 and r.is_reliable:
            self.languages.add(r.language)
            self.test_results.add_language(r.language)
        return None

    def _check_result(self, result):
        """Check one result: no HTML in title and content, valid image URLs for image and video templates."""
        if not _check_no_html(result.get('title', '')):
            self._record_error('HTML in title', repr(result.get('title', '')))
        if not _check_no_html(result.get('content', '')):
            self._record_error('HTML in content', repr(result.get('content', '')))

        self._add_language(result.get('title', ''))
        self._add_language(result.get('content', ''))

        template = result.get('template', 'default.html')
        # no template specific check for these templates
        if template in ('default.html', 'code.html', 'torrent.html', 'map.html'):
            return
        if template == 'images.html':
            # the thumbnail is checked when there is one, otherwise the image itself
            thumbnail_src = result.get('thumbnail_src')
            if thumbnail_src is not None:
                if not _is_url_image(thumbnail_src):
                    self._record_error('thumbnail_src URL is invalid', thumbnail_src)
            elif not _is_url_image(result.get('img_src')):
                self._record_error('img_src URL is invalid', result.get('img_src'))
        if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
            # report the URL that has actually been checked
            self._record_error('thumbnail URL is invalid', result.get('thumbnail'))

    def _check_results(self, results: list):
        """Check every result of ``results``."""
        for result in results:
            self._check_result(result)

    def _check_answers(self, answers):
        """Check that no answer contains HTML."""
        for answer in answers:
            if not _check_no_html(answer):
                self._record_error('HTML in answer', answer)

    def _check_infoboxes(self, infoboxes):
        """Check that the content and the attribute values of each infobox contain no HTML."""
        for infobox in infoboxes:
            if not _check_no_html(infobox.get('content', '')):
                self._record_error('HTML in infobox content', infobox.get('content', ''))
            self._add_language(infobox.get('content', ''))
            for attribute in infobox.get('attributes', {}):
                if not _check_no_html(attribute.get('value', '')):
                    self._record_error('HTML in infobox attribute value', attribute.get('value', ''))

    def check_basic(self):
        """Run the checks common to all tests.

        When the container lists unresponsive engines, each one is recorded
        as an error, ``stop_test`` is set and nothing else is checked.
        """
        if len(self.result_container.unresponsive_engines) > 0:
            for message in self.result_container.unresponsive_engines:
                self._record_error(message[1] + ' ' + (message[2] or ''))
            self.stop_test = True
            return

        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            self._check_results(results)

        if len(self.result_container.answers) > 0:
            self._check_answers(self.result_container.answers)

        if len(self.result_container.infoboxes) > 0:
            self._check_infoboxes(self.result_container.infoboxes)

    def has_infobox(self):
        """Check the ResultContainer has at least one infobox"""
        if len(self.result_container.infoboxes) == 0:
            self._record_error('No infobox')

    def has_answer(self):
        """Check the ResultContainer has at least one answer"""
        if len(self.result_container.answers) == 0:
            self._record_error('No answer')

    def has_language(self, lang):
        """Check at least one title or content of the results is written in the `lang`.

        Detected using pycld3, may be not accurate"""
        if lang not in self.languages:
            self._record_error(lang + ' not found')

    def not_empty(self):
        """Check the ResultContainer has at least one answer or infobox or result"""
        result_types = set()
        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            result_types.add('results')

        if len(self.result_container.answers) > 0:
            result_types.add('answers')

        if len(self.result_container.infoboxes) > 0:
            result_types.add('infoboxes')

        if len(result_types) == 0:
            self._record_error('No result')

    def one_title_contains(self, title: str):
        """Check one of the title contains `title` (case insensitive comparison)"""
        title = title.lower()
        for result in self.result_container.get_ordered_results():
            if title in result['title'].lower():
                return
        self._record_error(('{!r} not found in the title'.format(title)))
class CheckerTests:
    """Checks that compare the result containers of the different queries of one test."""

    __slots__ = 'test_results', 'test_name', 'result_container_tests_list'

    def __init__(self,
                 test_results: TestResults,
                 test_name: str,
                 result_container_tests_list: typing.List[ResultContainerTests]):
        self.test_results = test_results
        self.test_name = test_name
        self.result_container_tests_list = result_container_tests_list

    def unique_results(self):
        """Check that two different queries never return the same list of URLs.

        Nothing is compared when the first query has no result.
        """
        urls_list = [rct.result_urls for rct in self.result_container_tests_list]
        if len(urls_list[0]) == 0:
            return
        # every pair (i, j) with i < j, in the same order as two nested loops
        for (i, urls_i), (j, urls_j) in itertools.combinations(enumerate(urls_list), 2):
            if urls_i != urls_j:
                continue
            common, diff = _search_query_diff(self.result_container_tests_list[i].search_query,
                                              self.result_container_tests_list[j].search_query)
            common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
            diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
            diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
            self.test_results.add_error(self.test_name,
                                        'results are identitical for {} and {} ({})'
                                        .format(diff1_str, diff2_str, common_str))
class Checker:
    """Run the tests declared by one engine processor; the outcome is gathered in ``test_results``."""

    __slots__ = 'processor', 'tests', 'test_results'

    def __init__(self, processor: EngineProcessor):
        self.processor = processor
        self.tests = self.processor.get_tests()
        self.test_results = TestResults()

    @property
    def engineref_list(self):
        """A one-item list: the engine of the processor with its first category."""
        name = self.processor.engine_name
        category = self.processor.engine.categories[0]
        return [EngineRef(name, category)]

    @staticmethod
    def search_query_matrix_iterator(engineref_list, matrix):
        """Yield one SearchQuery per combination of the values of ``matrix``.

        ``matrix`` maps a SearchQuery parameter name to a value, or to a tuple
        or list of values to combine; it must contain ``query``.
        """
        axes = []
        for name, values in matrix.items():
            if not isinstance(values, (tuple, list)):
                values = (values,)
            axes.append([(name, value) for value in values])

        for combination in itertools.product(*axes):
            params = dict(combination)
            query = params.pop('query')
            yield SearchQuery(query, engineref_list, **params)

    def call_test(self, obj, test_description):
        """Call one test on ``obj``.

        ``test_description`` is a method name, a function, or a tuple / list
        whose first item is one of these and whose other items are the
        arguments. An error is recorded when it cannot be resolved.
        """
        if isinstance(test_description, (tuple, list)):
            method = test_description[0]
            args = test_description[1:]
        else:
            method, args = test_description, ()

        if isinstance(method, str) and hasattr(obj, method):
            getattr(obj, method)(*args)
            return
        if isinstance(method, types.FunctionType):
            method(*args)
            return
        self.test_results.add_error(obj.test_name,
                                    'method {!r} ({}) not found for {}'
                                    .format(method, method.__class__.__name__, obj.__class__.__name__))

    def call_tests(self, obj, test_descriptions):
        """Call every test of ``test_descriptions`` on ``obj``."""
        for description in test_descriptions:
            self.call_test(obj, description)

    def search(self, search_query: SearchQuery) -> ResultContainer:
        """Send ``search_query`` to the processor and return the filled ResultContainer.

        The container stays empty when the processor gives no request parameters.
        """
        container = ResultContainer()
        category = search_query.engineref_list[0].category
        params = self.processor.get_params(search_query, category)
        if params is not None:
            self.processor.search(search_query.query, params, container, time(), 5)
        return container

    def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
        """Search, then return the ResultContainerTests on which the basic checks have been run."""
        container = self.search(search_query)
        container_tests = ResultContainerTests(self.test_results, test_name, search_query, container)
        container_tests.check_basic()
        return container_tests

    def run_test(self, test_name):
        """Run the test ``test_name``.

        The ``result_container`` tests are applied to each container which has
        not been stopped; the ``test`` tests, which compare the containers,
        are skipped as soon as one container has been stopped.
        """
        test_parameters = self.tests[test_name]
        search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
        rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]

        stop_test = False
        if 'result_container' in test_parameters:
            for rct in rct_list:
                if rct.stop_test:
                    stop_test = True
                else:
                    self.call_tests(rct, test_parameters['result_container'])

        if not stop_test and 'test' in test_parameters:
            checker_tests = CheckerTests(self.test_results, test_name, rct_list)
            self.call_tests(checker_tests, test_parameters['test'])

    def run(self):
        """Run every test of the processor."""
        for test_name in self.tests:
            self.run_test(test_name)

69
searx/search/models.py Normal file
View file

@ -0,0 +1,69 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
class EngineRef:
    """Reference to an engine: its name and the category it is used in."""

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        # let Python fall back to its default comparison for foreign types
        # instead of failing on a missing attribute
        if not isinstance(other, EngineRef):
            return NotImplemented
        return self.name == other.name and self.category == other.category

    def __hash__(self):
        return hash((self.name, self.category))
class SearchQuery:
    """container for all the search parameters (query, language, etc...)"""

    __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
        'timeout_limit', 'external_bang'

    def __init__(self,
                 query: str,
                 engineref_list: typing.List[EngineRef],
                 lang: str = 'all',
                 safesearch: int = 0,
                 pageno: int = 1,
                 time_range: typing.Optional[str] = None,
                 timeout_limit: typing.Optional[float] = None,
                 external_bang: typing.Optional[str] = None):
        self.query = query
        self.engineref_list = engineref_list
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range
        self.timeout_limit = timeout_limit
        self.external_bang = external_bang

    @property
    def categories(self):
        """The distinct categories of the engine references, in no particular order."""
        return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

    def __repr__(self):
        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
               format(self.query, self.engineref_list, self.lang, self.safesearch,
                      self.pageno, self.time_range, self.timeout_limit, self.external_bang)

    def __eq__(self, other):
        # let Python fall back to its default comparison for foreign types
        # instead of failing on a missing attribute
        if not isinstance(other, SearchQuery):
            return NotImplemented
        return self.query == other.query\
            and self.engineref_list == other.engineref_list\
            and self.lang == other.lang\
            and self.safesearch == other.safesearch\
            and self.pageno == other.pageno\
            and self.time_range == other.time_range\
            and self.timeout_limit == other.timeout_limit\
            and self.external_bang == other.external_bang

    def __hash__(self):
        # the list of engine references is converted to a tuple to be hashable
        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
                     self.timeout_limit, self.external_bang))

View file

@ -37,3 +37,15 @@ class EngineProcessor:
@abstractmethod
def search(self, query, params, result_container, start_time, timeout_limit):
pass
def get_tests(self):
    """Return the tests of the engine, by name.

    When the engine has a ``tests`` attribute, it is returned as is.
    Otherwise the result is the ``additional_tests`` of the engine (if any)
    completed, and overridden on name clash, by the default tests of the
    processor.
    """
    tests = getattr(self.engine, 'tests', None)
    if tests is not None:
        return tests
    # work on a copy: the dict of the engine must not grow with the defaults
    tests = dict(getattr(self.engine, 'additional_tests', {}))
    tests.update(self.get_default_tests())
    return tests
def get_default_tests(self):
    """Return the default tests of this kind of processor: none here."""
    return dict()

View file

@ -179,15 +179,15 @@ class OnlineProcessor(EngineProcessor):
requests_exception = True
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA')
logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
logger.exception('engine {0} : Too many requests')
logger.exception('engine {0} : Too many requests'.format(self.engine_name))
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
result_container.add_unresponsive_engine(self.engine_name, 'blocked')
logger.exception('engine {0} : Searx is blocked')
logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
suspended_time = e.suspended_time # pylint: disable=no-member
else:
result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
@ -211,3 +211,49 @@ class OnlineProcessor(EngineProcessor):
# reset the suspend variables
self.engine.continuous_errors = 0
self.engine.suspend_end_time = 0
def get_default_tests(self):
    """Return the default tests of an online engine, by name.

    ``simple`` is always present; ``paging``, ``time_range``, ``lang_fr`` /
    ``lang_en`` and ``safesearch`` are added when the engine has the
    attribute of the same purpose set to a true value.
    """
    tests = {}

    tests['simple'] = {
        'matrix': {'query': ('life', 'computer')},
        'result_container': ['not_empty'],
    }

    if getattr(self.engine, 'paging', False):
        tests['paging'] = {
            'matrix': {'query': 'time',
                       'pageno': (1, 2, 3)},
            'result_container': ['not_empty'],
            'test': ['unique_results']
        }
        if 'general' in self.engine.categories:
            # avoid documentation about HTML tags (<time> and <input type="time">)
            tests['paging']['matrix']['query'] = 'news'

    if getattr(self.engine, 'time_range', False):
        tests['time_range'] = {
            'matrix': {'query': 'news',
                       'time_range': (None, 'day')},
            'result_container': ['not_empty'],
            'test': ['unique_results']
        }

    if getattr(self.engine, 'lang', False):
        # the name of a check is the name of the method to call: has_language
        tests['lang_fr'] = {
            'matrix': {'query': 'paris', 'lang': 'fr'},
            'result_container': ['not_empty', ('has_language', 'fr')],
        }
        tests['lang_en'] = {
            'matrix': {'query': 'paris', 'lang': 'en'},
            'result_container': ['not_empty', ('has_language', 'en')],
        }

    if getattr(self.engine, 'safesearch', False):
        tests['safesearch'] = {
            'matrix': {'query': 'porn',
                       'safesearch': (0, 2)},
            'test': ['unique_results']
        }

    return tests

View file

@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor):
params['from_name'] = iso4217_to_name(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en')
return params
def get_default_tests(self):
    """Return the default test of a currency engine: a conversion query must give an answer."""
    return {
        'currency': {
            'matrix': {'query': '1337 usd in rmb'},
            'result_container': ['has_answer'],
        },
    }

View file

@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor):
params['query'] = query
return params
def get_default_tests(self):
    """Return the default test of a dictionary engine.

    ``translation_paging`` (three pages, which must differ) when the engine
    has a true ``paging`` attribute, ``translation`` otherwise.
    """
    if getattr(self.engine, 'paging', False):
        return {
            'translation_paging': {
                'matrix': {'query': 'en-es house',
                           'pageno': (1, 2, 3)},
                'result_container': ['not_empty', ('one_title_contains', 'house')],
                'test': ['unique_results'],
            },
        }
    return {
        'translation': {
            'matrix': {'query': 'en-es house'},
            'result_container': ['not_empty', ('one_title_contains', 'house')],
        },
    }

View file

@ -102,6 +102,34 @@ outgoing: # communication with search engines
# - "HTTPS rewrite"
# - ...
checker:
# disable checker when in debug mode
off_when_debug: True
# scheduling: interval or int
# use "scheduling: False" to disable scheduling
scheduling:
start_after: [300, 1800] # delay to start the first run of the checker
every: [86400, 90000] # how often the checker runs
# additional tests: only for the YAML anchors (see the engines section)
additional_tests:
rosebud: &test_rosebud
matrix:
query: rosebud
lang: en
result_container:
- not_empty
- ['one_title_contains', 'citizen kane']
test:
- unique_results
# tests: only for the YAML anchors (see the engines section)
tests:
infobox: &tests_infobox
infobox:
matrix:
query: ["linux", "new york", "bbc"]
result_container:
- has_infobox
engines:
- name: apk mirror
engine: apkmirror
@ -218,6 +246,7 @@ engines:
shortcut : ddd
weight : 2
disabled : True
tests: *tests_infobox
# cloudflare protected
# - name : digbt
@ -262,6 +291,7 @@ engines:
shortcut : wd
timeout : 3.0
weight : 2
tests: *tests_infobox
- name : duckduckgo
engine : duckduckgo
@ -278,6 +308,8 @@ engines:
engine : etools
shortcut : eto
disabled : True
additional_tests:
rosebud: *test_rosebud
- name : etymonline
engine : xpath
@ -343,6 +375,8 @@ engines:
shortcut : gb
timeout : 3.0
disabled: True
additional_tests:
rosebud: *test_rosebud
- name : gentoo
engine : gentoo
@ -646,6 +680,8 @@ engines:
shortcut : qw
categories : general
disabled : True
additional_tests:
rosebud: *test_rosebud
- name : qwant images
engine : qwant
@ -745,6 +781,8 @@ engines:
shortcut : sp
timeout : 6.0
disabled : True
additional_tests:
rosebud: *test_rosebud
- name : tokyotoshokan
engine : tokyotoshokan
@ -856,6 +894,8 @@ engines:
number_of_results : 5
search_type : text
disabled : True
additional_tests:
rosebud: *test_rosebud
- name : wikisource
engine : mediawiki

31
searx/shared/__init__.py Normal file
View file

@ -0,0 +1,31 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import logging
logger = logging.getLogger('searx.shared')
# Select the implementation of the shared storage and of the scheduler:
# * shared_simple when the uwsgi module cannot be imported
# * shared_uwsgi when the uwsgi cache works
# * shared_simple storage without any scheduling when the uwsgi cache does not work
try:
    import uwsgi
except ImportError:
    # no uwsgi
    from .shared_simple import SimpleSharedDict as SharedDict, schedule
    logger.info('Use shared_simple implementation')
else:
    try:
        # write then read back a dummy value to make sure the cache is usable
        uwsgi.cache_update('dummy', b'dummy')
        if uwsgi.cache_get('dummy') != b'dummy':
            raise Exception()
    except Exception:
        # uwsgi.ini configuration problem: disable all scheduling
        logger.error('uwsgi.ini configuration error, add this line to your uwsgi.ini\n'
                     'cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1')
        from .shared_simple import SimpleSharedDict as SharedDict

        def schedule(delay, func, *args):
            return False
    else:
        # uwsgi
        from .shared_uwsgi import UwsgiCacheSharedDict as SharedDict, schedule
        logger.info('Use shared_uwsgi implementation')

storage = SharedDict()

View file

@ -0,0 +1,21 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from abc import ABC, abstractmethod
class SharedDict(ABC):
    """Interface of a key / value storage holding integers and strings."""

    @abstractmethod
    def get_int(self, key):
        """Return the integer stored under ``key``."""

    @abstractmethod
    def set_int(self, key, value):
        """Store the integer ``value`` under ``key``."""

    @abstractmethod
    def get_str(self, key):
        """Return the string stored under ``key``."""

    @abstractmethod
    def set_str(self, key, value):
        """Store the string ``value`` under ``key``."""

View file

@ -0,0 +1,39 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import threading
from . import shared_abstract
class SimpleSharedDict(shared_abstract.SharedDict):
    """SharedDict kept in a plain dict of the current process."""

    __slots__ = 'd',

    def __init__(self):
        self.d = {}

    def get_int(self, key):
        """Return the value stored under ``key``, None when missing."""
        return self.d.get(key)

    def set_int(self, key, value):
        """Store ``value`` under ``key``."""
        self.d[key] = value

    def get_str(self, key):
        """Return the value stored under ``key``, None when missing."""
        return self.d.get(key)

    def set_str(self, key, value):
        """Store ``value`` under ``key``."""
        self.d[key] = value
def schedule(delay, func, *args):
    """Call ``func(*args)`` every ``delay`` seconds from daemon timer threads.

    The first call happens after ``delay`` seconds; the next timer is armed
    before each call of ``func``. Always returns True.
    """
    def fire():
        arm()
        func(*args)

    def arm():
        timer = threading.Timer(delay, fire)
        timer.daemon = True
        timer.start()

    arm()
    return True

View file

@ -0,0 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import time
import uwsgi # pylint: disable=E0401
from . import shared_abstract
# Next uwsgi signal number to hand out: schedule() takes the current value
# for its timer signal and then increments this counter.
_last_signal = 10
class UwsgiCacheSharedDict(shared_abstract.SharedDict):
    """SharedDict implementation that keeps its values in the uwsgi cache.

    Integers are written as 4 big-endian bytes, strings as UTF-8 bytes.
    Both getters return None when the cache has no entry for the key.
    """

    def get_int(self, key):
        """Return the integer stored under ``key``, or None when absent."""
        raw = uwsgi.cache_get(key)
        return None if raw is None else int.from_bytes(raw, 'big')

    def set_int(self, key, value):
        """Store the integer ``value`` under ``key`` as 4 big-endian bytes."""
        uwsgi.cache_update(key, value.to_bytes(4, 'big'))

    def get_str(self, key):
        """Return the string stored under ``key``, or None when absent."""
        raw = uwsgi.cache_get(key)
        return None if raw is None else raw.decode('utf-8')

    def set_str(self, key, value):
        """Store the string ``value`` under ``key``, encoded as UTF-8."""
        uwsgi.cache_update(key, value.encode('utf-8'))
def schedule(delay, func, *args):
    """Call ``func(*args)`` from a uwsgi timer signal raised every ``delay`` seconds.

    A fresh signal number is taken from the module-level ``_last_signal``
    counter, a handler is registered for it and a uwsgi timer is attached.
    The handler records the time of its last run in the uwsgi cache (under
    the uwsgi lock) and returns without calling ``func`` when the previous
    recorded run is less than ``delay`` seconds old. Always returns True.

    Can be implemented using a spooler.
    https://uwsgi-docs.readthedocs.io/en/latest/PythonDecorators.html

    To make the uwsgi configuration simple, use the alternative implementation.
    """
    global _last_signal

    def sighandler(signum):
        current = int(time.time())
        cache_key = 'scheduler_call_time_signal_' + str(signum)
        uwsgi.lock()
        try:
            previous = uwsgi.cache_get(cache_key)
            # Skip this run when the last recorded one is too recent.
            if previous is not None and current - int.from_bytes(previous, 'big') < delay:
                return
            uwsgi.cache_update(cache_key, current.to_bytes(4, 'big'))
        finally:
            # Released on every path, including the early return above.
            uwsgi.unlock()
        func(*args)

    # Reserve the current signal number and advance the counter.
    signal_num, _last_signal = _last_signal, _last_signal + 1
    uwsgi.register_signal(signal_num, 'worker', sighandler)
    uwsgi.add_timer(signal_num, delay)
    return True

View file

@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
from searx.version import VERSION_STRING
from searx.languages import language_codes as languages
from searx.search import SearchWithPlugins, initialize
from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.search.checker import get_result as checker_get_result
from searx.query import RawTextQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
@ -81,7 +82,6 @@ from searx.answerers import answerers
from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines
# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
# initialize the engines except on the first run of the werkzeug server.
if not werkzeug_reloader\
or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
initialize()
search_initialize(enable_checker=True)
babel = Babel(app)
@ -977,6 +977,12 @@ def stats_errors():
return jsonify(result)
@app.route('/stats/checker', methods=['GET'])
def stats_checker():
    """Return the checker result serialized as a JSON response."""
    return jsonify(checker_get_result())
@app.route('/robots.txt', methods=['GET'])
def robots():
return Response("""User-agent: *

View file

@ -49,7 +49,8 @@ setup(
},
entry_points={
'console_scripts': [
'searx-run = searx.webapp:run'
'searx-run = searx.webapp:run',
'searx-checker = searx.search.checker.__main__:main'
]
},
package_data={

View file

@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\
python3-dev python3-babel python3-venv
uwsgi uwsgi-plugin-python3
git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev
libprotobuf-dev protobuf-compiler
shellcheck"
BUILD_PACKAGES_debian="\
@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python
git base-devel libxml2
protobuf
shellcheck"
BUILD_PACKAGES_arch="\
@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\
python python-pip python-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
ShellCheck"
ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_fedora="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools
@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\
python36 python36-pip python36-lxml python-babel
uwsgi uwsgi-plugin-python3
git @development-tools libxml2
ShellCheck"
ShellCheck protobuf-compiler protobuf-devel"
BUILD_PACKAGES_centos="\
firefox graphviz graphviz-gd ImageMagick librsvg2-tools

View file

@ -82,4 +82,7 @@ http = ${SEARX_INTERNAL_HTTP}
# mkdir -p /run/uwsgi/app/searx
# chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx
#
# socket = /run/uwsgi/app/searx/socket
# socket = /run/uwsgi/app/searx/socket
# Cache
cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1

View file

@ -82,3 +82,6 @@ http = ${SEARX_INTERNAL_HTTP}
# chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx
#
# socket = /run/uwsgi/app/searx/socket
# Cache
cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1