def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
    """Flag the request as a bot when its ``Accept`` header lacks ``text/html``.

    :param request: incoming request; only ``request.accept_mimetypes`` is read.
    :param cfg: limiter configuration (not used by this check).
    :return: ``None`` when ``text/html`` is accepted, otherwise a tuple of HTTP
        status code ``429`` and a message.
    """
    if 'text/html' in request.accept_mimetypes:
        return None
    return 429, "bot detected, HTTP header Accept did not contain text/html"
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
    """Flag the request as a bot when the ``Accept-Language`` header is missing or blank.

    :param request: incoming request; only ``request.headers`` is read.
    :param cfg: limiter configuration (not used by this check).
    :return: ``None`` when the header has a non-blank value, otherwise a tuple
        of HTTP status code ``429`` and a message.
    """
    accept_language = request.headers.get('Accept-Language', '').strip()
    if accept_language != '':
        return None
    return 429, "bot detected, missing HTTP header Accept-Language"
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
    """Flag the request as a bot when the ``Connection`` header is set to ``close``.

    :param request: incoming request; only ``request.headers`` is read.
    :param cfg: limiter configuration (not used by this check).
    :return: a tuple of HTTP status code ``429`` and a message when the
        stripped header value equals ``close``, otherwise ``None``.
    """
    connection = request.headers.get('Connection', '').strip()
    return (429, "bot detected, HTTP header 'Connection=close'") if connection == 'close' else None
_botdetection.ip_limit: + Method ``ip_limit`` ------------------- @@ -22,6 +23,8 @@ The :py:obj:`link_token` method is used to investigate whether a request is from typing import Optional, Tuple import flask +from searx.tools import config + from searx import redisdb from searx import logger @@ -56,7 +59,7 @@ API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: redis_client = redisdb.client() x_forwarded_for = request.headers.get('X-Forwarded-For', '') @@ -68,7 +71,9 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: if c > API_MAX: return 429, "BLOCK %s: API limit exceeded" - suspicious = link_token.is_suspicious(request) + suspicious = False + if cfg['botdetection.ip_limit.link_token']: + suspicious = link_token.is_suspicious(request) if suspicious: c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 71044c312..cc1e00b3c 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -38,8 +38,11 @@ and set the redis-url connection. Check the value, it depends on your redis DB """ from typing import Optional, Tuple +from pathlib import Path import flask +import pytomlpp as toml +from searx.tools import config from searx.botdetection import ( http_accept, http_accept_encoding, @@ -49,6 +52,42 @@ from searx.botdetection import ( ip_limit, ) +LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" +"""Base configuration (schema) of the botdetection.""" + +LIMITER_CFG = Path('/etc/searxng/limiter.toml') +"""Lokal Limiter configuration.""" + +CFG_DEPRECATED = { + # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
def init_cfg(log):
    """Build the module-level ``CFG`` from the schema file and the local config.

    ``CFG`` is first replaced by a configuration created from
    ``LIMITER_CFG_SCHEMA``.  When ``LIMITER_CFG`` exists it is loaded, validated
    against the schema and merged into ``CFG``; otherwise only a warning is
    logged and the schema defaults stay in effect.

    :param log: logger that receives the warning and error messages.
    :raises toml.DecodeError: re-raised (after logging) when ``LIMITER_CFG``
        can't be parsed.
    :raises TypeError: when the content of ``LIMITER_CFG`` does not validate
        against the schema.
    """
    global CFG  # pylint: disable=global-statement

    schema = toml.load(LIMITER_CFG_SCHEMA)
    # Assigned before the local file is read: if loading fails below, CFG
    # already holds the schema defaults.
    CFG = config.Config(cfg_schema=schema, deprecated=CFG_DEPRECATED)

    if LIMITER_CFG.exists():
        log.warning("load config file: %s", LIMITER_CFG)
        try:
            overrides = toml.load(LIMITER_CFG)
        except toml.DecodeError as exc:
            # flatten the multi-line parser message into a single log line
            flat_msg = str(exc).replace('\t', '').replace('\n', ' ')
            log.error("%s: %s", LIMITER_CFG, flat_msg)
            raise

        is_valid, issues = CFG.validate(overrides)
        for issue in issues:
            log.error(str(issue))
        if not is_valid:
            raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!")
        CFG.update(overrides)
    else:
        log.warning("missing config file: %s", LIMITER_CFG)
log = logging.getLogger(__name__)


class FALSE:
    """Falsy sentinel type: instances always evaluate to ``False``.

    The message passed to the constructor is what :py:func:`str` and
    :py:func:`repr` return for the instance.
    """

    # pylint: disable=multiple-statements
    def __init__(self, msg):
        self.msg = msg

    def __bool__(self):
        return False

    def __str__(self):
        return self.msg

    __repr__ = __str__


# Sentinel for "no value".  It is used as default of optional arguments and as
# result of lookups that found nothing; always compare it with ``is``.
UNSET = FALSE('')


class SchemaIssue(ValueError):
    """Exception to store and/or raise a message from a schema issue.

    :param level: ``'warn'`` for issues that do not invalidate the data
        (deprecated names), ``'invalid'`` for issues that do.
    :param msg: description of the issue.
    """

    def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
        self.level = level
        super().__init__(msg)

    def __str__(self):
        return f"[cfg schema {self.level}] {self.args[0]}"


class Config:
    """Configuration based on structured dictionaries.

    The schema (which also provides the default values) and the configuration
    data are nested dictionaries; values are addressed by dotted names like
    ``botdetection.ip_limit.link_token``.
    """

    UNSET = UNSET

    @classmethod
    def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
        """Create a configuration from a TOML schema file and a TOML config file.

        :param schema_file: TOML file with the schema / default values.
        :param cfg_file: TOML file with the local settings.  If it does not
            exist a warning is logged and the schema defaults are returned.
        :param deprecated: maps deprecated configuration names to messages.
        :raises TypeError: if ``cfg_file`` does not validate against the schema.

        A decode error of ``cfg_file`` is logged and re-raised.
        """

        # init schema

        log.debug("load schema file: %s", schema_file)
        cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
        if not cfg_file.exists():
            log.warning("missing config file: %s", cfg_file)
            return cfg

        # load configuration

        log.debug("load config file: %s", cfg_file)
        try:
            upd_cfg = toml.load(cfg_file)
        except toml.DecodeError as exc:
            # flatten the multi-line parser message into a single log line
            msg = str(exc).replace('\t', '').replace('\n', ' ')
            log.error("%s: %s", cfg_file, msg)
            raise

        is_valid, issue_list = cfg.validate(upd_cfg)
        for msg in issue_list:
            log.error(str(msg))
        if not is_valid:
            raise TypeError(f"schema of {cfg_file} is invalid!")
        cfg.update(upd_cfg)
        return cfg

    def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
        """Constructor of class Config.

        :param cfg_schema: Schema of the configuration
        :param deprecated: dictionary that maps deprecated configuration names to messages

        These values are needed for validation, see :py:obj:`validate`.  The
        configuration data starts as a deep copy of the schema, so the schema
        values act as defaults.
        """
        self.cfg_schema = cfg_schema
        self.deprecated = deprecated
        self.cfg = copy.deepcopy(cfg_schema)

    def __getitem__(self, key: str):
        # Besides ``cfg['a.b']`` this makes the instance usable as the mapping
        # operand of ``'%(a.b)s' % cfg`` (see :py:obj:`Config.get`).
        return self.get(key)

    def validate(self, cfg: dict):
        """Validate dictionary ``cfg`` against the schema of this configuration
        (``self.cfg_schema``).  Validation is done by :py:func:`validate`, the
        return value is its ``(is_valid, issue_list)`` tuple."""

        return validate(self.cfg_schema, cfg, self.deprecated)

    def update(self, upd_cfg: dict):
        """Update this configuration by ``upd_cfg`` (see :py:func:`dict_deepupdate`)."""

        dict_deepupdate(self.cfg, upd_cfg)

    def default(self, name: str):
        """Returns default value of field ``name`` in ``self.cfg_schema``
        (:py:obj:`UNSET` if the schema has no such field)."""
        return value(name, self.cfg_schema)

    def get(self, name: str, default=UNSET, replace=True):
        """Returns the value to which ``name`` points in the configuration.

        If there is no such ``name`` in the config and the ``default`` is
        :py:obj:`UNSET`, a :py:obj:`KeyError` is raised.  A :py:obj:`KeyError`
        is also raised when the parent of ``name`` does not exist, even if a
        ``default`` is given.

        If ``replace`` is true and the result is a string, ``%(dotted.name)s``
        placeholders in it are substituted by other values of this
        configuration (printf-style formatting with ``self`` as mapping).
        """

        parent = self._get_parent_dict(name)
        val = parent.get(name.rpartition('.')[2], UNSET)
        if val is UNSET:
            if default is UNSET:
                raise KeyError(name)
            val = default

        if replace and isinstance(val, str):
            val = val % self
        return val

    def set(self, name: str, val):
        """Set the value to which ``name`` points in the configuration.

        The last part of ``name`` is created if it does not exist.  If the
        parent of ``name`` does not exist (or is not a dict), a
        :py:obj:`KeyError` is raised.
        """
        parent = self._get_parent_dict(name)
        parent[name.rpartition('.')[2]] = val

    def _get_parent_dict(self, name):
        """Return the dict that holds the last part of the dotted ``name``."""
        parent_name = name.rpartition('.')[0]
        # a name without a dot lives in the top-level dict
        parent = value(parent_name, self.cfg) if parent_name else self.cfg
        if (parent is UNSET) or (not isinstance(parent, dict)):
            raise KeyError(parent_name)
        return parent

    def path(self, name: str, default=UNSET):
        """Get a :py:class:`pathlib.Path` object from a config string.

        A :py:obj:`KeyError` is raised if ``name`` is missing and no
        ``default`` is given.  A given ``default`` is converted to a path the
        same way as a configured value.
        """
        # ``get`` either raises KeyError or returns a real value, it never
        # returns UNSET: no further check needed.
        return pathlib.Path(str(self.get(name, default)))

    def pyobj(self, name, default=UNSET):
        """Get python object referred by full qualified name (FQN) in the config
        string.

        A :py:obj:`KeyError` is raised if ``name`` is missing and no
        ``default`` is given.  A given ``default`` is resolved the same way as
        a configured value, i.e. it has to be a FQN string.
        """
        fqn = str(self.get(name, default))
        modulename, attr_name = fqn.rsplit('.', 1)
        # a non-empty fromlist makes __import__ return the leaf module
        module = __import__(modulename, {}, {}, [attr_name], 0)
        return getattr(module, attr_name)


# working with dictionaries


def value(name: str, data_dict: dict):
    """Returns the value to which the dotted ``name`` points in ``data_dict``.

    :py:obj:`UNSET` is returned if a part of ``name`` does not exist or if the
    name descends below a value that is not a dictionary.

    .. code:: python

        >>> data_dict = {
        ...     "foo": {"bar": 1},
        ...     "bar": {"foo": 2},
        ...     "foobar": [1, 2, 3],
        ... }
        >>> value('foobar', data_dict)
        [1, 2, 3]
        >>> value('foo.bar', data_dict)
        1
        >>> value('foo.bar.xxx', data_dict) is UNSET
        True

    """

    ret_val = data_dict
    for part in name.split('.'):
        if not isinstance(ret_val, dict):
            # there are name parts left but nothing to descend into
            return UNSET
        ret_val = ret_val.get(part, UNSET)
        if ret_val is UNSET:
            break
    return ret_val


def validate(
    schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
) -> typing.Tuple[bool, list]:
    """Deep validation of dictionary in ``data_dict`` against dictionary in
    ``schema_dict``.  Argument deprecated is a dictionary that maps deprecated
    configuration names to messages::

        deprecated = {
            "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
            "..."     : "..."
        }

    The function returns a python tuple ``(is_valid, issue_list)``:

    ``is_valid``:
      A bool value indicating ``data_dict`` is valid or not.

    ``issue_list``:
      A list of messages (:py:obj:`SchemaIssue`) from the validation::

          [cfg schema warn] data_dict 'fontlib.foo': deprecated - ...
          [cfg schema invalid] data_dict 'fontlib.foo': key unknown in schema_dict
          [cfg schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is: ...

    If ``schema_dict`` or ``data_dict`` is not a dictionary type a
    :py:obj:`SchemaIssue` is raised.

    """
    if not isinstance(schema_dict, dict):
        raise SchemaIssue('invalid', "schema_dict is not a dict type")
    if not isinstance(data_dict, dict):
        raise SchemaIssue('invalid', "data_dict is not a dict type")

    return _validate([], [], schema_dict, data_dict, deprecated)


def _validate(
    names: typing.List,
    issue_list: typing.List,
    schema_dict: typing.Dict,
    data_dict: typing.Dict,
    deprecated: typing.Dict[str, str],
) -> typing.Tuple[bool, typing.List]:
    """Recursive worker of :py:func:`validate`.

    ``names`` is the path of keys that leads to ``data_dict``; it is extended
    and restored in place while iterating.  ``schema_dict`` is always the
    complete schema, values are looked up by their full dotted name.  Issues
    are appended to ``issue_list`` (the same list object is returned).
    """

    is_valid = True

    for key, data_value in data_dict.items():

        names.append(key)
        name = '.'.join(names)

        deprecated_msg = deprecated.get(name)
        if deprecated_msg:
            issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))

        schema_value = value(name, schema_dict)
        if schema_value is UNSET:
            # deprecated names are allowed to be missing in the schema
            if not deprecated_msg:
                issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
                is_valid = False

        # exact type comparison on purpose: e.g. a bool is not accepted for an int
        elif type(schema_value) is not type(data_value):  # pylint: disable=unidiomatic-typecheck
            issue_list.append(
                SchemaIssue(
                    'invalid',
                    (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
                )
            )
            is_valid = False

        elif isinstance(data_value, dict):
            _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
            is_valid = is_valid and _valid
        names.pop()

    return is_valid, issue_list


def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
    """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.

    For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:

    0. If ``upd_val`` is a dict, list or set and ``base_dict[upd_key]`` exists
       but is not of the same kind, raise a :py:obj:`TypeError` that names the
       dotted path of the key.

    1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.

    2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a
       (deep-) copy of ``upd_val``.

    3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by a
       deep copy of the list in ``upd_val``.

    4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
       ``upd_val``.

    5. Any other ``upd_val`` replaces (or adds) ``base_dict[upd_key]`` by a
       shallow copy, regardless of the type of the old value.

    :param names: path of keys that leads to ``base_dict``; only used to build
        error messages (set by the recursion, callers normally omit it).
    """
    # pylint: disable=too-many-branches
    if not isinstance(base_dict, dict):
        raise TypeError("argument 'base_dict' is not a ditionary type")
    if not isinstance(upd_dict, dict):
        raise TypeError("argument 'upd_dict' is not a ditionary type")

    if names is None:
        names = []

    for upd_key, upd_val in upd_dict.items():
        # full path of the key that is processed, used for error messages
        key_path = [*names, upd_key]

        if isinstance(upd_val, dict):

            if upd_key not in base_dict:
                base_dict[upd_key] = copy.deepcopy(upd_val)
            elif isinstance(base_dict[upd_key], dict):
                dict_deepupdate(base_dict[upd_key], upd_val, key_path)
            else:
                dotted = '.'.join(map(str, key_path))
                raise TypeError(f"type mismatch {dotted}: is not a dict type in base_dict")

        elif isinstance(upd_val, list):

            if upd_key not in base_dict:
                base_dict[upd_key] = copy.deepcopy(upd_val)
            elif isinstance(base_dict[upd_key], list):
                # deep copy: nested items must not be shared with upd_dict
                base_dict[upd_key].extend(copy.deepcopy(upd_val))
            else:
                dotted = '.'.join(map(str, key_path))
                raise TypeError(f"type mismatch {dotted}: is not a list type in base_dict")

        elif isinstance(upd_val, set):

            if upd_key not in base_dict:
                base_dict[upd_key] = upd_val.copy()
            elif isinstance(base_dict[upd_key], set):
                base_dict[upd_key].update(upd_val)
            else:
                dotted = '.'.join(map(str, key_path))
                raise TypeError(f"type mismatch {dotted}: is not a set type in base_dict")

        else:
            # for any other type of upd_val replace or add base_dict[upd_key]
            # by a copy of upd_val
            base_dict[upd_key] = copy.copy(upd_val)