mirror of
https://github.com/searxng/searxng.git
synced 2025-01-09 18:05:26 +00:00
Merge pull request #2357 / limiter -> botdetection
The monolithic implementation of the limiter was divided into methods and implemented in the Python package searx.botdetection. Detailed documentation on the methods has been added. The methods are divided into two groups: 1. Probe HTTP headers - Method http_accept - Method http_accept_encoding - Method http_accept_language - Method http_connection - Method http_user_agent 2. Rate limit: - Method ip_limit - Method link_token (new) The (reduced) implementation of the limiter is now in the module searx.botdetection.limiter. The first group was transferred unchanged to this module. The ip_limit contains the sliding windows implemented by the limiter so far. This merge also fixes some long outstanding issues: - limiter does not evaluate the Accept-Language correctly [1] - limiter needs an IPv6 prefix to block networks instead of IPs [2] Without additional configuration the limiter works as before (apart from the bugfixes). For the commissioning of additional methods (link_token), a configuration must be made in an additional configuration file. Without this configuration, the limiter runs as before (zero configuration). The ip_limit Method implements the sliding windows of the vanilla limiter, additionally the link_token method can be used in this method. The link_token method can be used to investigate whether a request is suspicious. To activate the link_token method in the ip_limit method add the following to your /etc/searxng/limiter.toml:: [botdetection.ip_limit] link_token = true [1] https://github.com/searxng/searxng/issues/2455 [2] https://github.com/searxng/searxng/issues/2477
This commit is contained in:
commit
80aaef6c95
22 changed files with 1273 additions and 138 deletions
|
@ -235,7 +235,7 @@ Global Settings
|
|||
|
||||
``limiter`` :
|
||||
Rate limit the number of request on the instance, block some bots. The
|
||||
:ref:`limiter plugin` requires a :ref:`settings redis` database.
|
||||
:ref:`limiter src` requires a :ref:`settings redis` database.
|
||||
|
||||
.. _image_proxy:
|
||||
|
||||
|
|
45
docs/src/searx.botdetection.rst
Normal file
45
docs/src/searx.botdetection.rst
Normal file
|
@ -0,0 +1,45 @@
|
|||
.. _botdetection:
|
||||
|
||||
=============
|
||||
Bot Detection
|
||||
=============
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.botdetection
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.limiter
|
||||
:members:
|
||||
|
||||
|
||||
Rate limit
|
||||
==========
|
||||
|
||||
.. automodule:: searx.botdetection.ip_limit
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.link_token
|
||||
:members:
|
||||
|
||||
|
||||
Probe HTTP headers
|
||||
==================
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept_encoding
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept_language
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_connection
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_user_agent
|
||||
:members:
|
|
@ -1,13 +0,0 @@
|
|||
.. _limiter plugin:
|
||||
|
||||
==============
|
||||
Limiter Plugin
|
||||
==============
|
||||
|
||||
.. sidebar:: info
|
||||
|
||||
The :ref:`limiter plugin` requires a :ref:`Redis <settings redis>` database.
|
||||
|
||||
.. automodule:: searx.plugins.limiter
|
||||
:members:
|
||||
|
|
@ -16,3 +16,4 @@ redis==4.5.5
|
|||
markdown-it-py==2.2.0
|
||||
typing_extensions==4.6.3
|
||||
fasttext-predict==0.9.2.1
|
||||
pytomlpp==1.0.13
|
||||
|
|
27
searx/botdetection/__init__.py
Normal file
27
searx/botdetection/__init__.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection src:
|
||||
|
||||
X-Forwarded-For
|
||||
===============
|
||||
|
||||
.. attention::
|
||||
|
||||
A correct setup of the HTTP request headers ``X-Forwarded-For`` and
|
||||
``X-Real-IP`` is essential to be able to assign a request to an IP correctly:
|
||||
|
||||
- `NGINX RequestHeader`_
|
||||
- `Apache RequestHeader`_
|
||||
|
||||
.. _NGINX RequestHeader:
|
||||
https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site
|
||||
.. _Apache RequestHeader:
|
||||
https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site
|
||||
|
||||
.. autofunction:: searx.botdetection.get_real_ip
|
||||
|
||||
"""
|
||||
|
||||
from ._helpers import dump_request
|
||||
from ._helpers import get_real_ip
|
||||
from ._helpers import too_many_requests
|
121
searx/botdetection/_helpers.py
Normal file
121
searx/botdetection/_helpers.py
Normal file
|
@ -0,0 +1,121 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
# pylint: disable=missing-module-docstring, invalid-name
|
||||
from __future__ import annotations
|
||||
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
IPv6Address,
|
||||
ip_address,
|
||||
ip_network,
|
||||
)
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from searx import logger
|
||||
|
||||
logger = logger.getChild('botdetection')
|
||||
|
||||
|
||||
def dump_request(request: flask.Request):
    """Return a one-line, ``||``-separated summary of ``request`` for log messages.

    The summary starts with the request path, followed by the forwarding
    headers, the submitted form data and a fixed set of HTTP headers.  A
    header that is not set shows up as ``None``.
    """
    headers = request.headers
    fields = [
        ('X-Forwarded-For', headers.get('X-Forwarded-For')),
        ('X-Real-IP', headers.get('X-Real-IP')),
        ('form', request.form),
    ]
    fields.extend(
        (name, headers.get(name))
        for name in (
            'Accept',
            'Accept-Language',
            'Accept-Encoding',
            'Content-Type',
            'Content-Length',
            'Connection',
            'User-Agent',
        )
    )
    return request.path + "".join(" || %s: %s" % field for field in fields)
|
||||
|
||||
|
||||
def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
    """Build the default ``Too Many Requests`` (HTTP 429) response.

    The block is reported on the 'botdetection' logger (at debug level)
    together with the compressed notation of ``network`` and ``log_msg``.  The
    filter methods use this function to reject a request.

    """
    blocked_network = network.compressed
    logger.debug("BLOCK %s: %s", blocked_network, log_msg)

    response = flask.make_response(('Too Many Requests', 429))
    return response
|
||||
|
||||
|
||||
def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
    """Return the (client) network the address ``real_ip`` is part of.

    The prefix length of the network is read from ``cfg``:
    ``real_ip.ipv6_prefix`` for an IPv6 address, ``real_ip.ipv4_prefix``
    otherwise.
    """
    is_ipv6 = isinstance(ip_address(real_ip), IPv6Address)
    prefix_key = 'real_ip.ipv6_prefix' if is_ipv6 else 'real_ip.ipv4_prefix'

    # strict=False: the host bits of real_ip are masked out instead of raising
    return ip_network(f"{real_ip}/{cfg[prefix_key]}", strict=False)
|
||||
|
||||
|
||||
def get_real_ip(request: flask.Request) -> str:
    """Return the IP the request is assigned to.

    Not every proxy sets all of the HTTP headers, and incoming headers can be
    faked, so the IP can not always be determined correctly.

    .. sidebar:: :py:obj:`flask.Request.remote_addr`

       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).

    The IP is taken from the first of these sources that provides a value:

    - X-Forwarded-For_ header (the entry selected by the ``real_ip.x_for``
      setting of the limiter configuration, counted from the end of the list)
    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
    - :py:obj:`flask.Request.remote_addr`

    If none of them provides a value, ``0.0.0.0`` is returned.  Missing
    headers are logged as errors, sources that disagree with each other are
    logged as warnings.

    .. _ProxyFix:
       https://werkzeug.palletsprojects.com/middleware/proxy_fix/

    .. _X-Forwarded-For:
      https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For

    """
    xff_header = request.headers.get("X-Forwarded-For")
    x_real_ip = request.headers.get('X-Real-IP')
    wsgi_addr = request.remote_addr

    forwarded_ip = xff_header
    if xff_header:
        from .limiter import get_cfg  # pylint: disable=import-outside-toplevel, cyclic-import

        hops = [hop.strip() for hop in xff_header.split(',')]
        x_for: int = get_cfg()['real_ip.x_for']
        # pick the x_for-th entry from the right, or the first one for a shorter list
        forwarded_ip = hops[-min(len(hops), x_for)]
    else:
        logger.error("X-Forwarded-For header is not set!")

    if not x_real_ip:
        logger.error("X-Real-IP header is not set!")

    # report sources that contradict each other
    if forwarded_ip and x_real_ip and x_real_ip != forwarded_ip:
        logger.warning(
            "IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", x_real_ip, forwarded_ip
        )

    if forwarded_ip and wsgi_addr and wsgi_addr != forwarded_ip:
        logger.warning(
            "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", wsgi_addr, forwarded_ip
        )

    if x_real_ip and wsgi_addr and wsgi_addr != x_real_ip:
        logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", wsgi_addr, x_real_ip)

    for candidate in (forwarded_ip, x_real_ip, wsgi_addr):
        if candidate:
            return candidate
    return '0.0.0.0'
|
39
searx/botdetection/http_accept.py
Normal file
39
searx/botdetection/http_accept.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept``
|
||||
----------------------
|
||||
|
||||
The ``http_accept`` method evaluates a request as the request of a bot if the
|
||||
Accept_ header ..
|
||||
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if its ``Accept`` header does not contain ``text/html``.

    Returns the ``Too Many Requests`` response for a rejected request and
    ``None`` otherwise.  ``cfg`` is not used by this method.
    """
    if 'text/html' in request.accept_mimetypes:
        return None
    return too_many_requests(network, "HTTP header Accept did not contain text/html")
|
41
searx/botdetection/http_accept_encoding.py
Normal file
41
searx/botdetection/http_accept_encoding.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_encoding``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_encoding`` method evaluates a request as the request of a
|
||||
bot if the Accept-Encoding_ header ..
|
||||
|
||||
- contains neither ``gzip`` nor ``deflate`` (both values are missing)
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept-Encoding:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if ``Accept-Encoding`` lists neither ``gzip`` nor ``deflate``.

    The header is split at the commas and the stripped items are compared
    literally.  Returns the ``Too Many Requests`` response for a rejected
    request and ``None`` otherwise.  ``cfg`` is not used by this method.
    """
    header = request.headers.get('Accept-Encoding', '')
    encodings = {item.strip() for item in header.split(',')}

    if encodings.isdisjoint(('gzip', 'deflate')):
        return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
    return None
|
35
searx/botdetection/http_accept_language.py
Normal file
35
searx/botdetection/http_accept_language.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_language``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_language`` method evaluates a request as the request of a bot
|
||||
if the Accept-Language_ header is unset.
|
||||
|
||||
.. _Accept-Language:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if the ``Accept-Language`` header is missing or blank.

    Returns the ``Too Many Requests`` response for a rejected request and
    ``None`` otherwise.  ``cfg`` is not used by this method.
    """
    accept_language = request.headers.get('Accept-Language', '').strip()
    if not accept_language:
        return too_many_requests(network, "missing HTTP header Accept-Language")
    return None
|
37
searx/botdetection/http_connection.py
Normal file
37
searx/botdetection/http_connection.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_connection``
|
||||
--------------------------
|
||||
|
||||
The ``http_connection`` method evaluates a request as the request of a bot if
|
||||
the Connection_ header is set to ``close``.
|
||||
|
||||
.. _Connection:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if its ``Connection`` header is set to ``close``.

    The stripped header value is compared literally (case-sensitive).  Returns
    the ``Too Many Requests`` response for a rejected request and ``None``
    otherwise.  ``cfg`` is not used by this method.
    """
    connection = request.headers.get('Connection', '').strip()
    if connection != 'close':
        return None
    return too_many_requests(network, "HTTP header 'Connection=close")
|
67
searx/botdetection/http_user_agent.py
Normal file
67
searx/botdetection/http_user_agent.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_user_agent``
|
||||
--------------------------
|
||||
|
||||
The ``http_user_agent`` method evaluates a request as the request of a bot if
|
||||
the User-Agent_ header is unset or matches the regular expression
|
||||
:py:obj:`USER_AGENT`.
|
||||
|
||||
.. _User-Agent:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
# NOTE: filter_request() applies this pattern with ``re.match``, i.e. the
# alternatives are tested at the start of the User-Agent value only.
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
    + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and client to block
    + '|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches the User-Agent_ of known *bots*.  The
value ``unknown`` is what :py:obj:`filter_request` substitutes for a missing
header."""

# compiled USER_AGENT pattern, created on first use by regexp_user_agent()
_regexp = None
|
||||
|
||||
|
||||
def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` pattern.

    The pattern is compiled on the first call and cached in the module for all
    further calls.
    """
    global _regexp  # pylint: disable=global-statement
    _regexp = _regexp or re.compile(USER_AGENT)
    return _regexp
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if its ``User-Agent`` is unset or matches :py:obj:`USER_AGENT`.

    A missing header is treated as the value ``unknown``.  Returns the ``Too
    Many Requests`` response for a rejected request and ``None`` otherwise.
    ``cfg`` is not used by this method.
    """
    user_agent = request.headers.get('User-Agent', 'unknown')
    is_bot = regexp_user_agent().match(user_agent)

    if not is_bot:
        return None
    return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
|
146
searx/botdetection/ip_limit.py
Normal file
146
searx/botdetection/ip_limit.py
Normal file
|
@ -0,0 +1,146 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection.ip_limit:
|
||||
|
||||
Method ``ip_limit``
|
||||
-------------------
|
||||
|
||||
The ``ip_limit`` method counts request from an IP in *sliding windows*. If
|
||||
there are too many requests in a sliding window, the request is evaluated as a
|
||||
bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_
|
||||
header. To protect privacy, only the hash value of an IP is stored in the redis DB
|
||||
and at least for a maximum of 10 minutes.
|
||||
|
||||
The :py:obj:`.link_token` method can be used to investigate whether a request is
|
||||
*suspicious*. To activate the :py:obj:`.link_token` method in the
|
||||
:py:obj:`.ip_limit` method add the following to your
|
||||
``/etc/searxng/limiter.toml``:
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.ip_limit]
|
||||
link_token = true
|
||||
|
||||
If the :py:obj:`.link_token` method is activated and a request is *suspicious*
|
||||
the request rates are reduced:
|
||||
|
||||
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
|
||||
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
|
||||
|
||||
To intercept bots that get their IPs from a range of IPs, there is a
|
||||
:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
|
||||
for a longer time. IPs stored in this sliding window have a maximum of
|
||||
:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
|
||||
makes a request that is not suspicious, the sliding window for this IP is
|
||||
dropped.
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
from searx.tools import config
|
||||
|
||||
from searx import redisdb
|
||||
from searx import logger
|
||||
from searx.redislib import incr_sliding_window, drop_counter
|
||||
|
||||
from . import link_token
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
logger = logger.getChild('botdetection.ip_limit')

BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""

BURST_MAX = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""

BURST_MAX_SUSPICIOUS = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""

LONG_WINDOW = 600
"""Time (sec) before the longer sliding window expires."""

LONG_MAX = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""

LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""

# NOTE(review): "WONDOW" is presumably a misspelling of "WINDOW".  The name is
# kept as it is: it is a documented module-level name and the same spelling is
# part of the redis key used by filter_request().
API_WONDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""

API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WONDOW`"""

SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
"""Time (sec) before sliding window for one suspicious IP expires (30 days)."""

SUSPICIOUS_IP_MAX = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Count the request in the sliding windows of ``network`` and reject it
    if one of the limits is exceeded.

    - Link-local networks are not monitored unless
      ``botdetection.ip_limit.filter_link_local`` is set.
    - Requests with a ``format`` other than ``html`` are additionally counted
      in the :py:obj:`API_WONDOW`.
    - With ``botdetection.ip_limit.link_token`` enabled, a request that is not
      *suspicious* passes without being counted in the burst / long windows; a
      suspicious request is counted in the :py:obj:`SUSPICIOUS_IP_WINDOW`
      (redirect to the index page when exceeded) and is then checked against
      the reduced ``*_SUSPICIOUS`` limits.

    Returns a response object if the request is rejected, ``None`` otherwise.
    """
    # pylint: disable=too-many-return-statements
    redis_client = redisdb.client()

    if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
        logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
        return None

    def hits(key_prefix: str, window: int) -> int:
        # count this request in the sliding window of the network
        return incr_sliding_window(redis_client, key_prefix + network.compressed, window)

    is_api_request = request.args.get('format', 'html') != 'html'
    if is_api_request and hits('ip_limit.API_WONDOW:', API_WONDOW) > API_MAX:
        return too_many_requests(network, "too many request in API_WINDOW")

    # limits of the vanilla limiter (without extensions)
    burst_limit, burst_msg = BURST_MAX, "too many request in BURST_WINDOW (BURST_MAX)"
    long_limit, long_msg = LONG_MAX, "too many request in LONG_WINDOW (LONG_MAX)"

    if cfg['botdetection.ip_limit.link_token']:

        if not link_token.is_suspicious(network, request, True):
            # this IP is no longer suspicious: release ip again / delete the counter of this IP
            drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
            return None

        # this IP is suspicious: count requests from this IP
        if hits('ip_limit.SUSPICIOUS_IP_WINDOW', SUSPICIOUS_IP_WINDOW) > SUSPICIOUS_IP_MAX:
            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
            return flask.redirect(flask.url_for('index'), code=302)

        # suspicious requests get the reduced rates
        burst_limit, burst_msg = (
            BURST_MAX_SUSPICIOUS,
            "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)",
        )
        long_limit, long_msg = (
            LONG_MAX_SUSPICIOUS,
            "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)",
        )

    if hits('ip_limit.BURST_WINDOW', BURST_WINDOW) > burst_limit:
        return too_many_requests(network, burst_msg)

    if hits('ip_limit.LONG_WINDOW', LONG_WINDOW) > long_limit:
        return too_many_requests(network, long_msg)

    return None
|
118
searx/botdetection/limiter.py
Normal file
118
searx/botdetection/limiter.py
Normal file
|
@ -0,0 +1,118 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _limiter src:
|
||||
|
||||
Limiter
|
||||
=======
|
||||
|
||||
.. sidebar:: info
|
||||
|
||||
The limiter requires a :ref:`Redis <settings redis>` database.
|
||||
|
||||
Bot protection / IP rate limitation. The intention of rate limitation is to
|
||||
limit suspicious requests from an IP. The motivation behind this is the fact
|
||||
that SearXNG passes through requests from bots and is thus classified as a bot
|
||||
itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked
|
||||
by the search engine (the origin) in some other way.
|
||||
|
||||
To avoid blocking, the requests from bots to SearXNG must also be blocked, this
|
||||
is the task of the limiter. To perform this task, the limiter uses the methods
|
||||
from the :py:obj:`searx.botdetection`.
|
||||
|
||||
To enable the limiter activate:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
server:
|
||||
...
|
||||
limiter: true # rate limit the number of request on the instance, block some bots
|
||||
|
||||
and set the redis-url connection. Check the value, it depends on your redis DB
|
||||
(see :ref:`settings redis`), by example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
redis:
|
||||
url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx.tools import config
|
||||
from searx import logger
|
||||
|
||||
from . import (
|
||||
http_accept,
|
||||
http_accept_encoding,
|
||||
http_accept_language,
|
||||
http_connection,
|
||||
http_user_agent,
|
||||
ip_limit,
|
||||
)
|
||||
|
||||
from ._helpers import (
|
||||
get_network,
|
||||
get_real_ip,
|
||||
dump_request,
|
||||
)
|
||||
|
||||
logger = logger.getChild('botdetection.limiter')

# configuration of the limiter, loaded on first use by get_cfg()
CFG: config.Config = None  # type: ignore

LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
"""Base configuration (schema) of the botdetection."""

LIMITER_CFG = Path('/etc/searxng/limiter.toml')
"""Local Limiter configuration."""

# deprecated configuration names, passed to config.Config.from_toml() by get_cfg()
CFG_DEPRECATED = {
    # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests.  Don't use it in your real project config."
}
|
||||
|
||||
|
||||
def get_cfg() -> config.Config:
    """Return the limiter configuration.

    On the first call the configuration is built from the schema
    :py:obj:`LIMITER_CFG_SCHEMA` and the local file :py:obj:`LIMITER_CFG`; the
    result is cached in :py:obj:`CFG` for all further calls.
    """
    global CFG  # pylint: disable=global-statement
    if CFG is not None:
        return CFG
    CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
    return CFG
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> werkzeug.Response | None:
    """Apply the bot detection methods to ``request``.

    - Requests from a link-local network and requests to ``/healthz`` are
      never filtered.
    - Every other request is checked by the ``http_user_agent`` method.
    - Requests to ``/search`` are additionally checked by ``http_accept``,
      ``http_accept_encoding``, ``http_accept_language``, ``http_connection``
      and ``ip_limit`` (in this order).

    Returns the response of the first method that rejects the request, or
    ``None`` if the request is not rejected.
    """
    cfg = get_cfg()
    real_ip = get_real_ip(request)
    network = get_network(real_ip, cfg)

    # NOTE(review): link-local networks leave here, so the ip_limit option
    # ``filter_link_local`` is never evaluated when ip_limit is reached through
    # this function.
    if network.is_link_local or request.path == '/healthz':
        return None

    methods = [http_user_agent]
    if request.path == '/search':
        # http_user_agent has already been applied above; running it a second
        # time on the same request can not change the result.
        methods += [
            http_accept,
            http_accept_encoding,
            http_accept_language,
            http_connection,
            ip_limit,
        ]

    for method in methods:
        val = method.filter_request(network, request, cfg)
        if val is not None:
            return val

    # pass the network as a logging argument (no f-string mixed into the
    # format string) and dump the request that was actually filtered
    logger.debug("OK %s: %s", network, dump_request(request))
    return None
|
22
searx/botdetection/limiter.toml
Normal file
22
searx/botdetection/limiter.toml
Normal file
|
@ -0,0 +1,22 @@
|
|||
[real_ip]
|
||||
|
||||
# Number of values to trust for X-Forwarded-For.
|
||||
|
||||
x_for = 1
|
||||
|
||||
# The prefix defines the number of leading bits in an address that are compared
|
||||
# to determine whether or not an address is part of a (client) network.
|
||||
|
||||
ipv4_prefix = 32
|
||||
ipv6_prefix = 48
|
||||
|
||||
[botdetection.ip_limit]
|
||||
|
||||
# To get unlimited access in a local network, by default link-local addresses
|
||||
# (networks) are not monitored by the ip_limit
|
||||
filter_link_local = false
|
||||
|
||||
# activate link_token method in the ip_limit method
|
||||
link_token = false
|
||||
|
||||
|
156
searx/botdetection/link_token.py
Normal file
156
searx/botdetection/link_token.py
Normal file
|
@ -0,0 +1,156 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``link_token``
|
||||
---------------------
|
||||
|
||||
The ``link_token`` method evaluates a request as :py:obj:`suspicious
|
||||
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
|
||||
client. By adding a random component (the token) in the URL, a bot can not send
|
||||
a ping by requesting a static URL.
|
||||
|
||||
.. note::
|
||||
|
||||
This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
|
||||
|
||||
To make use of this method, a flask URL route needs to be added:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@app.route('/client<token>.css', methods=['GET', 'POST'])
|
||||
def client_token(token=None):
|
||||
link_token.ping(request, token)
|
||||
return Response('', mimetype='text/css')
|
||||
|
||||
And in the HTML template from flask a stylesheet link is needed (the value of
|
||||
``link_token`` comes from :py:obj:`get_token`):
|
||||
|
||||
.. code:: html
|
||||
|
||||
<link rel="stylesheet"
|
||||
href="{{ url_for('client_token', token=link_token) }}"
|
||||
type="text/css" />
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import string
|
||||
import random
|
||||
import flask
|
||||
|
||||
from searx import logger
|
||||
from searx import redisdb
|
||||
from searx.redislib import secret_hash
|
||||
|
||||
from ._helpers import (
|
||||
get_network,
|
||||
get_real_ip,
|
||||
)
|
||||
|
||||
TOKEN_LIVE_TIME = 600
"""Lifetime (sec) of limiter's CSS token."""

PING_LIVE_TIME = 3600
"""Lifetime (sec) of the ping-key from a client (request)"""

PING_KEY = 'SearXNG_limiter.ping'
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""

TOKEN_KEY = 'SearXNG_limiter.token'
"""Key under which the current token is stored in the DB"""

logger = logger.getChild('botdetection.link_token')
|
||||
|
||||
|
||||
def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
    """Return ``True`` if no valid ping exists for this (client) network, in
    which case the request is rated as *suspicious*.

    If a valid ping exists and ``renew`` is ``True``, the expire time of the
    ping is reset to :py:obj:`PING_LIVE_TIME`.  Without a redis DB no request
    is rated as suspicious.

    """
    redis_client = redisdb.client()
    if not redis_client:
        return False

    ping_key = get_ping_key(network, request)

    if redis_client.get(ping_key):
        if renew:
            redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
        logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key)
        return False

    logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key)
    return True
|
||||
|
||||
|
||||
def ping(request: flask.Request, token: str):
    """Store a ping-key for the client of a request to ``/client<token>.css``.

    Nothing is stored without a redis DB or if ``token`` is not valid.  The
    ping-key (see :py:obj:`get_ping_key`, prefix :py:obj:`PING_KEY`) expires
    after :py:obj:`PING_LIVE_TIME`.

    """
    from . import limiter  # pylint: disable=import-outside-toplevel, cyclic-import

    redis_client = redisdb.client()
    if not (redis_client and token_is_valid(token)):
        return

    cfg = limiter.get_cfg()
    real_ip = get_real_ip(request)
    network = get_network(real_ip, cfg)
    ping_key = get_ping_key(network, request)

    logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key)
    redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
|
||||
|
||||
|
||||
def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str:
    """Build the hashed DB key that fits (more or less) to a *WEB-browser
    session* in a network.

    The hashed part is made of the compressed network address and the
    ``Accept-Language`` and ``User-Agent`` headers of the request (a missing
    header counts as empty string).  The result has the form
    ``PING_KEY[<hash>]``.
    """
    headers = request.headers
    fingerprint = "".join(
        (
            network.compressed,
            headers.get('Accept-Language', ''),
            headers.get('User-Agent', ''),
        )
    )
    return "".join((PING_KEY, "[", secret_hash(fingerprint), "]"))
|
||||
|
||||
|
||||
def token_is_valid(token) -> bool:
    """Return ``True`` if ``token`` equals the currently active token (see
    :py:obj:`get_token`).  The result is also written to the debug log."""
    current_token = get_token()
    is_match = token == current_token
    logger.debug("token is valid --> %s", is_match)
    return is_match
|
||||
|
||||
|
||||
def get_token() -> str:
    """Returns current token.  If there is no currently active token a new token
    is generated randomly and stored in the redis DB.

    - :py:obj:`TOKEN_LIVE_TIME`: expire time of a newly stored token
    - :py:obj:`TOKEN_KEY`: key under which the token is stored in the DB

    The token is what separates real browsers (which load the linked CSS) from
    bots, so it is drawn from the OS' CSPRNG (:py:mod:`secrets`) and not from
    the predictable Mersenne Twister of :py:mod:`random`.

    :return: the active token (16 chars of ``[a-z0-9]``) or the fixed dummy
        ``'12345678'`` when no redis client is available.
    """
    import secrets  # pylint: disable=import-outside-toplevel

    redis_client = redisdb.client()
    if not redis_client:
        # This function is also called when limiter is inactive / no redis DB
        # (see render function in webapp.py)
        return '12345678'

    token = redis_client.get(TOKEN_KEY)
    if token:
        # redis returns the stored value as bytes
        return token.decode('UTF-8')

    alphabet = string.ascii_lowercase + string.digits
    token = ''.join(secrets.choice(alphabet) for _ in range(16))
    redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
    return token
|
|
@ -1,119 +1,32 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
# pyright: basic
|
||||
"""Some bot protection / rate limitation
|
||||
"""see :ref:`limiter src`"""
|
||||
|
||||
To monitor rate limits and protect privacy the IP addresses are getting stored
|
||||
with a hash so the limiter plugin knows who to block. A redis database is
|
||||
needed to store the hash values.
|
||||
|
||||
Enable the plugin in ``settings.yml``:
|
||||
|
||||
- ``server.limiter: true``
|
||||
- ``redis.url: ...`` check the value, see :ref:`settings redis`
|
||||
"""
|
||||
|
||||
import re
|
||||
from flask import request
|
||||
import flask
|
||||
|
||||
from searx import redisdb
|
||||
from searx.plugins import logger
|
||||
from searx.redislib import incr_sliding_window
|
||||
from searx.botdetection import limiter
|
||||
|
||||
name = "Request limiter"
|
||||
description = "Limit the number of request"
|
||||
default_on = False
|
||||
preference_section = 'service'
|
||||
|
||||
logger = logger.getChild('limiter')
|
||||
|
||||
block_user_agent = re.compile(
|
||||
r'('
|
||||
+ r'unknown'
|
||||
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
|
||||
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
|
||||
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
|
||||
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
|
||||
+ r'|ZmEu|BLEXBot|bitlybot'
|
||||
# unmaintained Farside instances
|
||||
+ r'|'
|
||||
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
|
||||
+ '|.*PetalBot.*'
|
||||
+ r')'
|
||||
)
|
||||
|
||||
|
||||
def is_accepted_request() -> bool:
    """Decide whether the current flask ``request`` is served or blocked.

    - ``/healthz`` is always accepted.
    - Requests whose ``User-Agent`` matches ``block_user_agent`` are blocked.
    - For ``/search`` the request counters per ``X-Forwarded-For`` value are
      incremented (``incr_sliding_window``) and the request is blocked when a
      limit is exceeded or when the HTTP headers look like a bot: missing
      ``Accept-Language``, ``Connection: close``, neither ``gzip`` nor
      ``deflate`` in ``Accept-Encoding``, ``text/html`` not accepted.
      Non-HTML output formats get an additional, stricter limit.

    :return: ``True`` if the request is accepted, ``False`` if it has to be
        blocked.
    """
    # pylint: disable=too-many-return-statements
    redis_client = redisdb.client()
    user_agent = request.headers.get('User-Agent', 'unknown')
    x_forwarded_for = request.headers.get('X-Forwarded-For', '')

    if request.path == '/healthz':
        return True

    if block_user_agent.match(user_agent):
        logger.debug("BLOCK %s: %s --> detected User-Agent: %s", x_forwarded_for, request.path, user_agent)
        return False

    if request.path == '/search':

        # NOTE(review): last argument is presumably the window length in
        # seconds -- confirm against searx.redislib.incr_sliding_window
        c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
        c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
        if c_burst > 15 or c_10min > 150:
            logger.debug("BLOCK %s: to many request", x_forwarded_for)
            return False

        # A missing or blank header has to be blocked.  The former test
        # ``len(...) == ''`` compared an int with a str and was never true.
        if not request.headers.get('Accept-Language', '').strip():
            logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
            return False

        if request.headers.get('Connection') == 'close':
            logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
            return False

        accept_encoding_list = [enc.strip() for enc in request.headers.get('Accept-Encoding', '').split(',')]
        if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
            logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
            return False

        if 'text/html' not in request.accept_mimetypes:
            # accept_mimetypes is parsed from the Accept header
            logger.debug("BLOCK %s: Accept misses text/html", x_forwarded_for)
            return False

        if request.args.get('format', 'html') != 'html':
            c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
            if c > 4:
                logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
                return False

    # arguments are passed to the logger, so the message is only formatted
    # when debug logging is enabled
    logger.debug(
        "OK %s: '%s' || form: %s || Accept: %s || Accept-Language: %s || Accept-Encoding: %s"
        " || Content-Type: %s || Content-Length: %s || Connection: %s || User-Agent: %s",
        x_forwarded_for,
        request.path,
        request.form,
        request.headers.get('Accept', ''),
        request.headers.get('Accept-Language', ''),
        request.headers.get('Accept-Encoding', ''),
        request.headers.get('Content-Type', ''),
        request.headers.get('Content-Length', ''),
        request.headers.get('Connection', ''),
        user_agent,
    )

    return True
|
||||
|
||||
|
||||
def pre_request():
|
||||
if not is_accepted_request():
|
||||
return 'Too Many Requests', 429
|
||||
return None
|
||||
"""See :ref:`flask.Flask.before_request`"""
|
||||
return limiter.filter_request(flask.request)
|
||||
|
||||
|
||||
def init(app, settings):
|
||||
def init(app: flask.Flask, settings) -> bool:
|
||||
if not settings['server']['limiter']:
|
||||
return False
|
||||
|
||||
if not redisdb.client():
|
||||
logger.error("The limiter requires Redis") # pylint: disable=undefined-variable
|
||||
logger.error("The limiter requires Redis")
|
||||
return False
|
||||
|
||||
app.before_request(pre_request)
|
||||
return True
|
||||
|
|
|
@ -1,21 +1,11 @@
|
|||
'''
|
||||
searx is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
# pylint: disable=missing-module-docstring,invalid-name
|
||||
|
||||
searx is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
||||
|
||||
(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
|
||||
'''
|
||||
from flask_babel import gettext
|
||||
import re
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.botdetection._helpers import get_real_ip
|
||||
|
||||
name = gettext('Self Information')
|
||||
description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".')
|
||||
|
@ -28,18 +18,11 @@ query_examples = ''
|
|||
p = re.compile('.*user[ -]agent.*', re.IGNORECASE)
|
||||
|
||||
|
||||
# attach callback to the post search hook
|
||||
# request: flask request object
|
||||
# ctx: the whole local context of the pre search hook
|
||||
def post_search(request, search):
|
||||
if search.search_query.pageno > 1:
|
||||
return True
|
||||
if search.search_query.query == 'ip':
|
||||
x_forwarded_for = request.headers.getlist("X-Forwarded-For")
|
||||
if x_forwarded_for:
|
||||
ip = x_forwarded_for[0]
|
||||
else:
|
||||
ip = request.remote_addr
|
||||
ip = get_real_ip(request)
|
||||
search.result_container.answers['ip'] = {'answer': ip}
|
||||
elif p.match(search.search_query.query):
|
||||
ua = request.user_agent
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ url_for('static', filename='css/searxng.min.css') }}" type="text/css" media="screen" />
|
||||
{% endif %}
|
||||
{% if get_setting('server.limiter') %}
|
||||
<link rel="stylesheet" href="{{ url_for('client_token', token=link_token) }}" type="text/css" />
|
||||
{% endif %}
|
||||
{% block styles %}{% endblock %}
|
||||
<!--[if gte IE 9]>-->
|
||||
<script src="{{ url_for('static', filename='js/searxng.head.min.js') }}" client_settings="{{ client_settings }}"></script>
|
||||
|
|
8
searx/tools/__init__.py
Normal file
8
searx/tools/__init__.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _tools src:
|
||||
|
||||
A collection of *utilities* used by SearXNG, but without SearXNG specific
|
||||
peculiarities.
|
||||
|
||||
"""
|
376
searx/tools/config.py
Normal file
376
searx/tools/config.py
Normal file
|
@ -0,0 +1,376 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Configuration class :py:class:`Config` with deep-update, schema validation
|
||||
and deprecated names.
|
||||
|
||||
The :py:class:`Config` class implements a configuration that is based on
|
||||
structured dictionaries. The configuration schema is defined in a dictionary
|
||||
structure and the configuration data is given in a dictionary structure.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import typing
|
||||
import logging
|
||||
import pathlib
|
||||
import pytomlpp as toml
|
||||
|
||||
__all__ = ['Config', 'UNSET', 'SchemaIssue']
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FALSE:
    """Class of ``False`` singleton: instances are always falsy and carry a
    message that is used as their string representation."""

    # pylint: disable=multiple-statements

    def __init__(self, msg):
        # text returned by str() / repr()
        self.msg = msg

    def __str__(self):
        return self.msg

    def __repr__(self):
        return self.msg

    def __bool__(self):
        return False
|
||||
|
||||
|
||||
UNSET = FALSE('<UNSET>')
|
||||
|
||||
|
||||
class SchemaIssue(ValueError):
    """Exception to store and/or raise a message from a schema issue.

    :param level: severity of the issue (``'warn'`` or ``'invalid'``)
    :param msg: description of the issue
    """

    def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
        super().__init__(msg)
        self.level = level

    def __str__(self):
        message = self.args[0]
        return "[cfg schema {}] {}".format(self.level, message)
|
||||
|
||||
|
||||
class Config:
    """Base class used for configuration.

    The configuration data (``self.cfg``) starts as a deep copy of the schema
    (``self.cfg_schema``), i.e. the schema carries the default values.  Names
    are *dotted* paths into the nested dictionaries (``"foo.bar"``).
    """

    UNSET = UNSET

    @classmethod
    def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
        """Create a configuration from the TOML ``schema_file`` and update it
        by the TOML file ``cfg_file``.

        A missing ``cfg_file`` is only logged (warning), in this case the
        configuration holds the values of the schema.

        :raises toml.DecodeError: ``cfg_file`` can't be decoded (re-raised
            after logging).
        :raises TypeError: ``cfg_file`` does not validate against the schema.
        """

        # init schema

        log.debug("load schema file: %s", schema_file)
        new_cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
        if not cfg_file.exists():
            log.warning("missing config file: %s", cfg_file)
            return new_cfg

        # load configuration

        log.debug("load config file: %s", cfg_file)
        try:
            loaded = toml.load(cfg_file)
        except toml.DecodeError as exc:
            # flatten the multi-line decoder message into a single log line
            flat_msg = str(exc).replace('\t', '').replace('\n', ' ')
            log.error("%s: %s", cfg_file, flat_msg)
            raise

        valid, issues = new_cfg.validate(loaded)
        for issue in issues:
            log.error(str(issue))
        if not valid:
            raise TypeError(f"schema of {cfg_file} is invalid!")

        new_cfg.update(loaded)
        return new_cfg

    def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
        """Constructor of class Config.

        :param cfg_schema: Schema of the configuration
        :param deprecated: dictionary that maps deprecated configuration names to a messages

        These values are needed for validation, see :py:obj:`validate`.

        """
        self.deprecated = deprecated
        self.cfg_schema = cfg_schema
        # the schema itself must not be modified by updates of the config
        self.cfg = copy.deepcopy(cfg_schema)

    def __getitem__(self, key: str):
        return self.get(key)

    def validate(self, cfg: dict):
        """Validation of dictionary ``cfg`` on the schema of this
        configuration (``self.cfg_schema``).  Validation is done by the module
        level function :py:obj:`validate`."""

        return validate(self.cfg_schema, cfg, self.deprecated)

    def update(self, upd_cfg: dict):
        """Update this configuration by ``upd_cfg`` (see
        :py:obj:`dict_deepupdate`)."""

        dict_deepupdate(self.cfg, upd_cfg)

    def default(self, name: str):
        """Returns default value of field ``name`` in ``self.cfg_schema``."""
        return value(name, self.cfg_schema)

    def get(self, name: str, default=UNSET, replace=True):
        """Returns the value to which ``name`` points in the configuration.

        If there is no such ``name`` in the config and the ``default`` is
        :py:obj:`UNSET`, a :py:obj:`KeyError` is raised.

        If ``replace`` is true and the result is a string, the string is
        %-formatted with this configuration as mapping, i.e. a placeholder
        ``%(foo.bar)s`` is replaced by the value of ``foo.bar``.
        """

        leaf = name.rsplit('.', 1)[-1]
        found = self._get_parent_dict(name).get(leaf, UNSET)
        if found is UNSET and default is UNSET:
            raise KeyError(name)

        result = default if found is UNSET else found
        if replace and isinstance(result, str):
            return result % self
        return result

    def set(self, name: str, val):
        """Set the value to which ``name`` points in the configuration.

        A :py:obj:`KeyError` is raised if the parent of ``name`` does not
        exist in the config or is not a dictionary.
        """
        leaf = name.rsplit('.', 1)[-1]
        self._get_parent_dict(name)[leaf] = val

    def _get_parent_dict(self, name):
        """Returns the dictionary that holds the last part of ``name``, raises
        :py:obj:`KeyError` if there is no such dictionary."""
        parent_name = name.rpartition('.')[0]
        parent = value(parent_name, self.cfg) if parent_name else self.cfg
        # UNSET is not a dict: this also covers a parent that does not exist
        if not isinstance(parent, dict):
            raise KeyError(parent_name)
        return parent

    def path(self, name: str, default=UNSET):
        """Get a :py:class:`pathlib.Path` object from a config string."""

        raw = self.get(name, default)
        if raw is not UNSET:
            return pathlib.Path(str(raw))
        if default is UNSET:
            raise KeyError(name)
        return default

    def pyobj(self, name, default=UNSET):
        """Get the python object referred to by the fully qualified name (FQN)
        in the config string."""

        fqn = self.get(name, default)
        if fqn is not UNSET:
            module_name, obj_name = str(fqn).rsplit('.', 1)
            module = __import__(module_name, {}, {}, [obj_name], 0)
            return getattr(module, obj_name)
        if default is UNSET:
            raise KeyError(name)
        return default
|
||||
|
||||
|
||||
# working with dictionaries
|
||||
|
||||
|
||||
def value(name: str, data_dict: dict):
    """Returns the value to which ``name`` points in the ``data_dict``.

    ``name`` is a dotted path, each part is the key of one nesting level.  If
    the path does not exist :py:obj:`UNSET` is returned, this includes a path
    that continues *below* a value that is not a dictionary.

    .. code:: python

        >>> data_dict = {
                "foo": {"bar": 1 },
                "bar": {"foo": 2 },
                "foobar": [1, 2, 3],
            }
        >>> value('foobar', data_dict)
        [1, 2, 3]
        >>> value('foo.bar', data_dict)
        1
        >>> value('foo.bar.xxx', data_dict)
        <UNSET>

    """

    ret_val = data_dict
    for part in name.split('.'):
        if not isinstance(ret_val, dict):
            # There are still parts left in the path, but the value reached so
            # far is a leaf: the name does not exist (before this check the
            # leaf was silently returned for e.g. 'foo.bar.xxx').
            return UNSET
        ret_val = ret_val.get(part, UNSET)
        if ret_val is UNSET:
            break
    return ret_val
|
||||
|
||||
|
||||
def validate(
    schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
) -> typing.Tuple[bool, list]:
    """Deep validation of dictionary in ``data_dict`` against dictionary in
    ``schema_dict``.  Argument deprecated is a dictionary that maps deprecated
    configuration names to a messages::

        deprecated = {
            "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
            "..."     : "..."
        }

    The function returns a python tuple ``(is_valid, issue_list)``:

    ``is_valid``:
      A bool value indicating ``data_dict`` is valid or not.

    ``issue_list``:
      A list of messages (:py:obj:`SchemaIssue`) from the validation::

          [cfg schema warn] data_dict 'foo.bar': deprecated - <DEPRECATED['foo.bar']>
          [cfg schema invalid] data_dict 'foo.bar': key unknown in schema_dict
          [cfg schema invalid] data_dict: type mismatch 'foo.bar': expected ..., is: ...

    If ``schema_dict`` or ``data_dict`` is not a dictionary type a
    :py:obj:`SchemaIssue` is raised.

    """
    # path of the key under validation, the recursion starts at the root
    names: typing.List = []

    if not isinstance(schema_dict, dict):
        raise SchemaIssue('invalid', "schema_dict is not a dict type")
    if not isinstance(data_dict, dict):
        raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type")

    result_ok, issues = _validate(names, [], schema_dict, data_dict, deprecated)
    return result_ok, issues
|
||||
|
||||
|
||||
def _validate(
    names: typing.List,
    issue_list: typing.List,
    schema_dict: typing.Dict,
    data_dict: typing.Dict,
    deprecated: typing.Dict[str, str],
) -> typing.Tuple[bool, typing.List]:
    """Recursive worker of :py:obj:`validate`.

    :param names: keys of the path from the root to ``data_dict``; extended
        by the key under validation and restored before the next key.
    :param issue_list: collects the :py:obj:`SchemaIssue` objects (modified
        in place, shared by all recursion levels).
    :param schema_dict: the complete (root) schema.
    :param data_dict: the (sub-) dictionary to validate.
    :param deprecated: maps deprecated (dotted) names to a message.
    :return: tuple ``(is_valid, issue_list)``
    """

    all_valid = True

    for key, given in data_dict.items():

        names.append(key)
        dotted = '.'.join(names)

        # deprecated names only cause a warning
        hint = deprecated.get(dotted)
        if hint:
            issue_list.append(SchemaIssue('warn', f"data_dict '{dotted}': deprecated - {hint}"))

        expected = value(dotted, schema_dict)

        if expected is UNSET:
            # unknown keys are tolerated as long as they are deprecated names
            if not hint:
                issue_list.append(SchemaIssue('invalid', f"data_dict '{dotted}': key unknown in schema_dict"))
                all_valid = False

        elif type(expected) is not type(given):
            # exact type match is required, subclasses do not count
            issue_list.append(
                SchemaIssue(
                    'invalid',
                    f"data_dict: type mismatch '{dotted}': expected {type(expected)}, is: {type(given)}",
                )
            )
            all_valid = False

        elif isinstance(given, dict):
            sub_valid, _ = _validate(names, issue_list, schema_dict, given, deprecated)
            if not sub_valid:
                all_valid = False

        names.pop()

    return all_valid, issue_list
|
||||
|
||||
|
||||
def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
    """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.

    For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:

    0. If ``upd_val`` is a dict, list or set and ``base_dict[upd_key]`` exists
       but is of a different type, raise a :py:obj:`TypeError` (the message
       names the dotted path of the offending key).

    1. If ``upd_val`` is a dict: recursively deep-update ``base_dict[upd_key]``
       by ``upd_val``.

    2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a
       (deep-) copy of ``upd_val``.

    3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
       list in ``upd_val``.

    4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
       ``upd_val``.

    5. Any other type of ``upd_val`` replaces (or adds) ``base_dict[upd_key]``
       by a shallow copy of ``upd_val``, whatever has been there before.

    :param base_dict: dictionary that is updated in place
    :param upd_dict: dictionary with the updates (not modified)
    :param names: keys of the path from the root to ``base_dict``; used for
        error messages only, callers normally omit it.
    :raises TypeError: ``base_dict`` / ``upd_dict`` is not a dictionary or
        case 0. from above.
    """
    # pylint: disable=too-many-branches
    if not isinstance(base_dict, dict):
        raise TypeError("argument 'base_dict' is not a dictionary type")
    if not isinstance(upd_dict, dict):
        raise TypeError("argument 'upd_dict' is not a dictionary type")

    if names is None:
        names = []

    for upd_key, upd_val in upd_dict.items():
        # For each upd_key & upd_val pair in upd_dict:

        # path of the key under update; ``names`` alone is only the path of
        # the parent and does not tell which key is the offending one
        key_path = names + [upd_key]

        if isinstance(upd_val, dict):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, recursively deep-update it
                if not isinstance(base_dict[upd_key], dict):
                    raise TypeError(
                        f"type mismatch {'.'.join(map(str, key_path))}: is not a dict type in base_dict"
                    )
                dict_deepupdate(base_dict[upd_key], upd_val, key_path)

            else:
                # if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val
                base_dict[upd_key] = copy.deepcopy(upd_val)

        elif isinstance(upd_val, list):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, base_dict[up_key] is extended by
                # the list from upd_val
                if not isinstance(base_dict[upd_key], list):
                    raise TypeError(
                        f"type mismatch {'.'.join(map(str, key_path))}: is not a list type in base_dict"
                    )
                base_dict[upd_key].extend(upd_val)

            else:
                # if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the
                # list in upd_val.
                base_dict[upd_key] = copy.deepcopy(upd_val)

        elif isinstance(upd_val, set):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val
                if not isinstance(base_dict[upd_key], set):
                    raise TypeError(
                        f"type mismatch {'.'.join(map(str, key_path))}: is not a set type in base_dict"
                    )
                base_dict[upd_key].update(upd_val)

            else:
                # if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the
                # set in upd_val
                base_dict[upd_key] = upd_val.copy()

        else:
            # for any other type of upd_val replace or add base_dict[upd_key] by a copy
            # of upd_val
            base_dict[upd_key] = copy.copy(upd_val)
|
|
@ -94,6 +94,7 @@ from searx.utils import (
|
|||
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
|
||||
from searx.query import RawTextQuery
|
||||
from searx.plugins import Plugin, plugins, initialize as plugin_initialize
|
||||
from searx.botdetection import link_token
|
||||
from searx.plugins.oa_doi_rewrite import get_doi_resolver
|
||||
from searx.preferences import (
|
||||
Preferences,
|
||||
|
@ -416,6 +417,7 @@ def render(template_name: str, **kwargs):
|
|||
kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint
|
||||
kwargs['cookies'] = request.cookies
|
||||
kwargs['errors'] = request.errors
|
||||
kwargs['link_token'] = link_token.get_token()
|
||||
|
||||
# values from the preferences
|
||||
kwargs['preferences'] = request.preferences
|
||||
|
@ -642,6 +644,12 @@ def health():
|
|||
return Response('OK', mimetype='text/plain')
|
||||
|
||||
|
||||
@app.route('/client<token>.css', methods=['GET', 'POST'])
def client_token(token=None):
    """Endpoint of the link-token *ping*: hand the request and the ``token``
    from the URL over to :py:obj:`link_token.ping` and answer with an empty
    stylesheet."""
    link_token.ping(request, token)
    empty_stylesheet = Response('', mimetype='text/css')
    return empty_stylesheet
|
||||
|
||||
|
||||
@app.route('/search', methods=['GET', 'POST'])
|
||||
def search():
|
||||
"""Search query in q and return results.
|
||||
|
|
|
@ -50,9 +50,13 @@ class SelfIPTest(SearxTestCase):
|
|||
self.assertTrue(len(store.plugins) == 1)
|
||||
|
||||
# IP test
|
||||
request = Mock(remote_addr='127.0.0.1')
|
||||
request.headers.getlist.return_value = []
|
||||
search = get_search_mock(query='ip', pageno=1)
|
||||
request = Mock()
|
||||
request.remote_addr = '127.0.0.1'
|
||||
request.headers = {'X-Forwarded-For': '1.2.3.4, 127.0.0.1', 'X-Real-IP': '127.0.0.1'}
|
||||
search = get_search_mock(
|
||||
query='ip',
|
||||
pageno=1,
|
||||
)
|
||||
store.call(store.plugins, 'post_search', request, search)
|
||||
self.assertTrue('127.0.0.1' in search.result_container.answers["ip"]["answer"])
|
||||
|
||||
|
@ -62,7 +66,6 @@ class SelfIPTest(SearxTestCase):
|
|||
|
||||
# User agent test
|
||||
request = Mock(user_agent='Mock')
|
||||
request.headers.getlist.return_value = []
|
||||
|
||||
search = get_search_mock(query='user-agent', pageno=1)
|
||||
store.call(store.plugins, 'post_search', request, search)
|
||||
|
@ -98,7 +101,6 @@ class HashPluginTest(SearxTestCase):
|
|||
self.assertTrue(len(store.plugins) == 1)
|
||||
|
||||
request = Mock(remote_addr='127.0.0.1')
|
||||
request.headers.getlist.return_value = []
|
||||
|
||||
# MD5
|
||||
search = get_search_mock(query='md5 test', pageno=1)
|
||||
|
|
Loading…
Reference in a new issue