From dba569462d0e9c4dbd77a54bb42ef5c3b1916142 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 19 Apr 2023 17:20:03 +0200 Subject: [PATCH 01/10] [mod] limiter: reduce request rates for requests without a ping Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 27 +++++++++++++++++++++++++-- searx/templates/simple/base.html | 3 +++ searx/webapp.py | 8 +++++++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 46c82f588..c7d74248b 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -18,7 +18,7 @@ from flask import request from searx import redisdb from searx.plugins import logger -from searx.redislib import incr_sliding_window +from searx.redislib import incr_sliding_window, secret_hash name = "Request limiter" description = "Limit the number of request" @@ -41,6 +41,18 @@ block_user_agent = re.compile( + r')' ) +PING_KEY = 'SearXNG_limiter.ping' +TOKEN_KEY = 'SearXNG_limiter.token' + + +def ping(): + redis_client = redisdb.client() + user_agent = request.headers.get('User-Agent', 'unknown') + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + + ping_key = PING_KEY + user_agent + x_forwarded_for + redis_client.set(secret_hash(ping_key), 1, ex=600) + def is_accepted_request() -> bool: # pylint: disable=too-many-return-statements @@ -57,9 +69,20 @@ def is_accepted_request() -> bool: if request.path == '/search': + c_burst_max = 2 + c_10min_max = 10 + + ping_key = PING_KEY + user_agent + x_forwarded_for + if redis_client.get(secret_hash(ping_key)): + logger.debug('got a ping') + c_burst_max = 15 + c_10min_max = 150 + else: + logger.debug('missing a ping') + c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) - if c_burst > 15 or c_10min > 150: + if c_burst > c_burst_max or c_10min > c_10min_max: logger.debug("BLOCK %s: to many 
request", x_forwarded_for) return False diff --git a/searx/templates/simple/base.html b/searx/templates/simple/base.html index a31ff07ee..dfe4ea265 100644 --- a/searx/templates/simple/base.html +++ b/searx/templates/simple/base.html @@ -17,6 +17,9 @@ {% else %} {% endif %} + {% if get_setting('server.limiter') %} + + {% endif %} {% block styles %}{% endblock %} diff --git a/searx/webapp.py b/searx/webapp.py index 79255652f..67265e542 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -93,7 +93,7 @@ from searx.utils import ( ) from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.query import RawTextQuery -from searx.plugins import Plugin, plugins, initialize as plugin_initialize +from searx.plugins import limiter, Plugin, plugins, initialize as plugin_initialize from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.preferences import ( Preferences, @@ -642,6 +642,12 @@ def health(): return Response('OK', mimetype='text/plain') +@app.route('/limiter.css', methods=['GET', 'POST']) +def limiter_css(): + limiter.ping() + return Response('', mimetype='text/css') + + @app.route('/search', methods=['GET', 'POST']) def search(): """Search query in q and return results. From 5226044c13817688a5ca3461743844dca4ed3d2b Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 19 Apr 2023 18:59:23 +0200 Subject: [PATCH 02/10] [mod] limiter: add random token to the limiter URL By adding a random component in the limiter URL a bot can no longer send a ping by requesting a static URL.
Related: https://github.com/searxng/searxng/pull/2357#issuecomment-1518525094 Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 25 ++++++++++++++++++++++++- searx/templates/simple/base.html | 2 +- searx/webapp.py | 8 +++++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index c7d74248b..69bd576d4 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -14,6 +14,8 @@ Enable the plugin in ``settings.yml``: """ import re +import string +import random from flask import request from searx import redisdb @@ -54,6 +56,27 @@ def ping(): redis_client.set(secret_hash(ping_key), 1, ex=600) +def get_token(): + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) + redis_client.set(TOKEN_KEY, token, ex=600) + return token + + +def token_is_valid(token): + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + def is_accepted_request() -> bool: # pylint: disable=too-many-return-statements redis_client = redisdb.client() @@ -83,7 +106,7 @@ def is_accepted_request() -> bool: c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) if c_burst > c_burst_max or c_10min > c_10min_max: - logger.debug("BLOCK %s: to many request", x_forwarded_for) + logger.debug("BLOCK %s: too many request", x_forwarded_for) return False if len(request.headers.get('Accept-Language', '').strip()) == '': diff --git a/searx/templates/simple/base.html b/searx/templates/simple/base.html index dfe4ea265..9f7cdbb8e 100644 --- 
a/searx/templates/simple/base.html +++ b/searx/templates/simple/base.html @@ -18,7 +18,7 @@ {% endif %} {% if get_setting('server.limiter') %} - + {% endif %} {% block styles %}{% endblock %} diff --git a/searx/webapp.py b/searx/webapp.py index 67265e542..815bfcabd 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -416,6 +416,7 @@ def render(template_name: str, **kwargs): kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint kwargs['cookies'] = request.cookies kwargs['errors'] = request.errors + kwargs['limiter_token'] = limiter.get_token() # values from the preferences kwargs['preferences'] = request.preferences @@ -642,9 +643,10 @@ def health(): return Response('OK', mimetype='text/plain') -@app.route('/limiter.css', methods=['GET', 'POST']) -def limiter_css(): - limiter.ping() +@app.route('/limiter.css', methods=['GET', 'POST']) +def limiter_css(token=None): + if limiter.token_is_valid(token): + limiter.ping() return Response('', mimetype='text/css') From 1ec325adccc427fe05cf08da9a2d9d63da7365f4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 23 May 2023 18:16:37 +0200 Subject: [PATCH 03/10] [mod] limiter -> botdetection: modularization and documentation In order to be able to meet the outstanding requirements, the implementation is modularized and supplemented with documentation. This patch does not contain functional changes, except it fixes issue #2455 ---- Activate limiter in the settings.yml and simulate a bot request by:: curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \ -H 'Accept: text/html' -H 'User-Agent: xyz' \ -H 'Accept-Encoding: gzip' \ 'http://127.0.0.1:8888/search?q=foo' In the LOG: DEBUG searx.botdetection.link_token : missing ping for this request: ..... Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two times before you get a "Too Many Requests" response.
Closes: https://github.com/searxng/searxng/issues/2455 Signed-off-by: Markus Heiser --- docs/admin/engines/settings.rst | 2 +- docs/src/searx.botdetection.rst | 45 ++++++ docs/src/searx.plugins.limiter.rst | 13 -- searx/botdetection/__init__.py | 26 ++++ searx/botdetection/http_accept.py | 24 ++++ searx/botdetection/http_accept_encoding.py | 26 ++++ searx/botdetection/http_accept_language.py | 23 +++ searx/botdetection/http_connection.py | 23 +++ searx/botdetection/http_user_agent.py | 54 +++++++ searx/botdetection/ip_limit.py | 90 ++++++++++++ searx/botdetection/limiter.py | 79 +++++++++++ searx/botdetection/link_token.py | 126 +++++++++++++++++ searx/plugins/limiter.py | 157 +++------------------ searx/templates/simple/base.html | 2 +- searx/webapp.py | 12 +- 15 files changed, 541 insertions(+), 161 deletions(-) create mode 100644 docs/src/searx.botdetection.rst delete mode 100644 docs/src/searx.plugins.limiter.rst create mode 100644 searx/botdetection/__init__.py create mode 100644 searx/botdetection/http_accept.py create mode 100644 searx/botdetection/http_accept_encoding.py create mode 100644 searx/botdetection/http_accept_language.py create mode 100644 searx/botdetection/http_connection.py create mode 100644 searx/botdetection/http_user_agent.py create mode 100644 searx/botdetection/ip_limit.py create mode 100644 searx/botdetection/limiter.py create mode 100644 searx/botdetection/link_token.py diff --git a/docs/admin/engines/settings.rst b/docs/admin/engines/settings.rst index f9a1dad4f..63478f441 100644 --- a/docs/admin/engines/settings.rst +++ b/docs/admin/engines/settings.rst @@ -235,7 +235,7 @@ Global Settings ``limiter`` : Rate limit the number of request on the instance, block some bots. The - :ref:`limiter plugin` requires a :ref:`settings redis` database. + :ref:`limiter src` requires a :ref:`settings redis` database. .. 
_image_proxy: diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst new file mode 100644 index 000000000..85e0ce4cd --- /dev/null +++ b/docs/src/searx.botdetection.rst @@ -0,0 +1,45 @@ +.. _botdetection: + +============= +Bot Detection +============= + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.botdetection + :members: + +.. automodule:: searx.botdetection.limiter + :members: + + +Rate limit +========== + +.. automodule:: searx.botdetection.ip_limit + :members: + +.. automodule:: searx.botdetection.link_token + :members: + + +Probe HTTP headers +================== + +.. automodule:: searx.botdetection.http_accept + :members: + +.. automodule:: searx.botdetection.http_accept_encoding + :members: + +.. automodule:: searx.botdetection.http_accept_language + :members: + +.. automodule:: searx.botdetection.http_connection + :members: + +.. automodule:: searx.botdetection.http_user_agent + :members: diff --git a/docs/src/searx.plugins.limiter.rst b/docs/src/searx.plugins.limiter.rst deleted file mode 100644 index 75d06f5c2..000000000 --- a/docs/src/searx.plugins.limiter.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _limiter plugin: - -============== -Limiter Plugin -============== - -.. sidebar:: info - - The :ref:`limiter plugin` requires a :ref:`Redis ` database. - -.. automodule:: searx.plugins.limiter - :members: - diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py new file mode 100644 index 000000000..78a7d30f3 --- /dev/null +++ b/searx/botdetection/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection src: + +Bot detection methods +--------------------- + +The methods implemented in this python package are use by the :ref:`limiter src`. 
+ +""" + +import flask + + +def dump_request(request: flask.Request): + return ( + "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path) + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py new file mode 100644 index 000000000..1ab7cb4c1 --- /dev/null +++ b/searx/botdetection/http_accept.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept`` +---------------------- + +The ``http_accept`` method evaluates a request as the request of a bot if the +Accept_ header .. + +- did not contain ``text/html`` + +.. _Accept: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if 'text/html' not in request.accept_mimetypes: + return 429, "bot detected, HTTP header Accept did not contain text/html" + return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py new file mode 100644 index 000000000..ae630fd68 --- /dev/null +++ b/searx/botdetection/http_accept_encoding.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_encoding`` +------------------------------- + +The ``http_accept_encoding`` method evaluates a request as the request of a +bot if the Accept-Encoding_ header .. 
+ +- did not contain ``gzip`` AND ``deflate`` (if both values are missed) +- did not contain ``text/html`` + +.. _Accept-Encoding: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if not ('gzip' in accept_list or 'deflate' in accept_list): + return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py new file mode 100644 index 000000000..06743802e --- /dev/null +++ b/searx/botdetection/http_accept_language.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_language`` +------------------------------- + +The ``http_accept_language`` method evaluates a request as the request of a bot +if the Accept-Language_ header is unset. + +.. _Accept-Language: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Accept-Language', '').strip() == '': + return 429, "bot detected, missing HTTP header Accept-Language" + return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py new file mode 100644 index 000000000..f61f5e48c --- /dev/null +++ b/searx/botdetection/http_connection.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_connection`` +-------------------------- + +The ``http_connection`` method evaluates a request as the request of a bot if +the Connection_ header is set to ``close``. + +.. 
_Connection: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Connection', '').strip() == 'close': + return 429, "bot detected, HTTP header 'Connection=close'" + return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..892ae0bd9 --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. _User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + +from typing import Optional, Tuple +import re +import flask + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! 
Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return ( + 429, + f"bot detected, HTTP header User-Agent: {user_agent}", + ) + return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py new file mode 100644 index 000000000..fce3f8b67 --- /dev/null +++ b/searx/botdetection/ip_limit.py @@ -0,0 +1,90 @@ +""" +Method ``ip_limit`` +------------------- + +The ``ip_limit`` method counts request from an IP in *sliding windows*. If +there are to many requests in a sliding window, the request is evaluated as a +bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ +header. To take privacy only the hash value of an IP is stored in the redis DB +and at least for a maximum of 10 minutes. + +The :py:obj:`link_token` method is used to investigate whether a request is +*suspicious*. If the :py:obj:`link_token` method is activated and a request is +*suspicious* the request rates are reduced: + +- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` +- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` + +.. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +from typing import Optional, Tuple +import flask + +from searx import redisdb +from searx import logger +from searx.redislib import incr_sliding_window + +from . import link_token + +logger = logger.getChild('botdetection.ip_limit') + +BURST_WINDOW = 20 +"""Time (sec) before sliding window for *burst* requests expires.""" + +BURST_MAX = 15 +"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" + +BURST_MAX_SUSPICIOUS = 2 +"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" + +LONG_WINDOW = 600 +"""Time (sec) before the longer sliding window expires.""" + +LONG_MAX = 150 +"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" + +LONG_MAX_SUSPICIOUS = 10 +"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" + +API_WONDOW = 3600 +"""Time (sec) before sliding window for API requests (format != html) expires.""" + +API_MAX = 4 +"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + redis_client = redisdb.client() + + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + if not x_forwarded_for: + logger.error("missing HTTP header X-Forwarded-For") + + if request.args.get('format', 'html') != 'html': + c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + if c > API_MAX: + return 429, "BLOCK %s: API limit exceeded" + + suspicious = link_token.is_suspicious(request) + + if suspicious: + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX_SUSPICIOUS: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX_SUSPICIOUS: + return 429, f"bot detected, too 
many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + + else: + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" + return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py new file mode 100644 index 000000000..71044c312 --- /dev/null +++ b/searx/botdetection/limiter.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _limiter src: + +Limiter +======= + +.. sidebar:: info + + The limiter requires a :ref:`Redis ` database. + +Bot protection / IP rate limitation. The intention of rate limitation is to +limit suspicious requests from an IP. The motivation behind this is the fact +that SearXNG passes through requests from bots and is thus classified as a bot +itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked +by the search engine (the origin) in some other way. + +To avoid blocking, the requests from bots to SearXNG must also be blocked, this +is the task of the limiter. To perform this task, the limiter uses the methods +from the :py:obj:`searx.botdetection`. + +To enable the limiter activate: + +.. code:: yaml + + server: + ... + limiter: true # rate limit the number of request on the instance, block some bots + +and set the redis-url connection. Check the value, it depends on your redis DB +(see :ref:`settings redis`), by example: + +.. 
code:: yaml + + redis: + url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 + +""" + +from typing import Optional, Tuple +import flask + +from searx.botdetection import ( + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, +) + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + + if request.path == '/healthz': + return None + + for func in [ + http_user_agent, + ]: + val = func.filter_request(request) + if val is not None: + return val + + if request.path == '/search': + + for func in [ + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, + ]: + val = func.filter_request(request) + if val is not None: + return val + + return None diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py new file mode 100644 index 000000000..8ef215f6c --- /dev/null +++ b/searx/botdetection/link_token.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``link_token`` +--------------------- + +The ``link_token`` method evaluates a request as :py:obj:`suspicious +` if the URL ``/client.css`` is not requested by the +client. By adding a random component (the token) in the URL a bot can not send +a ping by request a static URL. + +.. note:: + + This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. + +To get in use of this method a flask URL route needs to be added: + +.. code:: python + + @app.route('/client.css', methods=['GET', 'POST']) + def client_token(token=None): + link_token.ping(request, token) + return Response('', mimetype='text/css') + +And in the HTML template from flask a stylesheet link is needed (the value of +``link_token`` comes from :py:obj:`get_token`): + +.. code:: html + + + +.. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +import string +import random +import flask + +from searx import logger +from searx import redisdb +from searx.redislib import secret_hash + +TOKEN_LIVE_TIME = 600 +"""Livetime (sec) of limiter's CSS token.""" + +PING_KEY = 'SearXNG_limiter.ping' +TOKEN_KEY = 'SearXNG_limiter.token' + +logger = logger.getChild('botdetection.link_token') + + +def is_suspicious(request: flask.Request): + """Checks if there is a valid ping for this request, if not this request is + rated as *suspicious*""" + redis_client = redisdb.client() + if not redis_client: + return False + + ping_key = get_ping_key(request) + if not redis_client.get(ping_key): + logger.warning( + "missing ping (IP: %s) / request: %s", + request.headers.get('X-Forwarded-For', ''), + ping_key, + ) + return True + + logger.debug("found ping for this request: %s", ping_key) + return False + + +def ping(request: flask.Request, token: str): + """This function is called by a request to URL ``/client.css``""" + redis_client = redisdb.client() + if not redis_client: + return + if not token_is_valid(token): + return + ping_key = get_ping_key(request) + logger.debug("store ping for: %s", ping_key) + redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + + +def get_ping_key(request: flask.Request): + """Generates a hashed key that fits (more or less) to a request. At least + X-Forwarded-For_ is needed to be able to assign the request to an IP. + + """ + return secret_hash( + PING_KEY + + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + +def token_is_valid(token) -> bool: + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + +def get_token() -> str: + """Returns current token. If there is no currently active token a new token + is generated randomly and stored in the redis DB. 
+ + - :py:obj:`TOKEN_LIVE_TIME` + - :py:obj:`TOKEN_KEY` + + """ + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) + return token diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 69bd576d4..d9566b92b 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -1,165 +1,42 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pyright: basic -"""Some bot protection / rate limitation +"""see :ref:`limiter src`""" -To monitor rate limits and protect privacy the IP addresses are getting stored -with a hash so the limiter plugin knows who to block. A redis database is -needed to store the hash values. - -Enable the plugin in ``settings.yml``: - -- ``server.limiter: true`` -- ``redis.url: ...`` check the value, see :ref:`settings redis` -""" - -import re -import string -import random -from flask import request +import flask from searx import redisdb from searx.plugins import logger -from searx.redislib import incr_sliding_window, secret_hash +from searx.botdetection import limiter +from searx.botdetection import dump_request name = "Request limiter" description = "Limit the number of request" default_on = False preference_section = 'service' + logger = logger.getChild('limiter') -block_user_agent = re.compile( - r'(' - + r'unknown' - + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' - + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' - + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! 
Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' - + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' - + r'|ZmEu|BLEXBot|bitlybot' - # unmaintained Farside instances - + r'|' - + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') - + '|.*PetalBot.*' - + r')' -) - -PING_KEY = 'SearXNG_limiter.ping' -TOKEN_KEY = 'SearXNG_limiter.token' - - -def ping(): - redis_client = redisdb.client() - user_agent = request.headers.get('User-Agent', 'unknown') - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - - ping_key = PING_KEY + user_agent + x_forwarded_for - redis_client.set(secret_hash(ping_key), 1, ex=600) - - -def get_token(): - redis_client = redisdb.client() - if not redis_client: - # This function is also called when limiter is inactive / no redis DB - # (see render function in webapp.py) - return '12345678' - token = redis_client.get(TOKEN_KEY) - if token: - token = token.decode('UTF-8') - else: - token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) - redis_client.set(TOKEN_KEY, token, ex=600) - return token - - -def token_is_valid(token): - valid = token == get_token() - logger.debug("token is valid --> %s", valid) - return valid - - -def is_accepted_request() -> bool: - # pylint: disable=too-many-return-statements - redis_client = redisdb.client() - user_agent = request.headers.get('User-Agent', 'unknown') - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - - if request.path == '/healthz': - return True - - if block_user_agent.match(user_agent): - logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent)) - return False - - if request.path == '/search': - - c_burst_max = 2 - c_10min_max = 10 - - ping_key = PING_KEY + user_agent + x_forwarded_for - if redis_client.get(secret_hash(ping_key)): - logger.debug('got a ping') - c_burst_max = 15 - c_10min_max = 150 - else: - 
logger.debug('missing a ping') - - c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) - c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) - if c_burst > c_burst_max or c_10min > c_10min_max: - logger.debug("BLOCK %s: too many request", x_forwarded_for) - return False - - if len(request.headers.get('Accept-Language', '').strip()) == '': - logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for) - return False - - if request.headers.get('Connection') == 'close': - logger.debug("BLOCK %s: got Connection=close", x_forwarded_for) - return False - - accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] - if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list: - logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for) - return False - - if 'text/html' not in request.accept_mimetypes: - logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for) - return False - - if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600) - if c > 4: - logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for) - return False - - logger.debug( - "OK %s: '%s'" % (x_forwarded_for, request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept', '') - + " || Accept-Language: %s" % request.headers.get('Accept-Language', '') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '') - + " || Content-Type: %s" % request.headers.get('Content-Type', '') - + " || Content-Length: %s" % request.headers.get('Content-Length', '') - + " || Connection: %s" % request.headers.get('Connection', '') - + " || User-Agent: %s" % user_agent - ) - - return True - def pre_request(): - if not is_accepted_request(): - return 'Too Many Requests', 429 + """See :ref:`flask.Flask.before_request`""" + + val = 
limiter.filter_request(flask.request) + if val is not None: + http_status, msg = val + client_ip = flask.request.headers.get('X-Forwarded-For', '') + logger.error("BLOCK (IP %s): %s" % (client_ip, msg)) + return 'Too Many Requests', http_status + + logger.debug("OK: %s" % dump_request(flask.request)) return None -def init(app, settings): +def init(app: flask.Flask, settings) -> bool: if not settings['server']['limiter']: return False - if not redisdb.client(): - logger.error("The limiter requires Redis") # pylint: disable=undefined-variable + logger.error("The limiter requires Redis") return False - app.before_request(pre_request) return True diff --git a/searx/templates/simple/base.html b/searx/templates/simple/base.html index 9f7cdbb8e..3c6ed11c7 100644 --- a/searx/templates/simple/base.html +++ b/searx/templates/simple/base.html @@ -18,7 +18,7 @@ {% endif %} {% if get_setting('server.limiter') %} - + {% endif %} {% block styles %}{% endblock %} diff --git a/searx/webapp.py b/searx/webapp.py index 815bfcabd..d6322447a 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -93,7 +93,8 @@ from searx.utils import ( ) from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.query import RawTextQuery -from searx.plugins import limiter, Plugin, plugins, initialize as plugin_initialize +from searx.plugins import Plugin, plugins, initialize as plugin_initialize +from searx.botdetection import link_token from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.preferences import ( Preferences, @@ -416,7 +417,7 @@ def render(template_name: str, **kwargs): kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint kwargs['cookies'] = request.cookies kwargs['errors'] = request.errors - kwargs['limiter_token'] = limiter.get_token() + kwargs['link_token'] = link_token.get_token() # values from the preferences kwargs['preferences'] = request.preferences @@ -643,10 +644,9 @@ def health(): return Response('OK', mimetype='text/plain') 
-@app.route('/limiter.css', methods=['GET', 'POST']) -def limiter_css(token=None): - if limiter.token_is_valid(token): - limiter.ping() +@app.route('/client.css', methods=['GET', 'POST']) +def client_token(token=None): + link_token.ping(request, token) return Response('', mimetype='text/css') From 66fdec0eb92bf11c0bc477d6fb1df3dc783e4dcb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 May 2023 17:24:43 +0200 Subject: [PATCH 04/10] [mod] limiter: add config file /etc/searxng/limiter.toml Signed-off-by: Markus Heiser --- requirements.txt | 1 + searx/botdetection/http_accept.py | 5 +- searx/botdetection/http_accept_encoding.py | 5 +- searx/botdetection/http_accept_language.py | 6 +- searx/botdetection/http_connection.py | 6 +- searx/botdetection/http_user_agent.py | 6 +- searx/botdetection/ip_limit.py | 11 +- searx/botdetection/limiter.py | 43 ++- searx/botdetection/limiter.toml | 3 + searx/plugins/limiter.py | 1 + searx/tools/__init__.py | 8 + searx/tools/config.py | 376 +++++++++++++++++++++ 12 files changed, 459 insertions(+), 12 deletions(-) create mode 100644 searx/botdetection/limiter.toml create mode 100644 searx/tools/__init__.py create mode 100644 searx/tools/config.py diff --git a/requirements.txt b/requirements.txt index 0bb3eafb0..9e3de3a46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ redis==4.5.5 markdown-it-py==2.2.0 typing_extensions==4.6.2 fasttext-predict==0.9.2.1 +pytomlpp==1.0.13 diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 1ab7cb4c1..23670a283 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -13,12 +13,15 @@ Accept_ header .. 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept """ +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if 'text/html' not in request.accept_mimetypes: return 429, "bot detected, HTTP header Accept did not contain text/html" return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index ae630fd68..191249711 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -14,12 +14,15 @@ bot if the Accept-Encoding_ header .. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding """ +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 06743802e..558a216cf 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -11,13 +11,15 @@ if the Accept-Language_ header is unset. 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent """ - +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if request.headers.get('Accept-Language', '').strip() == '': return 429, "bot detected, missing HTTP header Accept-Language" return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index f61f5e48c..0ef24a7b8 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -11,13 +11,15 @@ the Connection_ header is set to ``close``. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection """ - +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if request.headers.get('Connection', '').strip() == 'close': return 429, "bot detected, HTTP header 'Connection=close'" return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 892ae0bd9..3d1ec9173 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -12,11 +12,15 @@ the User-Agent_ header is unset or matches the regular expression https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent """ +# pylint: disable=unused-argument from typing import Optional, Tuple import re import flask +from searx.tools import config + + USER_AGENT = ( r'(' + r'unknown' @@ -44,7 +48,7 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> 
Optional[Tuple[int, str]]: user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): return ( diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index fce3f8b67..2646920c2 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -1,4 +1,5 @@ -""" +""".. _botdetection.ip_limit: + Method ``ip_limit`` ------------------- @@ -22,6 +23,8 @@ The :py:obj:`link_token` method is used to investigate whether a request is from typing import Optional, Tuple import flask +from searx.tools import config + from searx import redisdb from searx import logger @@ -56,7 +59,7 @@ API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: redis_client = redisdb.client() x_forwarded_for = request.headers.get('X-Forwarded-For', '') @@ -68,7 +71,9 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: if c > API_MAX: return 429, "BLOCK %s: API limit exceeded" - suspicious = link_token.is_suspicious(request) + suspicious = False + if cfg['botdetection.ip_limit.link_token']: + suspicious = link_token.is_suspicious(request) if suspicious: c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 71044c312..cc1e00b3c 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -38,8 +38,11 @@ and set the redis-url connection. 
Check the value, it depends on your redis DB """ from typing import Optional, Tuple +from pathlib import Path import flask +import pytomlpp as toml +from searx.tools import config from searx.botdetection import ( http_accept, http_accept_encoding, @@ -49,6 +52,42 @@ from searx.botdetection import ( ip_limit, ) +LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" +"""Base configuration (schema) of the botdetection.""" + +LIMITER_CFG = Path('/etc/searxng/limiter.toml') +"""Lokal Limiter configuration.""" + +CFG_DEPRECATED = { + # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." +} + +CFG = config.Config({}, {}) + + +def init_cfg(log): + global CFG # pylint: disable=global-statement + CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) + + if not LIMITER_CFG.exists(): + log.warning("missing config file: %s", LIMITER_CFG) + return + + log.warning("load config file: %s", LIMITER_CFG) + try: + upd_cfg = toml.load(LIMITER_CFG) + except toml.DecodeError as exc: + msg = str(exc).replace('\t', '').replace('\n', ' ') + log.error("%s: %s", LIMITER_CFG, msg) + raise + + is_valid, issue_list = CFG.validate(upd_cfg) + for msg in issue_list: + log.error(str(msg)) + if not is_valid: + raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") + CFG.update(upd_cfg) + def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: @@ -58,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val @@ -72,7 +111,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val diff --git a/searx/botdetection/limiter.toml 
b/searx/botdetection/limiter.toml new file mode 100644 index 000000000..30cd1b53c --- /dev/null +++ b/searx/botdetection/limiter.toml @@ -0,0 +1,3 @@ +[botdetection.ip_limit] + +link_token = true \ No newline at end of file diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index d9566b92b..92b0aa2a0 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -38,5 +38,6 @@ def init(app: flask.Flask, settings) -> bool: if not redisdb.client(): logger.error("The limiter requires Redis") return False + limiter.init_cfg(logger) app.before_request(pre_request) return True diff --git a/searx/tools/__init__.py b/searx/tools/__init__.py new file mode 100644 index 000000000..08e6d982f --- /dev/null +++ b/searx/tools/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _tools src: + +A collection of *utilities* used by SearXNG, but without SearXNG specific +peculiarities. + +""" diff --git a/searx/tools/config.py b/searx/tools/config.py new file mode 100644 index 000000000..f998031ba --- /dev/null +++ b/searx/tools/config.py @@ -0,0 +1,376 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Configuration class :py:class:`Config` with deep-update, schema validation +and deprecated names. + +The :py:class:`Config` class implements a configuration that is based on +structured dictionaries. The configuration schema is defined in a dictionary +structure and the configuration data is given in a dictionary structure. 
+""" +from __future__ import annotations + +import copy +import typing +import logging +import pathlib +import pytomlpp as toml + +__all__ = ['Config', 'UNSET', 'SchemaIssue'] + +log = logging.getLogger(__name__) + + +class FALSE: + """Class of ``False`` singelton""" + + # pylint: disable=multiple-statements + def __init__(self, msg): + self.msg = msg + + def __bool__(self): + return False + + def __str__(self): + return self.msg + + __repr__ = __str__ + + +UNSET = FALSE('') + + +class SchemaIssue(ValueError): + """Exception to store and/or raise a message from a schema issue.""" + + def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str): + self.level = level + super().__init__(msg) + + def __str__(self): + return f"[cfg schema {self.level}] {self.args[0]}" + + +class Config: + """Base class used for configuration""" + + UNSET = UNSET + + @classmethod + def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config: + + # init schema + + log.debug("load schema file: %s", schema_file) + cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated) + if not cfg_file.exists(): + log.warning("missing config file: %s", cfg_file) + return cfg + + # load configuration + + log.debug("load config file: %s", cfg_file) + try: + upd_cfg = toml.load(cfg_file) + except toml.DecodeError as exc: + msg = str(exc).replace('\t', '').replace('\n', ' ') + log.error("%s: %s", cfg_file, msg) + raise + + is_valid, issue_list = cfg.validate(upd_cfg) + for msg in issue_list: + log.error(str(msg)) + if not is_valid: + raise TypeError(f"schema of {cfg_file} is invalid!") + cfg.update(upd_cfg) + return cfg + + def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]): + """Construtor of class Config. + + :param cfg_schema: Schema of the configuration + :param deprecated: dictionary that maps deprecated configuration names to a messages + + These values are needed for validation, see :py:obj:`validate`. 
+ + """ + self.cfg_schema = cfg_schema + self.deprecated = deprecated + self.cfg = copy.deepcopy(cfg_schema) + + def __getitem__(self, key: str): + return self.get(key) + + def validate(self, cfg: dict): + """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`. + Validation is done by :py:obj:`validate`.""" + + return validate(self.cfg_schema, cfg, self.deprecated) + + def update(self, upd_cfg: dict): + """Update this configuration by ``upd_cfg``.""" + + dict_deepupdate(self.cfg, upd_cfg) + + def default(self, name: str): + """Returns default value of field ``name`` in ``self.cfg_schema``.""" + return value(name, self.cfg_schema) + + def get(self, name: str, default=UNSET, replace=True): + """Returns the value to which ``name`` points in the configuration. + + If there is no such ``name`` in the config and the ``default`` is + :py:obj:`UNSET`, a :py:obj:`KeyError` is raised. + """ + + parent = self._get_parent_dict(name) + val = parent.get(name.split('.')[-1], UNSET) + if val is UNSET: + if default is UNSET: + raise KeyError(name) + val = default + + if replace and isinstance(val, str): + val = val % self + return val + + def set(self, name: str, val): + """Set the value to which ``name`` points in the configuration. + + If there is no such ``name`` in the config, a :py:obj:`KeyError` is + raised. 
+ """ + parent = self._get_parent_dict(name) + parent[name.split('.')[-1]] = val + + def _get_parent_dict(self, name): + parent_name = '.'.join(name.split('.')[:-1]) + if parent_name: + parent = value(parent_name, self.cfg) + else: + parent = self.cfg + if (parent is UNSET) or (not isinstance(parent, dict)): + raise KeyError(parent_name) + return parent + + def path(self, name: str, default=UNSET): + """Get a :py:class:`pathlib.Path` object from a config string.""" + + val = self.get(name, default) + if val is UNSET: + if default is UNSET: + raise KeyError(name) + return default + return pathlib.Path(str(val)) + + def pyobj(self, name, default=UNSET): + """Get python object refered by full qualiffied name (FQN) in the config + string.""" + + fqn = self.get(name, default) + if fqn is UNSET: + if default is UNSET: + raise KeyError(name) + return default + (modulename, name) = str(fqn).rsplit('.', 1) + m = __import__(modulename, {}, {}, [name], 0) + return getattr(m, name) + + +# working with dictionaries + + +def value(name: str, data_dict: dict): + """Returns the value to which ``name`` points in the ``dat_dict``. + + .. code: python + + >>> data_dict = { + "foo": {"bar": 1 }, + "bar": {"foo": 2 }, + "foobar": [1, 2, 3], + } + >>> value('foobar', data_dict) + [1, 2, 3] + >>> value('foo.bar', data_dict) + 1 + >>> value('foo.bar.xxx', data_dict) + + + """ + + ret_val = data_dict + for part in name.split('.'): + if isinstance(ret_val, dict): + ret_val = ret_val.get(part, UNSET) + if ret_val is UNSET: + break + return ret_val + + +def validate( + schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str] +) -> typing.Tuple[bool, list]: + + """Deep validation of dictionary in ``data_dict`` against dictionary in + ``schema_dict``. Argument deprecated is a dictionary that maps deprecated + configuration names to a messages:: + + deprecated = { + "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'", + "..." : "..." 
+ } + + The function returns a python tuple ``(is_valid, issue_list)``: + + ``is_valid``: + A bool value indicating ``data_dict`` is valid or not. + + ``issue_list``: + A list of messages (:py:obj:`SchemaIssue`) from the validation:: + + [schema warn] data_dict: deprecated 'fontlib.foo': + [schema invalid] data_dict: key unknown 'fontlib.foo' + [schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ... + + If ``schema_dict`` or ``data_dict`` is not a dictionary type a + :py:obj:`SchemaIssue` is raised. + + """ + names = [] + is_valid = True + issue_list = [] + + if not isinstance(schema_dict, dict): + raise SchemaIssue('invalid', "schema_dict is not a dict type") + if not isinstance(data_dict, dict): + raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type") + + is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated) + return is_valid, issue_list + + +def _validate( + names: typing.List, + issue_list: typing.List, + schema_dict: typing.Dict, + data_dict: typing.Dict, + deprecated: typing.Dict[str, str], +) -> typing.Tuple[bool, typing.List]: + + is_valid = True + + for key, data_value in data_dict.items(): + + names.append(key) + name = '.'.join(names) + + deprecated_msg = deprecated.get(name) + # print("XXX %s: key %s // data_value: %s" % (name, key, data_value)) + if deprecated_msg: + issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}")) + + schema_value = value(name, schema_dict) + # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value)) + if schema_value is UNSET: + if not deprecated_msg: + issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict")) + is_valid = False + + elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck + issue_list.append( + SchemaIssue( + 'invalid', + (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: 
{type(data_value)}"), + ) + ) + is_valid = False + + elif isinstance(data_value, dict): + _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated) + is_valid = is_valid and _valid + names.pop() + + return is_valid, issue_list + + +def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None): + """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``. + + For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``: + + 0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a + :py:obj:`TypeError`. + + 1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``. + + 2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a + (deep-) copy of ``upd_val``. + + 3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the + list in ``upd_val``. + + 4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in + ``upd_val``. + """ + # pylint: disable=too-many-branches + if not isinstance(base_dict, dict): + raise TypeError("argument 'base_dict' is not a ditionary type") + if not isinstance(upd_dict, dict): + raise TypeError("argument 'upd_dict' is not a ditionary type") + + if names is None: + names = [] + + for upd_key, upd_val in upd_dict.items(): + # For each upd_key & upd_val pair in upd_dict: + + if isinstance(upd_val, dict): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, recursively deep-update it + if not isinstance(base_dict[upd_key], dict): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict") + dict_deepupdate( + base_dict[upd_key], + upd_val, + names + + [ + upd_key, + ], + ) + + else: + # if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val + base_dict[upd_key] = copy.deepcopy(upd_val) + + elif isinstance(upd_val, list): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, base_dict[up_key] is extended by + # the list from upd_val + if not 
isinstance(base_dict[upd_key], list): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict") + base_dict[upd_key].extend(upd_val) + + else: + # if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the + # list in upd_val. + base_dict[upd_key] = copy.deepcopy(upd_val) + + elif isinstance(upd_val, set): + + if upd_key in base_dict: + # if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val + if not isinstance(base_dict[upd_key], set): + raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict") + base_dict[upd_key].update(upd_val.copy()) + + else: + # if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the + # set in upd_val + base_dict[upd_key] = upd_val.copy() + + else: + # for any other type of upd_val replace or add base_dict[upd_key] by a copy + # of upd_val + base_dict[upd_key] = copy.copy(upd_val) From 9d7456fd6c49fbd96f03f6a5dedd6ba05e924d0a Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 27 May 2023 18:58:06 +0200 Subject: [PATCH 05/10] [fix] limiter.toml: botdetection.ip_limit turn off link_token by default To activate the ``link_token`` method in the ``ip_limit`` method add the following to your ``/etc/searxng/limiter.toml``:: [botdetection.ip_limit] link_token = true Related: https://github.com/searxng/searxng/pull/2357#issuecomment-1554116941 Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 15 ++++++++++++--- searx/botdetection/limiter.toml | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 2646920c2..e72015190 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -9,9 +9,18 @@ bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. To take privacy only the hash value of an IP is stored in the redis DB and at least for a maximum of 10 minutes. 
-The :py:obj:`link_token` method is used to investigate whether a request is -*suspicious*. If the :py:obj:`link_token` method is activated and a request is -*suspicious* the request rates are reduced: +The :py:obj:`.link_token` method can be used to investigate whether a request is +*suspicious*. To activate the :py:obj:`.link_token` method in the +:py:obj:`.ip_limit` method add the following to your +``/etc/searxng/limiter.toml``: + +.. code:: toml + + [botdetection.ip_limit] + link_token = true + +If the :py:obj:`.link_token` method is activated and a request is *suspicious* +the request rates are reduced: - :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` - :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index 30cd1b53c..28c4e7589 100644 --- a/searx/botdetection/limiter.toml +++ b/searx/botdetection/limiter.toml @@ -1,3 +1,3 @@ [botdetection.ip_limit] -link_token = true \ No newline at end of file +link_token = false \ No newline at end of file From 52f1452c09ab2ec74aa5898d9ea749f33a71a814 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 27 May 2023 21:36:34 +0200 Subject: [PATCH 06/10] [mod] limiter: ip_limit - monitor suspicious IPs To intercept bots that get their IPs from a range of IPs, there is a ``SUSPICIOUS_IP_WINDOW``. In this window the suspicious IPs are stored for a longer time. IPs stored in this sliding window have a maximum of ``SUSPICIOUS_IP_MAX`` accesses before they are blocked. As soon as the IP makes a request that is not suspicious, the sliding window for this IP is dropped. 
Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index e72015190..9cffff7f0 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -25,6 +25,13 @@ the request rates are reduced: - :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` - :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` +To intercept bots that get their IPs from a range of IPs, there is a +:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored +for a longer time. IPs stored in this sliding window have a maximum of +:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP +makes a request that is not suspicious, the sliding window for this IP is +droped. + .. _X-Forwarded-For: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For @@ -37,7 +44,7 @@ from searx.tools import config from searx import redisdb from searx import logger -from searx.redislib import incr_sliding_window +from searx.redislib import incr_sliding_window, drop_counter from . 
import link_token @@ -67,6 +74,12 @@ API_WONDOW = 3600 API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" +SUSPICIOUS_IP_WINDOW = 3600 * 24 +"""Time (sec) before sliding window for one suspicious IP expires.""" + +SUSPICIOUS_IP_MAX = 3 +"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" + def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: redis_client = redisdb.client() @@ -81,10 +94,18 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple return 429, "BLOCK %s: API limit exceeded" suspicious = False + suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for + if cfg['botdetection.ip_limit.link_token']: suspicious = link_token.is_suspicious(request) if suspicious: + + # this IP is suspicious: count requests from this IP + c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW) + if c > SUSPICIOUS_IP_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW" + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" @@ -94,6 +115,11 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" else: + + if cfg['botdetection.ip_limit.link_token']: + # this IP is no longer suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, suspicious_ip_counter) + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) if c > BURST_MAX: return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" From b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 28 
May 2023 18:58:31 +0200 Subject: [PATCH 07/10] [mod] botdetection - improve ip_limit and link_token methods - counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 16 +--- searx/botdetection/_helpers.py | 93 ++++++++++++++++++++++ searx/botdetection/http_accept.py | 8 +- searx/botdetection/http_accept_encoding.py | 8 +- searx/botdetection/http_accept_language.py | 8 +- searx/botdetection/http_connection.py | 8 +- searx/botdetection/http_user_agent.py | 11 ++- searx/botdetection/ip_limit.py | 61 +++++++------- searx/botdetection/limiter.py | 11 ++- searx/botdetection/link_token.py | 43 +++++++--- searx/plugins/limiter.py | 14 +--- 11 files changed, 197 insertions(+), 84 deletions(-) create mode 100644 searx/botdetection/_helpers.py diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index 78a7d30f3..b4de0f9c8 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src` """ -import flask - - -def dump_request(request: flask.Request): - return ( - "%s: '%s'" % 
(request.headers.get('X-Forwarded-For'), request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept') - + " || Accept-Language: %s" % request.headers.get('Accept-Language') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') - + " || Content-Type: %s" % request.headers.get('Content-Type') - + " || Content-Length: %s" % request.headers.get('Content-Length') - + " || Connection: %s" % request.headers.get('Connection') - + " || User-Agent: %s" % request.headers.get('User-Agent') - ) +from ._helpers import dump_request diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py new file mode 100644 index 000000000..b034b980b --- /dev/null +++ b/searx/botdetection/_helpers.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, invalid-name + +from typing import Optional +import flask +import werkzeug + +from searx import logger + +logger = logger.getChild('botdetection') + + +def dump_request(request: flask.Request): + return ( + "%s: %s" % (get_real_ip(request), request.path) + + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) + + +def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: + log_prefix = 'BLOCK %s: ' % get_real_ip(request) + logger.debug(log_prefix + log_msg) + return flask.make_response(('Too Many Requests', 429)) + + 
+def get_real_ip(request: flask.Request) -> str: + """Returns real IP of the request. Since not all proxies set all the HTTP + headers and incoming headers can be faked it may happen that the IP cannot + be determined correctly. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). + + This function tries to get the remote IP in the order listed below, + additional some tests are done and if inconsistencies or errors are + detected, they are logged. + + The remote IP of the request is taken from (first match): + + - X-Forwarded-For_ header + - `X-real-IP header `__ + - :py:obj:`flask.Request.remote_addr` + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + """ + + forwarded_for = request.headers.get("X-Forwarded-For") + real_ip = request.headers.get('X-Real-IP') + remote_addr = request.remote_addr + logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + + if not forwarded_for: + logger.error("X-Forwarded-For header is not set!") + else: + from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import + + forwarded_for = [x.strip() for x in forwarded_for.split(',')] + x_for: int = get_cfg()['real_ip.x_for'] + forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] + + if not real_ip: + logger.error("X-Real-IP header is not set!") + + if forwarded_for and real_ip and forwarded_for != real_ip: + logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) + + if forwarded_for and remote_addr and forwarded_for != remote_addr: + logger.warning( + "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for + ) + + if real_ip and remote_addr and real_ip != remote_addr: + logger.warning("IP 
from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) + + request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' + logger.debug("get_real_ip() -> %s", request_ip) + return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 23670a283..60e2330ae 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,13 +15,15 @@ Accept_ header .. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if 'text/html' not in request.accept_mimetypes: - return 429, "bot detected, HTTP header Accept did not contain text/html" + return too_many_requests(request, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 191249711..5301c5d9d 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 558a216cf..060f67ec0 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -13,13 +13,15 @@ if the Accept-Language_ header is unset. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Accept-Language', '').strip() == '': - return 429, "bot detected, missing HTTP header Accept-Language" + return too_many_requests(request, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index 0ef24a7b8..e718dfe3f 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,13 +13,15 @@ the Connection_ header is set to ``close``. 
""" # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Connection', '').strip() == 'close': - return 429, "bot detected, HTTP header 'Connection=close'" + return too_many_requests(request, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 3d1ec9173..70309e975 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import re import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests USER_AGENT = ( @@ -48,11 +50,8 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return ( - 429, - f"bot detected, HTTP header User-Agent: {user_agent}", - ) + return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 9cffff7f0..e7fa57187 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint """.. _botdetection.ip_limit: Method ``ip_limit`` @@ -37,16 +39,18 @@ droped. 
""" -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config - from searx import redisdb from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token +from ._helpers import too_many_requests + logger = logger.getChild('botdetection.ip_limit') @@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - if not x_forwarded_for: + client_ip = request.headers.get('X-Forwarded-For', '') + if not client_ip: logger.error("missing HTTP header X-Forwarded-For") if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) if c > API_MAX: - return 429, "BLOCK %s: API limit exceeded" - - suspicious = False - suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for + return too_many_requests(request, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request) - if suspicious: + suspicious = link_token.is_suspicious(request, True) + + if not suspicious: + # this IP is no longer suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window(redis_client, 
'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) if c > SUSPICIOUS_IP_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW" + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip) + return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") - else: + return None - if cfg['botdetection.ip_limit.link_token']: - # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, suspicious_ip_counter) + # vanilla limiter without extensions counts BURST_MAX and LONG_MAX + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + if c > BURST_MAX: + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) - if c > BURST_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + if c > LONG_MAX: + return too_many_requests(request, "too many request in 
LONG_WINDOW (LONG_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) - if c > LONG_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index cc1e00b3c..93826684f 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -42,6 +42,7 @@ from pathlib import Path import flask import pytomlpp as toml +from searx import logger from searx.tools import config from searx.botdetection import ( http_accept, @@ -62,7 +63,13 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." } -CFG = config.Config({}, {}) +CFG = None + + +def get_cfg() -> config.Config: + if CFG is None: + init_cfg(logger) + return CFG def init_cfg(log): @@ -73,7 +80,7 @@ def init_cfg(log): log.warning("missing config file: %s", LIMITER_CFG) return - log.warning("load config file: %s", LIMITER_CFG) + log.info("load config file: %s", LIMITER_CFG) try: upd_cfg = toml.load(LIMITER_CFG) except toml.DecodeError as exc: diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 8ef215f6c..376d06d61 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -47,15 +47,24 @@ from searx.redislib import secret_hash TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" +PING_LIVE_TIME = 3600 +"""Livetime (sec) of the ping-key from a client (request)""" + PING_KEY = 'SearXNG_limiter.ping' +"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" + TOKEN_KEY = 'SearXNG_limiter.token' +"""Key for which the current token is stored in the DB""" logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request): +def is_suspicious(request: flask.Request, renew: bool = False): """Checks if there is a valid ping for this request, if not this request 
is - rated as *suspicious*""" + rated as *suspicious*. If a valid ping exists and argument ``renew`` is + ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. + + """ redis_client = redisdb.client() if not redis_client: return False @@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request): ) return True - logger.debug("found ping for this request: %s", ping_key) + if renew: + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + logger.debug("found ping for client request: %s", ping_key) return False def ping(request: flask.Request, token: str): - """This function is called by a request to URL ``/client.css``""" + """This function is called by a request to URL ``/client.css``. If + ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. + The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. + + """ redis_client = redisdb.client() if not redis_client: return @@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str): return ping_key = get_ping_key(request) logger.debug("store ping for: %s", ping_key) - redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a request. At least - X-Forwarded-For_ is needed to be able to assign the request to an IP. + """Generates a hashed key that fits (more or less) to a client (request). + At least X-Forwarded-For_ is needed to be able to assign the request to an + IP. 
""" - return secret_hash( + return ( PING_KEY - + request.headers.get('X-Forwarded-For', '') - + request.headers.get('Accept-Language', '') - + request.headers.get('User-Agent', '') + + "[" + + secret_hash( + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + "]" ) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 92b0aa2a0..7edbb1ce0 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -20,16 +20,10 @@ logger = logger.getChild('limiter') def pre_request(): """See :ref:`flask.Flask.before_request`""" - - val = limiter.filter_request(flask.request) - if val is not None: - http_status, msg = val - client_ip = flask.request.headers.get('X-Forwarded-For', '') - logger.error("BLOCK (IP %s): %s" % (client_ip, msg)) - return 'Too Many Requests', http_status - - logger.debug("OK: %s" % dump_request(flask.request)) - return None + ret_val = limiter.filter_request(flask.request) + if ret_val is None: + logger.debug("OK: %s" % dump_request(flask.request)) + return ret_val def init(app: flask.Flask, settings) -> bool: From 38431d2e142b7da6a9b48aad203f02a2eff7e6fd Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 29 May 2023 19:46:37 +0200 Subject: [PATCH 08/10] [fix] correct determination of the IP for the request For correct determination of the IP to the request the function botdetection.get_real_ip() is implemented. This function is used in the ip_limit and link_token methods of the botdetection and it is used in the self_info plugin. Documentation about the X-Forwarded-For header has been added.
[1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566211059 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 20 +++++++++++++++++--- searx/botdetection/ip_limit.py | 6 ++---- searx/botdetection/limiter.toml | 7 ++++++- searx/botdetection/link_token.py | 7 +++---- searx/plugins/self_info.py | 31 +++++++------------------------ tests/unit/test_plugins.py | 12 +++++++----- 6 files changed, 42 insertions(+), 41 deletions(-) diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index b4de0f9c8..c903b0bb4 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -2,11 +2,25 @@ # lint: pylint """.. _botdetection src: -Bot detection methods ---------------------- +X-Forwarded-For +=============== -The methods implemented in this python package are use by the :ref:`limiter src`. +.. attention:: + + A correct setup of the HTTP request headers ``X-Forwarded-For`` and + ``X-Real-IP`` is essential to be able to assign a request to an IP correctly: + + - `NGINX RequestHeader`_ + - `Apache RequestHeader`_ + +.. _NGINX RequestHeader: + https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site +.. _Apache RequestHeader: + https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site + +.. autofunction:: searx.botdetection.get_real_ip """ from ._helpers import dump_request +from ._helpers import get_real_ip diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index e7fa57187..268285dd9 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -49,7 +49,7 @@ from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . 
import link_token -from ._helpers import too_many_requests +from ._helpers import too_many_requests, get_real_ip logger = logger.getChild('botdetection.ip_limit') @@ -89,9 +89,7 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkz # pylint: disable=too-many-return-statements redis_client = redisdb.client() - client_ip = request.headers.get('X-Forwarded-For', '') - if not client_ip: - logger.error("missing HTTP header X-Forwarded-For") + client_ip = get_real_ip(request) if request.args.get('format', 'html') != 'html': c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index 28c4e7589..af797d32c 100644 --- a/searx/botdetection/limiter.toml +++ b/searx/botdetection/limiter.toml @@ -1,3 +1,8 @@ [botdetection.ip_limit] -link_token = false \ No newline at end of file +link_token = false + +[real_ip] + +# Number of values to trust for X-Forwarded-For. 
+x_for = 1 diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 376d06d61..a83214a33 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -43,6 +43,7 @@ import flask from searx import logger from searx import redisdb from searx.redislib import secret_hash +from ._helpers import get_real_ip TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" @@ -73,7 +74,7 @@ def is_suspicious(request: flask.Request, renew: bool = False): if not redis_client.get(ping_key): logger.warning( "missing ping (IP: %s) / request: %s", - request.headers.get('X-Forwarded-For', ''), + get_real_ip(request), ping_key, ) return True @@ -111,9 +112,7 @@ def get_ping_key(request: flask.Request): PING_KEY + "[" + secret_hash( - request.headers.get('X-Forwarded-For', '') - + request.headers.get('Accept-Language', '') - + request.headers.get('User-Agent', '') + get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') ) + "]" ) diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py index fbe4518b5..8079ee0d4 100644 --- a/searx/plugins/self_info.py +++ b/searx/plugins/self_info.py @@ -1,21 +1,11 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring,invalid-name -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. 
- -(C) 2015 by Adam Tauber, -''' -from flask_babel import gettext import re +from flask_babel import gettext + +from searx.botdetection._helpers import get_real_ip name = gettext('Self Information') description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') @@ -28,18 +18,11 @@ query_examples = '' p = re.compile('.*user[ -]agent.*', re.IGNORECASE) -# attach callback to the post search hook -# request: flask request object -# ctx: the whole local context of the pre search hook def post_search(request, search): if search.search_query.pageno > 1: return True if search.search_query.query == 'ip': - x_forwarded_for = request.headers.getlist("X-Forwarded-For") - if x_forwarded_for: - ip = x_forwarded_for[0] - else: - ip = request.remote_addr + ip = get_real_ip(request) search.result_container.answers['ip'] = {'answer': ip} elif p.match(search.search_query.query): ua = request.user_agent diff --git a/tests/unit/test_plugins.py b/tests/unit/test_plugins.py index 28df835e5..0d555fdc0 100644 --- a/tests/unit/test_plugins.py +++ b/tests/unit/test_plugins.py @@ -50,9 +50,13 @@ class SelfIPTest(SearxTestCase): self.assertTrue(len(store.plugins) == 1) # IP test - request = Mock(remote_addr='127.0.0.1') - request.headers.getlist.return_value = [] - search = get_search_mock(query='ip', pageno=1) + request = Mock() + request.remote_addr = '127.0.0.1' + request.headers = {'X-Forwarded-For': '1.2.3.4, 127.0.0.1', 'X-Real-IP': '127.0.0.1'} + search = get_search_mock( + query='ip', + pageno=1, + ) store.call(store.plugins, 'post_search', request, search) self.assertTrue('127.0.0.1' in search.result_container.answers["ip"]["answer"]) @@ -62,7 +66,6 @@ class SelfIPTest(SearxTestCase): # User agent test request = Mock(user_agent='Mock') - request.headers.getlist.return_value = [] search = get_search_mock(query='user-agent', pageno=1) store.call(store.plugins, 'post_search', request, search) @@ -98,7 +101,6 @@ class 
HashPluginTest(SearxTestCase): self.assertTrue(len(store.plugins) == 1) request = Mock(remote_addr='127.0.0.1') - request.headers.getlist.return_value = [] # MD5 search = get_search_mock(query='md5 test', pageno=1) From 281e36f4b7848374535d5e953050ae73423191ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 15:41:48 +0200 Subject: [PATCH 09/10] [fix] limiter: replace real_ip by IPv4/v6 network Closes: https://github.com/searxng/searxng/issues/2477 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 1 + searx/botdetection/_helpers.py | 42 ++++++++++++--- searx/botdetection/http_accept.py | 16 ++++-- searx/botdetection/http_accept_encoding.py | 16 ++++-- searx/botdetection/http_accept_language.py | 14 +++-- searx/botdetection/http_connection.py | 16 ++++-- searx/botdetection/http_user_agent.py | 16 ++++-- searx/botdetection/ip_limit.py | 49 ++++++++++------- searx/botdetection/limiter.py | 61 ++++++++++------------ searx/botdetection/limiter.toml | 22 ++++++-- searx/botdetection/link_token.py | 54 +++++++++++-------- searx/plugins/limiter.py | 7 +-- 12 files changed, 208 insertions(+), 106 deletions(-) diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index c903b0bb4..fcd8e5630 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -24,3 +24,4 @@ X-Forwarded-For from ._helpers import dump_request from ._helpers import get_real_ip +from ._helpers import too_many_requests diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index b034b980b..8e0156d6e 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -1,11 +1,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pylint: disable=missing-module-docstring, invalid-name +from __future__ import annotations -from typing import Optional +from ipaddress import ( + IPv4Network, + IPv6Network, + IPv6Address, + ip_address, + ip_network, +) import flask import werkzeug 
+from searx.tools import config from searx import logger logger = logger.getChild('botdetection') @@ -13,7 +21,7 @@ logger = logger.getChild('botdetection') def dump_request(request: flask.Request): return ( - "%s: %s" % (get_real_ip(request), request.path) + request.path + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + " || form: %s" % request.form @@ -27,12 +35,30 @@ def dump_request(request: flask.Request): ) -def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: - log_prefix = 'BLOCK %s: ' % get_real_ip(request) - logger.debug(log_prefix + log_msg) +def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: + """Returns a HTTP 429 response object and writes a ERROR message to the + 'botdetection' logger. This function is used in part by the filter methods + to return the default ``Too Many Requests`` response. + + """ + + logger.debug("BLOCK %s: %s", network.compressed, log_msg) return flask.make_response(('Too Many Requests', 429)) +def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network: + """Returns the (client) network of whether the real_ip is part of.""" + + ip = ip_address(real_ip) + if isinstance(ip, IPv6Address): + prefix = cfg['real_ip.ipv6_prefix'] + else: + prefix = cfg['real_ip.ipv4_prefix'] + network = ip_network(f"{real_ip}/{prefix}", strict=False) + # logger.debug("get_network(): %s", network.compressed) + return network + + def get_real_ip(request: flask.Request) -> str: """Returns real IP of the request. 
Since not all proxies set all the HTTP headers and incoming headers can be faked it may happen that the IP cannot @@ -63,7 +89,9 @@ def get_real_ip(request: flask.Request) -> str: forwarded_for = request.headers.get("X-Forwarded-For") real_ip = request.headers.get('X-Real-IP') remote_addr = request.remote_addr - logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + # logger.debug( + # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr + # ) if not forwarded_for: logger.error("X-Forwarded-For header is not set!") @@ -89,5 +117,5 @@ def get_real_ip(request: flask.Request) -> str: logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' - logger.debug("get_real_ip() -> %s", request_ip) + # logger.debug("get_real_ip() -> %s", request_ip) return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 60e2330ae..b78a86278 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,7 +15,12 @@ Accept_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -23,7 +28,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if 'text/html' not in request.accept_mimetypes: - return too_many_requests(request, "HTTP header Accept did not contain text/html") + return too_many_requests(network, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 5301c5d9d..60718a4ca 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,7 +16,12 @@ bot if the Accept-Encoding_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -24,8 +29,13 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") + return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 060f67ec0..395d28bfd 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -12,8 +12,12 @@ if the Accept-Language_ header is unset. 
""" # pylint: disable=unused-argument +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug @@ -21,7 +25,11 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: if request.headers.get('Accept-Language', '').strip() == '': - return too_many_requests(request, "missing HTTP header Accept-Language") + return too_many_requests(network, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index e718dfe3f..ee0d80a23 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,7 +13,12 @@ the Connection_ header is set to ``close``. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -21,7 +26,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if request.headers.get('Connection', '').strip() == 'close': - return too_many_requests(request, "HTTP header 'Connection=close") + return too_many_requests(network, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 70309e975..17025f68b 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,8 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations import re +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -50,8 +55,13 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") + return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 268285dd9..46e026371 100644 --- a/searx/botdetection/ip_limit.py +++ 
b/searx/botdetection/ip_limit.py @@ -38,8 +38,12 @@ droped. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug from searx.tools import config @@ -49,7 +53,7 @@ from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token -from ._helpers import too_many_requests, get_real_ip +from ._helpers import too_many_requests logger = logger.getChild('botdetection.ip_limit') @@ -85,49 +89,58 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - client_ip = get_real_ip(request) + if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: + logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) + return None if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW) if c > API_MAX: - return too_many_requests(request, "too many request in API_WINDOW") + return too_many_requests(network, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request, True) + suspicious = link_token.is_suspicious(network, request, True) if not suspicious: # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + 
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window( + redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW + ) if c > SUSPICIOUS_IP_MAX: - logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip) + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") return None # vanilla limiter without extensions counts BURST_MAX and LONG_MAX - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, 
LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 93826684f..18ffc8407 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -37,14 +37,16 @@ and set the redis-url connection. Check the value, it depends on your redis DB """ -from typing import Optional, Tuple +from __future__ import annotations + from pathlib import Path import flask -import pytomlpp as toml +import werkzeug -from searx import logger from searx.tools import config -from searx.botdetection import ( +from searx import logger + +from . import ( http_accept, http_accept_encoding, http_accept_language, @@ -53,6 +55,16 @@ from searx.botdetection import ( ip_limit, ) +from ._helpers import ( + get_network, + get_real_ip, + dump_request, +) + +logger = logger.getChild('botdetection.limiter') + +CFG: config.Config = None # type: ignore + LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" """Base configuration (schema) of the botdetection.""" @@ -63,40 +75,21 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
} -CFG = None - def get_cfg() -> config.Config: + global CFG # pylint: disable=global-statement if CFG is None: - init_cfg(logger) + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) return CFG -def init_cfg(log): - global CFG # pylint: disable=global-statement - CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) +def filter_request(request: flask.Request) -> werkzeug.Response | None: - if not LIMITER_CFG.exists(): - log.warning("missing config file: %s", LIMITER_CFG) - return - - log.info("load config file: %s", LIMITER_CFG) - try: - upd_cfg = toml.load(LIMITER_CFG) - except toml.DecodeError as exc: - msg = str(exc).replace('\t', '').replace('\n', ' ') - log.error("%s: %s", LIMITER_CFG, msg) - raise - - is_valid, issue_list = CFG.validate(upd_cfg) - for msg in issue_list: - log.error(str(msg)) - if not is_valid: - raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") - CFG.update(upd_cfg) - - -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + cfg = get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + if network.is_link_local: + return None if request.path == '/healthz': return None @@ -104,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val @@ -118,8 +111,8 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val - + logger.debug(f"OK {network}: %s", dump_request(flask.request)) return None diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index af797d32c..71a231e8f 100644 --- a/searx/botdetection/limiter.toml +++ 
b/searx/botdetection/limiter.toml @@ -1,8 +1,22 @@ -[botdetection.ip_limit] - -link_token = false - [real_ip] # Number of values to trust for X-Forwarded-For. + x_for = 1 + +# The prefix defines the number of leading bits in an address that are compared +# to determine whether or not an address is part of a (client) network. + +ipv4_prefix = 32 +ipv6_prefix = 48 + +[botdetection.ip_limit] + +# To get unlimited access in a local network, by default link-local addresses +# (networks) are not monitored by the ip_limit +filter_link_local = false + +# activate link_token method in the ip_limit method +link_token = false + + diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index a83214a33..11a6a56b5 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -6,7 +6,7 @@ Method ``link_token`` The ``link_token`` method evaluates a request as :py:obj:`suspicious ` if the URL ``/client.css`` is not requested by the -client. By adding a random component (the token) in the URL a bot can not send +client. By adding a random component (the token) in the URL, a bot can not send a ping by request a static URL. ..
note:: @@ -35,6 +35,11 @@ And in the HTML template from flask a stylesheet link is needed (the value of https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) import string import random @@ -43,7 +48,11 @@ import flask from searx import logger from searx import redisdb from searx.redislib import secret_hash -from ._helpers import get_real_ip + +from ._helpers import ( + get_network, + get_real_ip, +) TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" @@ -60,29 +69,26 @@ TOKEN_KEY = 'SearXNG_limiter.token' logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request, renew: bool = False): - """Checks if there is a valid ping for this request, if not this request is - rated as *suspicious*. If a valid ping exists and argument ``renew`` is - ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): + """Checks whether a valid ping exists for this (client) network, if not + this request is rated as *suspicious*. If a valid ping exists and argument + ``renew`` is ``True`` the expire time of this ping is reset to + :py:obj:`PING_LIVE_TIME`.
""" redis_client = redisdb.client() if not redis_client: return False - ping_key = get_ping_key(request) + ping_key = get_ping_key(network, request) if not redis_client.get(ping_key): - logger.warning( - "missing ping (IP: %s) / request: %s", - get_real_ip(request), - ping_key, - ) + logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) return True if renew: redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - logger.debug("found ping for client request: %s", ping_key) + logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) return False @@ -92,27 +98,31 @@ def ping(request: flask.Request, token: str): The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. """ + from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import + redis_client = redisdb.client() if not redis_client: return if not token_is_valid(token): return - ping_key = get_ping_key(request) - logger.debug("store ping for: %s", ping_key) + + cfg = limiter.get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + + ping_key = get_ping_key(network, request) + logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) -def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a client (request). - At least X-Forwarded-For_ is needed to be able to assign the request to an - IP. 
- - """ +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: + """Generates a hashed key that fits (more or less) to a *WEB-browser + session* in a network.""" return ( PING_KEY + "[" + secret_hash( - get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') + network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') ) + "]" ) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 7edbb1ce0..a8beb5e88 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -8,7 +8,6 @@ import flask from searx import redisdb from searx.plugins import logger from searx.botdetection import limiter -from searx.botdetection import dump_request name = "Request limiter" description = "Limit the number of request" @@ -20,10 +19,7 @@ logger = logger.getChild('limiter') def pre_request(): """See :ref:`flask.Flask.before_request`""" - ret_val = limiter.filter_request(flask.request) - if ret_val is None: - logger.debug("OK: %s" % dump_request(flask.request)) - return ret_val + return limiter.filter_request(flask.request) def init(app: flask.Flask, settings) -> bool: @@ -32,6 +28,5 @@ def init(app: flask.Flask, settings) -> bool: if not redisdb.client(): logger.error("The limiter requires Redis") return False - limiter.init_cfg(logger) app.before_request(pre_request) return True From 80af38d37b21dc6e5edbf27bd22310db42a6f923 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 16:00:49 +0200 Subject: [PATCH 10/10] [mod] increase SUSPICIOUS_IP_WINDOW from one day to 30 days In my tests I see bots rotating IPs (with endless IP lists). If such a bot has 100 IPs and has three attempts (SUSPICIOUS_IP_MAX = 3) then it can successfully send up to 300 requests in one day while rotating the IP. To block the bots for a longer period of time the SUSPICIOUS_IP_WINDOW, as the time period in which an IP is observed, must be increased. 
For normal WEB-browsers this is no problem, because the SUSPICIOUS_IP_WINDOW is deleted as soon as the CSS with the token is loaded. SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 Time (sec) before sliding window for one suspicious IP expires. SUSPICIOUS_IP_MAX = 3 Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 46e026371..bb4229f0e 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -82,7 +82,7 @@ API_WONDOW = 3600 API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" -SUSPICIOUS_IP_WINDOW = 3600 * 24 +SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 """Time (sec) before sliding window for one suspicious IP expires.""" SUSPICIOUS_IP_MAX = 3