[mod] botdetection - improve ip_limit and link_token methods

- counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the
  request is validated by the link_token method [1]

- renew a ping-key on validation [2]; this is needed for infinite scrolling,
  where no new token (CSS) is loaded.  Note: this does not fix the BURST_MAX
  issue in the vanilla limiter

- normalize the counter names of the ip_limit method to 'ip_limit.*'

- integrate the ip_limit method straightforwardly in the limiter plugin,
  without intermediate code: ip_limit now returns None or a werkzeug.Response
  object that the plugin can pass directly to the flask application, instead
  of intermediate code that returns a tuple

[1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277
[2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206
[3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-05-28 18:58:31 +02:00
parent 52f1452c09
commit b8c7c2c9aa
11 changed files with 197 additions and 84 deletions

View file

@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src`
"""
import flask
def dump_request(request: flask.Request):
return (
"%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path)
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ " || Content-Type: %s" % request.headers.get('Content-Type')
+ " || Content-Length: %s" % request.headers.get('Content-Length')
+ " || Connection: %s" % request.headers.get('Connection')
+ " || User-Agent: %s" % request.headers.get('User-Agent')
)
from ._helpers import dump_request

View file

@ -0,0 +1,93 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring, invalid-name
from typing import Optional
import flask
import werkzeug
from searx import logger
logger = logger.getChild('botdetection')
def dump_request(request: flask.Request):
    """Return a one-line, human readable summary of ``request`` for logging.

    The summary starts with the client IP (as determined by
    :py:obj:`get_real_ip`) and the request path, followed by the
    ``X-Forwarded-For`` and ``X-Real-IP`` headers, the submitted form data and
    a fixed selection of further HTTP headers.  The fields are separated by
    `` || ``; a header that is missing in the request is rendered as ``None``.
    """
    fields = [f"{get_real_ip(request)}: {request.path}"]

    # proxy related headers first, they explain how the IP above was derived
    for name in ('X-Forwarded-For', 'X-Real-IP'):
        fields.append(f"{name}: {request.headers.get(name)}")

    fields.append(f"form: {request.form}")

    for name in (
        'Accept',
        'Accept-Language',
        'Accept-Encoding',
        'Content-Type',
        'Content-Length',
        'Connection',
        'User-Agent',
    ):
        fields.append(f"{name}: {request.headers.get(name)}")

    return " || ".join(fields)
def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]:
    """Log why ``request`` is blocked and return a *429 Too Many Requests* response.

    :param request: the request that is being blocked; its client IP (see
        :py:obj:`get_real_ip`) is put in front of the log message.
    :param log_msg: reason for blocking, appended to the ``BLOCK <ip>: `` prefix.
    :return: a response with body ``Too Many Requests`` and HTTP status 429.
    """
    client_ip = get_real_ip(request)
    message = 'BLOCK %s: ' % client_ip + log_msg
    logger.debug(message)
    response = flask.make_response(('Too Many Requests', 429))
    return response
def get_real_ip(request: flask.Request) -> str:
    """Return the real IP of the request.

    Since not all proxies set all the HTTP headers and incoming headers can be
    faked, it may happen that the IP cannot be determined correctly.

    .. sidebar:: :py:obj:`flask.Request.remote_addr`

       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).

    This function tries to get the remote IP in the order listed below.
    Additionally some tests are done, and if inconsistencies or errors are
    detected, they are logged.

    The remote IP of the request is taken from (first match):

    - X-Forwarded-For_ header (the entry selected by the ``real_ip.x_for``
      setting of the limiter configuration, counted from the right)
    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
    - :py:obj:`flask.Request.remote_addr`

    If none of these yields a value, ``'0.0.0.0'`` is returned.

    .. _ProxyFix:
       https://werkzeug.palletsprojects.com/middleware/proxy_fix/

    .. _X-Forwarded-For:
       https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
    """
    forwarded_for = request.headers.get("X-Forwarded-For")
    real_ip = request.headers.get('X-Real-IP')
    remote_addr = request.remote_addr
    logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr)

    if forwarded_for:
        # imported here (not at module level) to avoid a cyclic import with the limiter module
        from .limiter import get_cfg  # pylint: disable=import-outside-toplevel, cyclic-import

        hops = [hop.strip() for hop in forwarded_for.split(',')]
        x_for: int = get_cfg()['real_ip.x_for']
        # pick the x_for-th entry from the right, limited to the number of entries present
        forwarded_for = hops[-min(len(hops), x_for)]
    else:
        logger.error("X-Forwarded-For header is not set!")

    if not real_ip:
        logger.error("X-Real-IP header is not set!")

    # report every pair of sources that are both available but disagree
    consistency_checks = (
        ("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for),
        ("IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for),
        ("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip),
    )
    for message, first, second in consistency_checks:
        if first and second and first != second:
            logger.warning(message, first, second)

    # first non-empty source wins, '0.0.0.0' is the last resort
    candidates = (forwarded_for, real_ip, remote_addr)
    request_ip = next((ip for ip in candidates if ip), '0.0.0.0')
    logger.debug("get_real_ip() -> %s", request_ip)
    return request_ip

View file

@ -15,13 +15,15 @@ Accept_ header ..
"""
# pylint: disable=unused-argument
from typing import Optional, Tuple
from typing import Optional
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if 'text/html' not in request.accept_mimetypes:
return 429, "bot detected, HTTP header Accept did not contain text/html"
return too_many_requests(request, "HTTP header Accept did not contain text/html")
return None

View file

@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header ..
"""
# pylint: disable=unused-argument
from typing import Optional, Tuple
from typing import Optional
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"
return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate")
return None

View file

@ -13,13 +13,15 @@ if the Accept-Language_ header is unset.
"""
# pylint: disable=unused-argument
from typing import Optional, Tuple
from typing import Optional
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if request.headers.get('Accept-Language', '').strip() == '':
return 429, "bot detected, missing HTTP header Accept-Language"
return too_many_requests(request, "missing HTTP header Accept-Language")
return None

View file

@ -13,13 +13,15 @@ the Connection_ header is set to ``close``.
"""
# pylint: disable=unused-argument
from typing import Optional, Tuple
from typing import Optional
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
if request.headers.get('Connection', '').strip() == 'close':
return 429, "bot detected, HTTP header 'Connection=close'"
return too_many_requests(request, "HTTP header 'Connection=close")
return None

View file

@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression
"""
# pylint: disable=unused-argument
from typing import Optional, Tuple
from typing import Optional
import re
import flask
import werkzeug
from searx.tools import config
from ._helpers import too_many_requests
USER_AGENT = (
@ -48,11 +50,8 @@ def regexp_user_agent():
return _regexp
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent):
return (
429,
f"bot detected, HTTP header User-Agent: {user_agent}",
)
return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}")
return None

View file

@ -1,3 +1,5 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection.ip_limit:
Method ``ip_limit``
@ -37,16 +39,18 @@ droped.
"""
from typing import Optional, Tuple
from typing import Optional
import flask
import werkzeug
from searx.tools import config
from searx import redisdb
from searx import logger
from searx.redislib import incr_sliding_window, drop_counter
from . import link_token
from ._helpers import too_many_requests
logger = logger.getChild('botdetection.ip_limit')
@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
# pylint: disable=too-many-return-statements
redis_client = redisdb.client()
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
if not x_forwarded_for:
client_ip = request.headers.get('X-Forwarded-For', '')
if not client_ip:
logger.error("missing HTTP header X-Forwarded-For")
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW)
c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW)
if c > API_MAX:
return 429, "BLOCK %s: API limit exceeded"
suspicious = False
suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for
return too_many_requests(request, "too many request in API_WINDOW")
if cfg['botdetection.ip_limit.link_token']:
suspicious = link_token.is_suspicious(request)
if suspicious:
suspicious = link_token.is_suspicious(request, True)
if not suspicious:
# this IP is no longer suspicious: release ip again / delete the counter of this IP
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip)
return None
# this IP is suspicious: count requests from this IP
c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW)
c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW)
if c > SUSPICIOUS_IP_MAX:
return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW"
logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip)
return flask.redirect(flask.url_for('index'), code=302)
c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
if c > BURST_MAX_SUSPICIOUS:
return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS"
return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")
c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
if c > LONG_MAX_SUSPICIOUS:
return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS"
return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")
else:
return None
if cfg['botdetection.ip_limit.link_token']:
# this IP is no longer suspicious: release ip again / delete the counter of this IP
drop_counter(redis_client, suspicious_ip_counter)
# vanilla limiter without extensions counts BURST_MAX and LONG_MAX
c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW)
if c > BURST_MAX:
return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)")
c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
if c > BURST_MAX:
return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX"
c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW)
if c > LONG_MAX:
return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)")
c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
if c > LONG_MAX:
return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX"
return None

View file

@ -42,6 +42,7 @@ from pathlib import Path
import flask
import pytomlpp as toml
from searx import logger
from searx.tools import config
from searx.botdetection import (
http_accept,
@ -62,7 +63,13 @@ CFG_DEPRECATED = {
# "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
CFG = config.Config({}, {})
CFG = None
def get_cfg() -> config.Config:
    """Return the limiter configuration held in the module-level ``CFG``.

    While ``CFG`` is still ``None``, :py:obj:`init_cfg` is called once with the
    module's ``logger`` before ``CFG`` is returned.
    """
    if CFG is not None:
        return CFG
    # NOTE(review): presumably init_cfg() assigns the module-level CFG -- confirm
    init_cfg(logger)
    return CFG
def init_cfg(log):
@ -73,7 +80,7 @@ def init_cfg(log):
log.warning("missing config file: %s", LIMITER_CFG)
return
log.warning("load config file: %s", LIMITER_CFG)
log.info("load config file: %s", LIMITER_CFG)
try:
upd_cfg = toml.load(LIMITER_CFG)
except toml.DecodeError as exc:

View file

@ -47,15 +47,24 @@ from searx.redislib import secret_hash
TOKEN_LIVE_TIME = 600
"""Livetime (sec) of limiter's CSS token."""
PING_LIVE_TIME = 3600
"""Livetime (sec) of the ping-key from a client (request)"""
PING_KEY = 'SearXNG_limiter.ping'
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
TOKEN_KEY = 'SearXNG_limiter.token'
"""Key for which the current token is stored in the DB"""
logger = logger.getChild('botdetection.link_token')
def is_suspicious(request: flask.Request):
def is_suspicious(request: flask.Request, renew: bool = False):
"""Checks if there is a valid ping for this request, if not this request is
rated as *suspicious*"""
rated as *suspicious*. If a valid ping exists and argument ``renew`` is
``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`.
"""
redis_client = redisdb.client()
if not redis_client:
return False
@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request):
)
return True
logger.debug("found ping for this request: %s", ping_key)
if renew:
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
logger.debug("found ping for client request: %s", ping_key)
return False
def ping(request: flask.Request, token: str):
"""This function is called by a request to URL ``/client<token>.css``"""
"""This function is called by a request to URL ``/client<token>.css``. If
``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`.
"""
redis_client = redisdb.client()
if not redis_client:
return
@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str):
return
ping_key = get_ping_key(request)
logger.debug("store ping for: %s", ping_key)
redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME)
redis_client.set(ping_key, 1, ex=PING_LIVE_TIME)
def get_ping_key(request: flask.Request):
"""Generates a hashed key that fits (more or less) to a request. At least
X-Forwarded-For_ is needed to be able to assign the request to an IP.
"""Generates a hashed key that fits (more or less) to a client (request).
At least X-Forwarded-For_ is needed to be able to assign the request to an
IP.
"""
return secret_hash(
return (
PING_KEY
+ request.headers.get('X-Forwarded-For', '')
+ request.headers.get('Accept-Language', '')
+ request.headers.get('User-Agent', '')
+ "["
+ secret_hash(
request.headers.get('X-Forwarded-For', '')
+ request.headers.get('Accept-Language', '')
+ request.headers.get('User-Agent', '')
)
+ "]"
)

View file

@ -20,16 +20,10 @@ logger = logger.getChild('limiter')
def pre_request():
"""See :ref:`flask.Flask.before_request`"""
val = limiter.filter_request(flask.request)
if val is not None:
http_status, msg = val
client_ip = flask.request.headers.get('X-Forwarded-For', '<unknown>')
logger.error("BLOCK (IP %s): %s" % (client_ip, msg))
return 'Too Many Requests', http_status
logger.debug("OK: %s" % dump_request(flask.request))
return None
ret_val = limiter.filter_request(flask.request)
if ret_val is None:
logger.debug("OK: %s" % dump_request(flask.request))
return ret_val
def init(app: flask.Flask, settings) -> bool: